Esempio n. 1
0
def DL_Distance(str1, str2):
    """Print four distance measures for a pair of strings.

    Printed in order: the two inputs, ``distance.nlevenshtein``, the raw
    ``damerau_levenshtein_distance``, that distance divided by the length
    of the longer input, and ``distance.jaccard``. Nothing is returned.
    """
    print(str1, str2)

    normalized_levenshtein = distance.nlevenshtein(str1, str2)
    print("distance 1: ", normalized_levenshtein)

    raw_damerau = damerau_levenshtein_distance(str1, str2)
    print("distance 2: ", raw_damerau)

    # The edit distance is evaluated a second time before being scaled by
    # the longer of the two input lengths.
    edit_count = damerau_levenshtein_distance(str1, str2)
    longest = max(len(str1), len(str2))
    scaled_damerau = edit_count / longest
    print("distance 3: ", scaled_damerau)

    jaccard_distance = distance.jaccard(str1, str2)
    print("distance 4: ", jaccard_distance)
Esempio n. 2
0
def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between sequences.

    This distance is the number of operations (consisting of insertions,
    deletions or substitutions of a single character, or transposition of two
    adjacent characters) required to change one sequence into the other.

    Arguments may be text or UTF-8 encoded byte strings; byte strings are
    decoded before the distance is computed.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2
    >>> dameraulevenshtein(u'abcd', u'bacde')
    2
    >>> dameraulevenshtein(u'number e', u'number \u03c0')
    1

    Returns:
        The distance reported by ``jellyfish`` (C implementation first,
        pure-Python implementation as a fallback).

    Raises:
        UnicodeDecodeError: if a byte-string argument is not valid UTF-8.
    """
    # ``bytes`` is the same type as ``str`` on Python 2, so this check keeps
    # the original Python 2 behaviour while also working on Python 3, where
    # the ``unicode`` builtin no longer exists.
    if isinstance(seq1, bytes):
        seq1 = seq1.decode('utf-8')
    if isinstance(seq2, bytes):
        seq2 = seq2.decode('utf-8')

    # Fall back onto Python implementation for code points unsupported by the C
    # implementation.
    # https://github.com/jamesturk/jellyfish/issues/55#issuecomment-312509263
    try:
        return jellyfish.damerau_levenshtein_distance(seq1, seq2)
    except ValueError:
        return py_jellyfish.damerau_levenshtein_distance(seq1, seq2)
Esempio n. 3
0
 def mapperSimilarity(self, _, line):
     """Yield the word pair encoded in ``line`` when it is similar enough.

     ``line`` is split on ``;`` and its first two fields are compared with
     ``damerau_levenshtein_distance``. The distance is turned into a
     similarity by ``self.normalizeDistanceIndex``; the pair is emitted as
     ``(first, [second, similarity])`` only when the similarity is strictly
     greater than 0.8. The second positional argument is ignored.
     """
     SIMILARITY_THRESHOLD = 0.8
     fields = line.split(';')
     left = fields[0]
     right = fields[1]
     edit_distance = damerau_levenshtein_distance(left, right)
     sim = self.normalizeDistanceIndex(len(left), len(right), edit_distance)
     if sim > SIMILARITY_THRESHOLD:
         yield (left, [right, sim])
Esempio n. 4
0
def match(data1, data2, fields1, fields2):
    """Return the key pairs whose records agree on every compared field.

    Each record of ``data1`` is compared with each record of ``data2``.
    ``fields1`` and ``fields2`` are paired positionally; a field pair agrees
    when its Damerau-Levenshtein distance divided by the longer value's
    length is at most 0.4.

    Args:
        data1: mapping of key -> record (mapping of field name -> string).
        data2: mapping of key -> record, same layout as ``data1``.
        fields1: field names to read from ``data1`` records.
        fields2: field names to read from ``data2`` records.

    Returns:
        List of ``(data1key, data2key)`` tuples for records in which no
        field pair exceeds the threshold.
    """
    threshold = 0.4

    def _records_agree(values1, values2):
        # True when no compared field pair exceeds the threshold.
        for field1, field2 in zip(fields1, fields2):
            left = values1[field1]
            right = values2[field2]
            longest = max(len(left), len(right))
            if longest == 0:
                # Two empty values are identical; dividing by their length
                # would raise ZeroDivisionError.
                continue
            edits = jellyfish.damerau_levenshtein_distance(left, right)
            if edits / float(longest) > threshold:
                # One mismatch settles the outcome, so stop comparing.
                return False
        return True

    matches = []
    for data1key, data1values in data1.items():
        for data2key, data2values in data2.items():
            if _records_agree(data1values, data2values):
                matches.append((data1key, data2key))
    return matches
Esempio n. 5
0
def match(data1, data2, fields1, fields2):
    """Return the key pairs whose records are similar on at least one field.

    Each record of ``data1`` is compared with each record of ``data2``.
    ``fields1`` and ``fields2`` are paired positionally; the similarity
    degree of a field pair is ``1 - distance / longer_length`` using the
    Damerau-Levenshtein distance. A record pair matches when any field pair
    has a degree strictly greater than 0.6.

    Args:
        data1: mapping of key -> record (mapping of field name -> string).
        data2: mapping of key -> record, same layout as ``data1``.
        fields1: field names to read from ``data1`` records.
        fields2: field names to read from ``data2`` records.

    Returns:
        List of ``(data1key, data2key, degree)`` tuples. ``degree`` is the
        value computed for the last compared field pair, which is not
        necessarily the pair that triggered the match.
    """
    threshold = 0.6
    matches = []
    for data1key, data1values in data1.items():
        for data2key, data2values in data2.items():
            matched = False
            degree = None
            for field1, field2 in zip(fields1, fields2):
                left = data1values[field1]
                right = data2values[field2]
                longest = max(len(left), len(right))
                if longest == 0:
                    # Two empty values are identical; dividing by their
                    # length would raise ZeroDivisionError.
                    degree = 1.0
                else:
                    edits = jellyfish.damerau_levenshtein_distance(
                        left, right)
                    degree = 1 - edits / float(longest)
                if degree > threshold:
                    matched = True
            if matched:
                matches.append((data1key, data2key, degree))
    return matches
Esempio n. 6
0
 def mapperSimilarity(self, key, data):
     """Score a string pair and fan it out to three downstream keys.

     ``key`` selects the measure: ``"dl"`` uses
     ``damerau_levenshtein_distance`` normalised through
     ``self.normalizeDistanceIndex``; ``"rat"`` uses
     ``SequenceMatcher(...).ratio()``. Any other key yields nothing.

     When the similarity is strictly greater than 0.8 the pair is emitted
     three times, under ``"ap"``, ``"cos"`` and ``"jw"``, each with a fresh
     ``[data[0], data[1], similarity]`` list.
     """
     SIMILARITY_THRESHOLD = 0.8

     if key == "dl":
         left, right = data[0], data[1]
         edit_distance = damerau_levenshtein_distance(left, right)
         sim = self.normalizeDistanceIndex(len(left), len(right),
                                           edit_distance)
     elif key == "rat":
         left, right = data[0], data[1]
         sim = SequenceMatcher(a=left, b=right).ratio()
     else:
         # Unknown measure: emit nothing and leave ``data`` untouched.
         return

     if sim > SIMILARITY_THRESHOLD:
         for target in ("ap", "cos", "jw"):
             yield (target, [left, right, sim])
Esempio n. 7
0
def levenshtein(s1, s2):
    """Return the Damerau-Levenshtein distance between two values.

    Despite its name, this function delegates to
    ``damerau_levenshtein_distance``. Falsy arguments (``None``, ``""``,
    ...) are treated as the empty string, and when either side is empty the
    length of the other side is returned without calling the library.

    Args:
        s1: first value; converted with ``str()`` before comparison.
        s2: second value; converted with ``str()`` before comparison.

    Returns:
        The edit distance as an integer.
    """
    s1 = s1 or ""
    s2 = s2 or ""

    # Keep the longer value first so a single emptiness check is enough.
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if len(s2) == 0:
        return len(s1)

    first, second = str(s1), str(s2)
    try:
        return jellyfish.damerau_levenshtein_distance(first, second)
    except Exception:
        # Workaround for unicode: fall back from the C implementation to the
        # pure-Python one. ``Exception`` rather than a bare ``except`` so
        # KeyboardInterrupt and SystemExit still propagate.
        return py_jellyfish.damerau_levenshtein_distance(first, second)
Esempio n. 8
0
def runExperiments(logIdentificator, formulaType):
    """Predict trace suffixes and remaining time with beam search; write CSV.

    Settings come from ``activateSettings``, the test traces from
    ``prepare_testing_data`` and the model from ``load_model``. For every
    prefix size in ``range(prefix_size_pred_from, prefix_size_pred_to)`` the
    traces returned by ``selectFormulaVerifiedTraces`` are extended step by
    step; candidate continuations are kept in priority queues ordered by
    negated accumulated log-probability. One CSV row is written per trace
    whose ground-truth suffix is non-empty, to
    ``output_files/results/<formulaType>/suffix_and_remaining_time3_<eventlog>``.

    Args:
        logIdentificator: forwarded to ``activateSettings``.
        formulaType: forwarded to ``activateSettings``; also used as the
            results sub-directory name.

    Returns:
        None. The elapsed wall-clock time is printed at the end.

    NOTE(review): this is Python 2 code (``print`` statement, ``izip``,
    ``unicode``, csv file opened in ``'wb'`` mode).
    """
    eventlog, path_to_model_file, beam_size, \
        prefix_size_pred_from, prefix_size_pred_to, formula = activateSettings(logIdentificator, formulaType)
    start_time = time.time()

    lines, lines_t, lines_t2, lines_t3, maxlen, chars, char_indices,divisor, divisor2, \
        divisor3, predict_size,target_indices_char,target_char_indices\
        = prepare_testing_data(eventlog)

    #
    # lines = lines[0:300]
    # lines_t= lines_t[0:300]
    # lines_t2=lines_t2[0:300]
    # lines_t3=lines_t3[0:300]

    #this is the beam stack size, means how many "best" alternatives will be stored
    # These two lists are appended to below but never read inside this function.
    one_ahead_gt = []
    one_ahead_pred = []

    #find cycles and modify the probability functionality goes here
    stop_symbol_probability_amplifier_current = 1

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    class NodePrediction():
        """One beam-search candidate.

        Holds the encoded model input (``data``), the trace text so far
        (``cropped_line``), the accumulated predicted time and the
        accumulated log-probability (``probability_of``).
        """
        def __init__(self,
                     data,
                     cropped_line,
                     total_predicted_time,
                     probability_of=0):
            self.data = data
            self.cropped_line = cropped_line
            self.total_predicted_time = total_predicted_time
            self.probability_of = probability_of

    # make predictions
    with open(
            'output_files/results/' + formulaType +
            '/suffix_and_remaining_time3_%s' % eventlog, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "Prefix length", "Groud truth", "Predicted", "Levenshtein",
            "Damerau", "Jaccard", "Ground truth times", "Predicted times",
            "RMSE", "MAE", "Median AE"
        ])
        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):
            print(prefix_size)

            # lines = lines[13:]
            # lines_t = lines_t[13:]
            # lines_t2 = lines_t2[13:]
            # lines_t3 = lines_t3[13:]
            lines_s, lines_t_s, lines_t2_s, lines_t3_s = selectFormulaVerifiedTraces(
                lines, lines_t, lines_t2, lines_t3, formula, prefix_size)
            print("prefix size: " + str(prefix_size))
            print("formulas verifited: " + str(len(lines_s)) + " out of : " +
                  str(len(lines)))
            counterr = 0
            for line, times, times2, times3 in izip(lines_s, lines_t_s,
                                                    lines_t2_s, lines_t3_s):
                # Appends to the list object taken from lines_t_s in place.
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                if len(times2) < prefix_size:
                    continue  # make no prediction for this case, since this case has ended already

                # initialize root of the tree for beam search
                total_predicted_time_initialization = 0
                search_node_root = NodePrediction(
                    encode(cropped_line, cropped_times, cropped_times3, maxlen,
                           chars, char_indices, divisor, divisor2),
                    cropped_line, total_predicted_time_initialization)

                ground_truth = ''.join(line[prefix_size:prefix_size +
                                            predict_size])
                # Ground-truth remaining time: last times2 entry minus the
                # times2 entry of the last prefix event.
                ground_truth_t = times2[prefix_size - 1]
                case_end_time = times2[len(times2) - 1]
                ground_truth_t = case_end_time - ground_truth_t
                predicted = ''

                # Candidates are queued under their negated log-probability,
                # so the most probable one is dequeued first.
                queue_next_steps = PriorityQueue()
                queue_next_steps.put(
                    (-search_node_root.probability_of, search_node_root))

                queue_next_steps_future = PriorityQueue()
                start_of_the_cycle_symbol = " "
                found_sattisfying_constraint = False

                current_beam_size = beam_size

                for i in range(predict_size):
                    # Dequeue at most current_beam_size candidates per step.
                    for k in range(current_beam_size):
                        if queue_next_steps.empty():
                            break

                        _, current_prediction_premis = queue_next_steps.get()

                        if not found_sattisfying_constraint:
                            if verify_formula_as_compliant(
                                    current_prediction_premis.cropped_line,
                                    formula, prefix_size):
                                #the formula verified and we can just finish the predictions
                                #beam size is 1 because predict only sequence of events
                                current_beam_size = 1
                                #overwrite new queue
                                queue_next_steps_future = PriorityQueue()
                                found_sattisfying_constraint = True

                        enc = current_prediction_premis.data
                        temp_cropped_line = current_prediction_premis.cropped_line
                        y = model.predict(enc, verbose=0)  # make predictions
                        # split predictions into seperate activity and time predictions
                        y_char = y[0][0]
                        y_t = y[1][0][0]

                        if y_t < 0:
                            y_t = 0
                        # cropped_times / cropped_times3 are single per-trace
                        # lists, so every dequeued candidate appends to the
                        # same objects.
                        cropped_times.append(y_t)

                        if not i == 0:
                            stop_symbol_probability_amplifier_current, start_of_the_cycle_symbol = amplify(
                                temp_cropped_line)

                        #in not reached, function :choose_next_top_descendant: will backtrack
                        y_t = y_t * divisor3
                        cropped_times3.append(cropped_times3[-1] +
                                              timedelta(seconds=y_t))

                        # Expand this candidate with its j-th best symbol for
                        # each j below the current beam size.
                        for j in range(current_beam_size):
                            temp_prediction = getSymbolAmpl(
                                y_char, target_indices_char,
                                target_char_indices, start_of_the_cycle_symbol,
                                stop_symbol_probability_amplifier_current, j)

                            if temp_prediction == '!':  # end of case was just predicted, therefore, stop predicting further into the future
                                if verify_formula_as_compliant(
                                        temp_cropped_line, formula,
                                        prefix_size):
                                    one_ahead_pred.append(
                                        current_prediction_premis.
                                        total_predicted_time)
                                    one_ahead_gt.append(ground_truth_t)
                                    stop_symbol_probability_amplifier_current = 1
                                    print('! predicted, end case')
                                    # The current queue is emptied; the break
                                    # leaves only this inner j loop.
                                    queue_next_steps = PriorityQueue()
                                    break
                                else:
                                    continue

                            temp_cropped_line = current_prediction_premis.cropped_line + temp_prediction
                            temp_total_predicted_time = current_prediction_premis.total_predicted_time + y_t
                            temp_state_data = encode(temp_cropped_line,
                                                     cropped_times,
                                                     cropped_times3, maxlen,
                                                     chars, char_indices,
                                                     divisor, divisor2)
                            # np.sort is ascending, so this index selects the
                            # j-th largest probability.
                            probability_this = np.sort(y_char)[len(y_char) -
                                                               1 - j]

                            temp = NodePrediction(
                                temp_state_data, temp_cropped_line,
                                temp_total_predicted_time,
                                current_prediction_premis.probability_of +
                                np.log(probability_this))
                            queue_next_steps_future.put(
                                (-temp.probability_of, temp))
                            # print str(counterr) + ' ' + str(i) + ' ' + str(k) \
                            #       + ' ' + str(j) + ' ' + temp_cropped_line[prefix_size:]\
                            #       + "     " + str(temp.probability_of)
                    # The candidates produced in this step become the next
                    # step's input queue.
                    queue_next_steps = queue_next_steps_future
                    queue_next_steps_future = PriorityQueue()

                counterr += 1

                # NOTE(review): current_prediction_premis is only assigned by
                # queue_next_steps.get() above; it is never set to None here.
                # If nothing was dequeued for this trace the name is either
                # unbound or still holds the previous trace's candidate, which
                # this check cannot detect - confirm the intended behaviour.
                if current_prediction_premis == None:
                    print "Cannot find any trace that is compliant with formula given current beam size"
                    break

                output = []

                # The None branch below cannot be taken: the same condition
                # already hit the break above.
                if current_prediction_premis == None:
                    predicted = u""
                    total_predicted_time = 0
                else:
                    predicted = (
                        current_prediction_premis.cropped_line[prefix_size:])
                    total_predicted_time = current_prediction_premis.total_predicted_time

                if len(ground_truth) > 0:
                    output.append(prefix_size)
                    output.append(unicode(ground_truth).encode("utf-8"))
                    output.append(unicode(predicted).encode("utf-8"))
                    output.append(
                        1 - distance.nlevenshtein(predicted, ground_truth))
                    # NOTE(review): on Python 2 "/" between two ints floors
                    # unless true division is enabled for this module - confirm.
                    dls = 1 - (damerau_levenshtein_distance(
                        unicode(predicted), unicode(ground_truth)) /
                               max(len(predicted), len(ground_truth)))
                    if dls < 0:
                        dls = 0  # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the default character encoding of the operating system caused it to be negative, this should never be the case
                    output.append(dls)
                    output.append(1 -
                                  distance.jaccard(predicted, ground_truth))
                    output.append(ground_truth_t)
                    output.append(total_predicted_time)
                    # Empty placeholder in the "RMSE" column position.
                    output.append('')
                    output.append(
                        metrics.mean_absolute_error([ground_truth_t],
                                                    [total_predicted_time]))
                    output.append(
                        metrics.median_absolute_error([ground_truth_t],
                                                      [total_predicted_time]))
                    spamwriter.writerow(output)

    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
Esempio n. 9
0
def test_damerau_levenshtein_distance(jf, s1, s2, value):
    """Check ``jf.damerau_levenshtein_distance(s1, s2)`` against ``value``.

    ``value`` is converted with ``int()`` before the comparison.
    """
    expected = int(value)
    actual = jf.damerau_levenshtein_distance(s1, s2)
    assert actual == expected
Esempio n. 10
0
def test_damerau_levenshtein_distance_type(jf):
    """Check that text input is accepted and byte input is rejected.

    The byte-string call must raise ``TypeError`` with a message that
    contains the word ``expected``.
    """
    # Text arguments must not raise.
    jf.damerau_levenshtein_distance(u'abc', u'abc')

    with pytest.raises(TypeError) as raised:
        jf.damerau_levenshtein_distance(b'abc', b'abc')

    message = str(raised.value)
    assert 'expected' in message
Esempio n. 11
0
def test(args, preprocess_manager):
    """Predict a suffix for every prefix of every test case and write a CSV.

    The test set comes from ``preprocess_manager.create_test_set()`` and the
    model from ``load_model``. For each case and each prefix size from 2 up
    to ``sequence_max_length - 1`` a suffix and its time are predicted,
    either with ``predictSuffixAndTimeForPrefixNextBestEvent`` (when
    ``args.next_best_action`` is truthy and the prefix passes
    ``preprocess_manager.checkCandidate``) or with
    ``predictSuffixAndTimeForPrefix``. One ``;``-delimited row is written per
    prefix that has a non-empty prediction and a non-empty ground-truth
    suffix.

    Args:
        args: options object; this function reads ``result_dir``, ``task``,
            ``checkpoint_dir``, ``data_set`` and ``next_best_action``.
        preprocess_manager: supplies the test set, the cross-validation
            iteration number and the conformance helpers used below.

    Returns:
        None.
    """
    result_dir = args.result_dir
    task = args.task

    # get test set
    # NOTE(review): only the first branch binds ``lines_add``, but the zip()
    # over the cases below uses it unconditionally, so the else path would
    # fail with UnboundLocalError - confirm how that path is meant to run.
    if preprocess_manager.num_features_additional > 0:
        lines, caseids, lines_t, lines_t2, lines_t3, lines_add, sequence_max_length, num_features_all, num_features_activities = preprocess_manager.create_test_set(
        )
    else:
        lines, caseids, lines_t, lines_t2, lines_t3, sequence_max_length, num_features_all, num_features_activities = preprocess_manager.create_test_set(
        )

    # load model
    model_suffix_prediction = load_model(
        '%smodel_suffix_prediction_%s.h5' %
        (args.checkpoint_dir, preprocess_manager.iteration_cross_validation))

    # set options for result output
    # Despite its name, result_dir ends up holding the path of the per-fold
    # CSV file: <result_dir><data set name>__<task>_<fold>.csv
    data_set_name = args.data_set.split('.csv')[0]
    generic_result_dir = result_dir + data_set_name + "__" + task
    fold_result_dir = generic_result_dir + "_%d%s" % (
        preprocess_manager.iteration_cross_validation, ".csv")
    result_dir = fold_result_dir

    # start prediction
    with open(result_dir, 'w') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=';',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "CaseID", "Prefix length", "Ground truth", "Predicted",
            "Levenshtein", "Damerau", "Jaccard", "Ground truth times",
            "Predicted times", "MAE", "In time", "Dev. in time",
            "Num corrections"
        ])

        for line, caseid, times, times2, times3, line_add in zip(
                lines, caseids, lines_t, lines_t2, lines_t3, lines_add):

            # for each prefix of a case with a size > 1
            for prefix_size in range(2, sequence_max_length):

                num_corrections = 0

                util.llprint("\nPrefix size: %d\n" % prefix_size)

                # preparation for next best event determination
                # get prefix; one output for each prefix of a case
                # These three empty dicts are replaced by the literals below.
                current = dict()
                predict = dict()
                ground_truth = dict()

                # current = ground truth prefix + predicted suffix
                current = {
                    "line": ''.join(line[:prefix_size]),
                    "times": times[:prefix_size],
                    "times2": times2[:prefix_size],
                    "times3": times3[:prefix_size],
                    "line_add": line_add[:prefix_size],
                }

                # termination
                # Once the prefix contains '!' no longer prefix of this case
                # is processed.
                if '!' in current["line"]:
                    break

                ground_truth = {
                    "total_event":
                    ''.join(line[:]),
                    "prefix_event":
                    ''.join(line[:prefix_size]),
                    "suffix_event":
                    ''.join(line[prefix_size:]),
                    "total_time":
                    times2[len(times2) - 1],
                    "prefix_time":
                    times2[prefix_size - 1],
                    "suffix_time":
                    times2[len(times2) - 1] - times2[prefix_size - 1]
                }

                predict = {
                    "size": sequence_max_length - 1,
                    "predicted": '',
                    "suffix_time": 0
                }

                # result for each prefix of a case
                if args.next_best_action:

                    # check prefix conformance
                    if preprocess_manager.checkCandidate(
                            args,
                            preprocess_manager.transformNewInstance(
                                ground_truth["prefix_event"])):

                        predict, in_time, deviation_in_time, num_corrections = predictSuffixAndTimeForPrefixNextBestEvent(
                            args, model_suffix_prediction, preprocess_manager,
                            current, predict, ground_truth, num_corrections)
                    else:
                        # A prefix rejected by checkCandidate ends the
                        # processing of this case.
                        break

                else:
                    predict, in_time, deviation_in_time = predictSuffixAndTimeForPrefix(
                        args, model_suffix_prediction, preprocess_manager,
                        current, predict, ground_truth)

                # termination
                # An empty prediction skips this prefix only; the next prefix
                # size is still tried.
                if predict["predicted"] == "":
                    continue

                output = []
                if len(ground_truth["suffix_event"]) > 0:

                    output.append(caseid)
                    output.append(prefix_size)
                    # NOTE(review): on Python 3 .encode() returns bytes, which
                    # csv.writer would write as their b'...' repr - confirm
                    # this is the intended output.
                    output.append(
                        str(ground_truth["suffix_event"]).encode("utf-8"))
                    output.append(str(predict["predicted"]).encode("utf-8"))
                    output.append(1 - distance.nlevenshtein(
                        predict["predicted"], ground_truth["suffix_event"]))

                    # Damerau-Levenshtein similarity: 1 - distance / longer
                    # length, clamped at 0 below.
                    dls = 1 - (damerau_levenshtein_distance(
                        str(predict["predicted"]),
                        str(ground_truth["suffix_event"])) /
                               max(len(predict["predicted"]),
                                   len(ground_truth["suffix_event"])))
                    if dls < 0:
                        dls = 0
                    output.append(dls)
                    output.append(1 - distance.jaccard(
                        predict["predicted"], ground_truth["suffix_event"]))
                    output.append(ground_truth["suffix_time"])
                    output.append(predict["suffix_time"])
                    output.append(
                        metrics.mean_absolute_error(
                            [ground_truth["suffix_time"]],
                            [predict["suffix_time"]]))
                    output.append(in_time)
                    output.append(deviation_in_time)
                    # Writes 0 whenever num_corrections is not positive.
                    if num_corrections > 0:
                        output.append(num_corrections)
                    else:
                        output.append(0)

                    spamwriter.writerow(output)
def run_experiments(log_identificator, formula_type, rnn_type):
    """Predict activity and group suffixes one symbol at a time; write a CSV.

    Settings come from ``activate_settings``, the test traces from
    ``prepare_testing_data`` and the model from ``load_model``. For every
    prefix size in ``range(prefix_size_pred_from, prefix_size_pred_to)`` the
    traces returned by ``select_declare_verified_traces`` are extended by
    repeatedly taking the single most probable activity and group symbol
    until ``'!'`` is predicted or ``predict_size`` steps were made. One row
    per trace with a non-empty ground-truth suffix is written to
    ``output_files/final_experiments/results/baseline/<eventlog[:-4]>_<rnn_type>.csv``.

    Args:
        log_identificator: forwarded to ``activate_settings``.
        formula_type: forwarded to ``activate_settings``.
        rnn_type: ``"CF"`` or ``"CFR"``; selects the model file and is part
            of the output file name.

    Returns:
        None. The elapsed wall-clock time is printed at the end.

    NOTE(review): uses ``unicode`` and opens the csv file in ``'wb'`` mode,
    which points to Python 2 - confirm the target interpreter.
    """

    # beam_size and formula are unpacked here but not used in this function.
    eventlog, \
        path_to_model_file_cf, \
        path_to_model_file_cfr, \
        path_to_declare_model_file, \
        beam_size, \
        prefix_size_pred_from, \
        prefix_size_pred_to, \
        formula = activate_settings(log_identificator, formula_type)

    # NOTE(review): any other rnn_type leaves path_to_model_file unbound, so
    # load_model() below would fail with UnboundLocalError.
    if rnn_type == "CF":
        path_to_model_file = path_to_model_file_cf
    elif rnn_type == "CFR":
        path_to_model_file = path_to_model_file_cfr

    start_time = time.time()

    # prepare the data N.B. maxlen == predict_size
    lines, \
        lines_id, \
        lines_group, \
        lines_t, \
        lines_t2, \
        lines_t3, \
        lines_t4, \
        maxlen, \
        chars, \
        chars_group, \
        char_indices, \
        char_indices_group, \
        divisor, \
        divisor2, \
        divisor3, \
        predict_size, \
        target_indices_char, \
        target_indices_char_group,\
        target_char_indices, \
        target_char_indices_group = prepare_testing_data(eventlog)

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    # define helper functions
    # this one encodes the current sentence into the onehot encoding
    # noinspection PyUnusedLocal
    def encode(sentence,
               sentence_group,
               times_enc,
               times3_enc,
               maxlen_enc=maxlen):
        """Build the model input array for one partial trace.

        Returns a float32 array of shape ``(1, maxlen_enc, num_features)``.
        The rows for ``sentence`` are placed at the end of the time axis,
        leaving the leading rows zero. Per row: a one-hot activity, a
        one-hot group, then five numeric features (position, scaled time
        since the previous event, scaled cumulative time, time since
        midnight, weekday).
        """
        num_features = len(chars) + len(chars_group) + 5
        x = np.zeros((1, maxlen_enc, num_features), dtype=np.float32)
        leftpad = maxlen_enc - len(sentence)
        times2_enc = np.cumsum(times_enc)
        for v, char in enumerate(sentence):
            midnight = times3_enc[v].replace(hour=0,
                                             minute=0,
                                             second=0,
                                             microsecond=0)
            timesincemidnight = times3_enc[v] - midnight
            # Computed but not used afterwards.
            multiset_abstraction = Counter(sentence[:v + 1])
            for c in chars:
                if c == char:
                    x[0, v + leftpad, char_indices[c]] = 1
            for g in chars_group:
                if g == sentence_group[v]:
                    # Group columns are offset by len(char_indices).
                    x[0, v + leftpad,
                      len(char_indices) + char_indices_group[g]] = 1
            x[0, v + leftpad, len(chars) + len(chars_group)] = v + 1
            x[0, v + leftpad,
              len(chars) + len(chars_group) + 1] = times_enc[v] / divisor
            x[0, v + leftpad,
              len(chars) + len(chars_group) + 2] = times2_enc[v] / divisor2
            # NOTE(review): both operands of the next two divisions are ints;
            # on Python 2 without true division they would floor to 0 -
            # confirm.
            x[0, v + leftpad,
              len(chars) + len(chars_group) +
              3] = timesincemidnight.seconds / 86400
            x[0, v + leftpad,
              len(chars) + len(chars_group) + 4] = times3_enc[v].weekday() / 7
        return x

    # modify to be able to get second best prediction
    def get_symbol(predictions, vth_best=0):
        """Return the activity symbol with the (vth_best + 1)-th highest score."""
        v = np.argsort(predictions)[len(predictions) - vth_best - 1]
        return target_indices_char[v]

    def get_symbol_group(predictions, vth_best=0):
        """Return the group symbol with the (vth_best + 1)-th highest score."""
        v = np.argsort(predictions)[len(predictions) - vth_best - 1]
        return target_indices_char_group[v]

    # These two lists are appended to below but never read inside this function.
    one_ahead_gt = []
    one_ahead_pred = []

    with open(
            'output_files/final_experiments/results/baseline/%s_%s.csv' %
        (eventlog[:-4], rnn_type), 'wb') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "Prefix length", "Ground truth", "Predicted", "Levenshtein",
            "Damerau", "Jaccard", "Ground truth times", "Predicted times",
            "RMSE", "MAE", "Median AE", "Ground Truth Group",
            "Predicted Group", "Levenshtein Group"
        ])
        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):

            lines_s,\
                lines_id_s,\
                lines_group_s, \
                lines_t_s, \
                lines_t2_s, \
                lines_t3_s,\
                lines_t4_s = select_declare_verified_traces(path_to_declare_model_file,
                                                            lines,
                                                            lines_id,
                                                            lines_group,
                                                            lines_t,
                                                            lines_t2,
                                                            lines_t3,
                                                            lines_t4,
                                                            prefix_size)

            print(prefix_size)
            print("formulas verified: " + str(len(lines_s)) + " out of : " +
                  str(len(lines)))
            for line, line_id, line_group, times, times2, times3, times4 in zip(
                    lines_s, lines_id_s, lines_group_s, lines_t_s, lines_t2_s,
                    lines_t3_s, lines_t4_s):
                # Appends to the list object taken from lines_t_s in place.
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_line_group = ''.join(line_group[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                cropped_times4 = times4[:prefix_size]
                if len(times2) < prefix_size:
                    continue  # make no prediction for this case, since this case has ended already
                ground_truth = ''.join(line[prefix_size:prefix_size +
                                            predict_size])
                ground_truth_group = ''.join(
                    line_group[prefix_size:prefix_size + predict_size])
                # Ground-truth remaining time: last times2 entry minus the
                # times2 entry of the last prefix event.
                ground_truth_t = times2[prefix_size - 1]
                case_end_time = times2[len(times2) - 1]
                ground_truth_t = case_end_time - ground_truth_t
                predicted = ''
                predicted_group = ''
                total_predicted_time = 0
                for i in range(predict_size):
                    enc = encode(cropped_line, cropped_line_group,
                                 cropped_times, cropped_times3)
                    y = model.predict(enc, verbose=0)  # make predictions
                    # split predictions into seperate activity and time predictions
                    y_char = y[0][0]
                    y_group = y[1][0]
                    y_t = y[2][0][0]
                    prediction = get_symbol(y_char)  # undo one-hot encoding
                    prediction_group = get_symbol_group(
                        y_group)  # undo one-hot encoding
                    cropped_line += prediction
                    cropped_line_group += prediction_group

                    # adds a fake timestamp to the list
                    # The new entry is the previous one plus 2000 seconds;
                    # cropped_times4 is not read anywhere else in this
                    # function.
                    t = time.strptime(cropped_times4[-1], "%Y-%m-%d %H:%M:%S")
                    new_timestamp = datetime.fromtimestamp(
                        time.mktime(t)) + timedelta(0, 2000)
                    cropped_times4.append(
                        new_timestamp.strftime("%Y-%m-%d %H:%M:%S"))

                    if y_t < 0:
                        y_t = 0
                    cropped_times.append(y_t)
                    # end of case was just predicted, therefore, stop predicting further into the future
                    if prediction == '!':
                        one_ahead_pred.append(total_predicted_time)
                        one_ahead_gt.append(ground_truth_t)
                        print('! predicted, end case')
                        break
                    y_t = y_t * divisor3
                    cropped_times3.append(cropped_times3[-1] +
                                          timedelta(seconds=y_t))
                    total_predicted_time = total_predicted_time + y_t
                    predicted += prediction
                    predicted_group += prediction_group
                output = []
                if len(ground_truth) > 0:
                    output.append(prefix_size)
                    output.append(unicode(ground_truth).encode("utf-8"))
                    output.append(unicode(predicted).encode("utf-8"))
                    output.append(
                        1 - distance.nlevenshtein(predicted, ground_truth))
                    # NOTE(review): on Python 2 "/" between two ints floors
                    # unless true division is enabled for this module - confirm.
                    dls = 1 - (damerau_levenshtein_distance(
                        unicode(predicted), unicode(ground_truth)) /
                               max(len(predicted), len(ground_truth)))
                    # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where
                    # the default character encoding of the operating system caused it to be negative,
                    # this should never be the case
                    if dls < 0:
                        dls = 0
                    output.append(dls)
                    output.append(1 -
                                  distance.jaccard(predicted, ground_truth))
                    output.append(ground_truth_t)
                    output.append(total_predicted_time)
                    # Empty placeholder in the "RMSE" column position.
                    output.append('')
                    output.append(
                        metrics.mean_absolute_error([ground_truth_t],
                                                    [total_predicted_time]))
                    output.append(
                        metrics.median_absolute_error([ground_truth_t],
                                                      [total_predicted_time]))
                    output.append(unicode(ground_truth_group).encode("utf-8"))
                    output.append(unicode(predicted_group).encode("utf-8"))
                    output.append(1 - distance.nlevenshtein(
                        predicted_group, ground_truth_group))
                    spamwriter.writerow(output)
    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
                total_predicted_time = total_predicted_time + y_t
                predicted += prediction
            output = []
            if len(ground_truth) > 0:
                output.append(caseid)
                output.append(prefix_size)
                output.append(ground_truth)
                output.append(predicted)
                #print('predicted: ' , predicted)
                #print('ground_truth: ' , ground_truth)
                output.append(1 -
                              distance.nlevenshtein(predicted, ground_truth))
                #print('distaance nlevenshtein: ' , distance.nlevenshtein(predicted, ground_truth))
                #dls = 1 - (damerau_levenshtein_distance(unicode(predicted), unicode(ground_truth)) / max(len(predicted),len(ground_truth)))
                dls = 1 - (
                    damerau_levenshtein_distance(predicted, ground_truth) /
                    max(len(predicted), len(ground_truth)))
                #print('distaance damerau_levenshtein_distance: ' , damerau_levenshtein_distance(predicted, ground_truth))
                #print( 'max  ',max(len(predicted),len(ground_truth)))
                #print( 'jaccard  ',distance.jaccard(predicted, ground_truth))
                #print( 'cos  ',cossim(predicted, ground_truth))

                if dls < 0:
                    dls = 0  # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the default character encoding of the operating system caused it to be negative, this should never be the case
                output.append(dls)
                output.append(1 - distance.jaccard(predicted, ground_truth))
                output.append(ground_truth_t)
                output.append(total_predicted_time)
                output.append('')
                output.append(
                    metrics.mean_absolute_error([ground_truth_t],
Esempio n. 14
0
def damerau_levenshtein_distance(a, b):
    """Return the Damerau-Levenshtein distance between ``a`` and ``b``.

    ``jellyfish.damerau_levenshtein_distance`` is tried first; when it raises
    ``ValueError`` the same call is made on ``py_jellyfish`` instead.
    """
    try:
        result = jellyfish.damerau_levenshtein_distance(a, b)
    except ValueError:
        # c implementation can't deal with unicode, fall back to (slower) python
        result = py_jellyfish.damerau_levenshtein_distance(a, b)
    return result
def runExperiments(logIdentificator, formulaType):
    """Predict activity suffixes and remaining times for the held-out fold.

    Settings for ``logIdentificator`` / ``formulaType`` are obtained from
    ``activateSettings``. The event log ``../data/<eventlog>`` is read into one
    activity string and three timing sequences per case (column 0 is used as
    the case id, column 1 as the activity, column 2 as the timestamp). The
    first two thirds of the cases define the alphabet and ``maxlen``; the
    remaining cases are evaluated.

    For every prefix size in ``[prefix_size_pred_from, prefix_size_pred_to)``
    the traces returned by ``selectFormulaVerifiedTraces`` are extended symbol
    by symbol with the loaded model, and one CSV row with similarity and
    time-error metrics is written per trace to
    ``output_files/results/<formulaType>/suffix_and_remaining_time0_<eventlog>``.

    Returns nothing; progress and the total run time are printed.

    NOTE(review): ``izip``, ``unicode`` and the ``'wb'`` csv mode are Python 2
    idioms, so the body is kept Python 2 compatible. Whether ``/`` below is
    integer or true division depends on a ``from __future__ import division``
    that is not visible from this function -- confirm at the top of the file.
    """
    eventlog, path_to_model_file, beam_size, \
        prefix_size_pred_from, prefix_size_pred_to, formula = activateSettings(logIdentificator, formulaType)

    start_time = time.time()

    lastcase = ''
    line = ''
    firstLine = True
    lines = []
    timeseqs = []  # relative time since previous event
    timeseqs2 = []  # relative time since case start
    timeseqs3 = []  # absolute time of previous event
    times = []
    times2 = []
    times3 = []
    numlines = 0
    casestarttime = None
    lasteventtime = None

    # The context manager guarantees the input log is closed after parsing.
    with open('../data/%s' % eventlog, 'r') as logfile:
        spamreader = csv.reader(logfile, delimiter=',', quotechar='|')
        next(spamreader, None)  # skip the headers

        for row in spamreader:
            t = time.strptime(row[2], "%Y-%m-%d %H:%M:%S")
            # Convert once per row; every time difference below uses this value.
            eventtime = datetime.fromtimestamp(time.mktime(t))
            if row[0] != lastcase:
                # A new case starts: flush the previous one (if any) and reset.
                casestarttime = eventtime
                lasteventtime = eventtime
                lastcase = row[0]
                if not firstLine:
                    lines.append(line)
                    timeseqs.append(times)
                    timeseqs2.append(times2)
                    timeseqs3.append(times3)
                line = ''
                times = []
                times2 = []
                times3 = []
                numlines += 1
            line += getUnicode_fromInt(row[1])
            timesincelastevent = eventtime - lasteventtime
            timesincecasestart = eventtime - casestarttime
            times.append(86400 * timesincelastevent.days + timesincelastevent.seconds)
            times2.append(86400 * timesincecasestart.days + timesincecasestart.seconds)
            times3.append(eventtime)
            lasteventtime = eventtime
            firstLine = False

    # add last case
    lines.append(line)
    timeseqs.append(times)
    timeseqs2.append(times2)
    timeseqs3.append(times3)
    numlines += 1

    # Normalisation constants for the time features and the predicted times.
    divisor = np.mean([item for sublist in timeseqs for item in sublist])
    print('divisor: {}'.format(divisor))
    divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])
    print('divisor2: {}'.format(divisor2))
    # Lists (not map objects) so np.mean receives real sequences on any Python.
    divisor3 = np.mean([np.mean([seq[len(seq)-1] - value for value in seq])
                        for seq in timeseqs2])
    print('divisor3: {}'.format(divisor3))

    elems_per_fold = int(round(numlines/3))

    # Alphabet and maximum length come from the first two folds, each trace
    # terminated with the end-of-case symbol '!'.
    fold1and2lines = [trace + '!' for trace in lines[:2*elems_per_fold]]
    maxlen = max(len(trace) for trace in fold1and2lines)

    chars = sorted(set().union(*fold1and2lines))
    target_chars = list(chars)
    chars.remove('!')  # '!' is a prediction target only, never an input feature
    print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    target_indices_char = dict((i, c) for i, c in enumerate(target_chars))
    print(indices_char)

    # we only need the third fold, because first two were used for training
    lines_t = timeseqs[2*elems_per_fold:]
    lines_t2 = timeseqs2[2*elems_per_fold:]
    lines_t3 = timeseqs3[2*elems_per_fold:]
    lines = lines[2*elems_per_fold:]

    # set parameters
    predict_size = maxlen

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    # define helper functions

    # this one encodes the current sentence into the onehot encoding
    def encode(sentence, times, times3, maxlen=maxlen):
        num_features = len(chars)+5
        X = np.zeros((1, maxlen, num_features), dtype=np.float32)
        leftpad = maxlen-len(sentence)  # sentences are right-aligned in X
        times2 = np.cumsum(times)
        for t, char in enumerate(sentence):
            midnight = times3[t].replace(hour=0, minute=0, second=0, microsecond=0)
            timesincemidnight = times3[t]-midnight
            for c in chars:
                if c == char:
                    X[0, t+leftpad, char_indices[c]] = 1
            X[0, t+leftpad, len(chars)] = t+1
            X[0, t+leftpad, len(chars)+1] = times[t]/divisor
            X[0, t+leftpad, len(chars)+2] = times2[t]/divisor2
            X[0, t+leftpad, len(chars)+3] = timesincemidnight.seconds/86400
            X[0, t+leftpad, len(chars)+4] = times3[t].weekday()/7
        return X

    # modify to be able to get second best prediction
    def getSymbol(predictions, ith_best=0):
        i = np.argsort(predictions)[len(predictions) - ith_best - 1]
        return target_indices_char[i]

    one_ahead_gt = []
    one_ahead_pred = []

    with open('output_files/results/'+formulaType+'/suffix_and_remaining_time0_%s' % eventlog, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(["Prefix length", "Groud truth", "Predicted", "Levenshtein", "Damerau", "Jaccard", "Ground truth times", "Predicted times", "RMSE", "MAE", "Median AE"])
        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):
            lines_s, lines_t_s, lines_t2_s, lines_t3_s = selectFormulaVerifiedTraces(lines, lines_t, lines_t2, lines_t3, formula, prefix_size)
            print(prefix_size)
            print("formulas verifited: " + str(len(lines_s)) + " out of : " + str(len(lines)))
            for line, times, times2, times3 in izip(lines_s, lines_t_s, lines_t2_s, lines_t3_s):
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                if len(times2) < prefix_size:
                    continue  # make no prediction for this case, since this case has ended already
                ground_truth = ''.join(line[prefix_size:prefix_size+predict_size])
                # Remaining time = time of the last event minus time at the prefix end.
                ground_truth_t = times2[prefix_size-1]
                case_end_time = times2[len(times2)-1]
                ground_truth_t = case_end_time-ground_truth_t
                predicted = ''
                total_predicted_time = 0
                for _ in range(predict_size):
                    enc = encode(cropped_line, cropped_times, cropped_times3)
                    y = model.predict(enc, verbose=0)  # make predictions
                    # split predictions into seperate activity and time predictions
                    y_char = y[0][0]
                    y_t = y[1][0][0]
                    prediction = getSymbol(y_char)  # undo one-hot encoding
                    cropped_line += prediction
                    if y_t < 0:
                        y_t = 0
                    cropped_times.append(y_t)
                    if prediction == '!':  # end of case was just predicted, therefore, stop predicting further into the future
                        one_ahead_pred.append(total_predicted_time)
                        one_ahead_gt.append(ground_truth_t)
                        print('! predicted, end case')
                        break
                    y_t = y_t * divisor3
                    cropped_times3.append(cropped_times3[-1] + timedelta(seconds=y_t))
                    total_predicted_time = total_predicted_time + y_t
                    predicted += prediction
                output = []
                if len(ground_truth) > 0:
                    output.append(prefix_size)
                    output.append(unicode(ground_truth).encode("utf-8"))
                    output.append(unicode(predicted).encode("utf-8"))
                    output.append(1 - distance.nlevenshtein(predicted, ground_truth))
                    dls = 1 - (damerau_levenshtein_distance(unicode(predicted), unicode(ground_truth)) / max(len(predicted), len(ground_truth)))
                    if dls < 0:
                        dls = 0  # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the default character encoding of the operating system caused it to be negative, this should never be the case
                    output.append(dls)
                    output.append(1 - distance.jaccard(predicted, ground_truth))
                    output.append(ground_truth_t)
                    output.append(total_predicted_time)
                    output.append('')  # the "RMSE" column is left empty
                    output.append(metrics.mean_absolute_error([ground_truth_t], [total_predicted_time]))
                    output.append(metrics.median_absolute_error([ground_truth_t], [total_predicted_time]))
                    spamwriter.writerow(output)
    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
Esempio n. 16
0
def test(args, preprocess_manager):
    """Predict suffixes and remaining times for the test set and write a CSV.

    The test set comes from ``preprocess_manager.create_test_set()`` and the
    model is loaded from ``<args.checkpoint_dir>model_suffix_prediction.h5``.
    Results are written to
    ``<args.result_dir><data set name>_<args.task>.csv`` with ``;`` as the
    delimiter, one row per (case, prefix size) for which a non-empty suffix was
    predicted and a non-empty ground-truth suffix exists.

    For each case, prefix sizes ``2 .. seq_max_length - 1`` are tried in order.
    Processing of a case stops at the first prefix that contains the
    end symbol ``'!'`` or that fails ``preprocess_manager.check_candidate``.

    Args:
        args: settings object; the attributes read here are ``checkpoint_dir``,
            ``data_set``, ``result_dir``, ``task`` and ``next_best_action``.
        preprocess_manager: provides ``create_test_set``, ``check_candidate``
            and ``transform_new_instance``.
    """
    # get test set
    (lines, case_ids, lines_t, lines_t2, lines_t3, seq_max_length,
     num_features_all, num_features_activities) = preprocess_manager.create_test_set()
    model_suffix_prediction = load_model('%smodel_suffix_prediction.h5' %
                                         args.checkpoint_dir)  # load model

    # set options for result output
    data_set_name = args.data_set.split('.csv')[0]
    generic_result_dir = args.result_dir + data_set_name + "_" + args.task
    result_dir = generic_result_dir + ".csv"

    # start prediction
    # newline='' is what the csv module requires for files it writes to;
    # without it, platforms that translate '\n' emit an extra '\r' per row.
    with open(result_dir, 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=';',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "CaseID", "Prefix length", "Ground truth", "Predicted",
            "Levenshtein", "Damerau", "Jaccard", "Ground truth times",
            "Predicted times", "MAE", "In time", "Dev. in time",
            "Num interventions"
        ])

        for line, case_id, times, times2, times3 in zip(
                lines, case_ids, lines_t, lines_t2, lines_t3):
            for prefix_size in range(2, seq_max_length):  # size > 1

                print("\nPrefix size: %d" % prefix_size)

                # current = ground truth prefix + predicted suffix
                current = {
                    "line": ''.join(line[:prefix_size]),
                    "times": times[:prefix_size],
                    "times2": times2[:prefix_size],
                    "times3": times3[:prefix_size]
                }

                if '!' in current["line"]:  # termination
                    break

                ground_truth = {
                    "total_event": ''.join(line[:]),
                    "prefix_event": ''.join(line[:prefix_size]),
                    "suffix_event": ''.join(line[prefix_size:]),
                    "total_time": times2[len(times2) - 1],
                    "prefix_time": times2[prefix_size - 1],
                    "suffix_time":
                    times2[len(times2) - 1] - times2[prefix_size - 1]
                }

                predict = {
                    "size": seq_max_length - 1,
                    "predicted": '',
                    "suffix_time": 0
                }

                # Check prefix conformance; a non-conformant prefix ends the
                # processing of this case in both prediction modes.
                if not preprocess_manager.check_candidate(
                        args,
                        preprocess_manager.transform_new_instance(
                            ground_truth["prefix_event"])):
                    break

                if args.next_best_action:
                    predict, in_time, deviation_in_time, num_interventions = predict_suffix_and_time_for_prefix_next_best_event(
                        args, model_suffix_prediction, preprocess_manager,
                        current, predict, ground_truth)
                else:
                    predict, in_time, deviation_in_time = predict_suffix_and_time_for_prefix(
                        model_suffix_prediction, preprocess_manager,
                        current, predict, ground_truth)
                    # No interventions are reported in plain prediction mode.
                    num_interventions = 0

                if predict["predicted"] == "":  # termination
                    continue

                if len(ground_truth["suffix_event"]) > 0:
                    predicted_suffix = predict["predicted"]
                    true_suffix = ground_truth["suffix_event"]

                    # Normalised Damerau-Levenshtein similarity, floored at 0.
                    dls = 1 - (damerau_levenshtein_distance(
                        str(predicted_suffix), str(true_suffix)) /
                               max(len(predicted_suffix), len(true_suffix)))
                    if dls < 0:
                        dls = 0

                    # NOTE(review): the two encoded cells are bytes objects;
                    # on Python 3 csv.writer presumably stores their repr
                    # (b'...') rather than the text -- confirm this is what
                    # downstream consumers expect before changing it.
                    spamwriter.writerow([
                        case_id,
                        prefix_size,
                        str(true_suffix).encode("utf-8"),
                        str(predicted_suffix).encode("utf-8"),
                        1 - distance.nlevenshtein(predicted_suffix,
                                                  true_suffix),
                        dls,
                        1 - distance.jaccard(predicted_suffix, true_suffix),
                        ground_truth["suffix_time"],
                        predict["suffix_time"],
                        metrics.mean_absolute_error(
                            [ground_truth["suffix_time"]],
                            [predict["suffix_time"]]),
                        in_time,
                        deviation_in_time,
                        num_interventions,
                    ])
Esempio n. 17
0
def test_damerau_levenshtein_distance_type(jf):
    """Text arguments are accepted; byte strings raise ``TypeError``."""
    dl_distance = jf.damerau_levenshtein_distance

    # Unicode text must go through without raising.
    dl_distance(u"abc", u"abc")

    # Byte strings must be rejected, with "expected" in the error message.
    with pytest.raises(TypeError) as excinfo:
        dl_distance(b"abc", b"abc")
    message = str(excinfo.value)
    assert "expected" in message
Esempio n. 18
0
def _most_frequent_values(facet, limit=1000):
    """Return at most ``limit`` values of ``facet``, most frequent first.

    NOTE(review): ``value_info`` is not defined in this block; it is presumably
    a module-level mapping of facet -> {value: frequency} -- confirm.
    """
    frequencies = value_info.get(facet)
    value_df = pd.DataFrame.from_dict(
        frequencies, orient='index').reset_index().rename(columns={
            "index": "value",
            0: "frequency"
        }).sort_values(['frequency'], ascending=False).head(n=limit)
    return value_df['value'].tolist()


def fuzzy_value_scoring(values_list1, values_list2):
    """Score how well two lists of string values match each other.

    This is a pairwise string matcher, not an all-by-all comparison: every
    value of the shorter list is paired only with its best match in the longer
    list, where "best" means the highest Jaro-Winkler score (the first one on
    ties). For each such pair the Jaro-Winkler score, the Damerau-Levenshtein
    distance and the Hamming distance are recorded.

    If a list has more than 1000 values, it is replaced by its 1000 most
    frequently used values so that the computation stays doable.

    Returns:
        dict with the keys ``<metric>_quant`` (0.9 quantile), ``<metric>_mean``
        and ``<metric>_std`` for ``jaro_distance``,
        ``damerau_levenshtein_distance`` and ``hamming_distance``. Every value
        is the string ``'N.A.'`` when one or both input lists are empty.
    """
    score_columns = ('jaro_distance', 'damerau_levenshtein_distance',
                     'hamming_distance')

    if len(values_list1) == 0 or len(values_list2) == 0:
        # 'N.A.' returned if one or both of the facets dont have any values.
        return {
            column + suffix: 'N.A.'
            for column in score_columns
            for suffix in ('_quant', '_mean', '_std')
        }

    # NOTE(review): facet1 / facet2 are not parameters of this function; they
    # must exist in an enclosing scope whenever a list exceeds 1000 values.
    if len(values_list1) > 1000:
        values_list1 = _most_frequent_values(facet1)
    if len(values_list2) > 1000:
        values_list2 = _most_frequent_values(facet2)

    if len(values_list1) > len(values_list2):
        short_list, long_list = values_list2, values_list1
    else:
        short_list, long_list = values_list1, values_list2

    # calculate the best fuzzy matches
    best_match_list = []
    for value1 in short_list:
        # Only Jaro-Winkler decides the best partner, so it is the only metric
        # evaluated for every candidate; max() keeps the first of equal scores.
        jaro_winkler, value2 = max(
            ((jellyfish.jaro_winkler(value1, candidate), candidate)
             for candidate in long_list),
            key=lambda scored: scored[0])

        # The remaining metrics are needed for the selected pair only.
        try:
            dl_distance = jellyfish.damerau_levenshtein_distance(
                value1, value2)
        except ValueError:
            # Fall back to the py_jellyfish implementation when the default
            # one rejects the input.
            dl_distance = py_jellyfish.damerau_levenshtein_distance(
                value1, value2)
        hamming_distance = jellyfish.hamming_distance(value1, value2)

        best_match_list.append(
            (value1, value2, jaro_winkler, dl_distance, hamming_distance))

    df = pd.DataFrame(best_match_list,
                      columns=['facet1', 'facet2'] + list(score_columns))

    # so a good match will be a high mean, low std. The quantile is prob better than mean.
    results = {}
    for column in score_columns:
        results[column + '_quant'] = df[column].quantile(0.9)
        results[column + '_mean'] = df[column].mean()
        results[column + '_std'] = df[column].std()
    return results
Esempio n. 19
0
def evaluate(train_log, test_log, model_folder, model_file):
    """Predict the next activity for every prefix of the test log.

    Both logs are CSV files with a header row; column 0 is used as the case id
    and column 1 as an integer activity id, which is mapped to the character
    ``chr(id + 161)``. The train and test logs together define the alphabet
    and the maximum trace length used for the one-hot encoding. The test log
    is then read again on its own and, for every prefix length from 1 to
    ``maxlen - 1``, the model's prediction for the next symbol is compared
    with the actual one.

    Args:
        train_log: path of the training log.
        test_log: path of the test log.
        model_folder: folder containing the model; ``predictions.csv`` is
            written into this folder as well.
        model_file: file name of the model inside ``model_folder``.

    Returns nothing; the results are written to
    ``<model_folder>/predictions.csv``.

    NOTE(review): each data row has 12 cells while the header names 11
    columns; this is kept unchanged so existing consumers still parse it.
    """
    caseid_col = 0
    task_col = 1
    ascii_offset = 161

    lastcase = ''
    line = ''
    firstLine = True
    lines = []

    # Pass 1: train log, then test log, only to derive the alphabet and maxlen.
    # lastcase / line / firstLine are intentionally NOT reset between the two
    # files, so the grouping of rows into traces is continuous across them.
    for log_path in (train_log, test_log):
        with open(log_path, 'r') as logfile:
            spamreader = csv.reader(logfile, delimiter=',', quotechar='|')
            next(spamreader, None)  # skip the headers
            for row in spamreader:  # the rows are "CaseID,ActivityID,CompleteTimestamp"
                if row[caseid_col] != lastcase:  # 'lastcase' is to save the last executed case for the loop
                    lastcase = row[caseid_col]
                    if not firstLine:
                        lines.append(line)
                    line = ''
                line += chr(int(row[task_col]) + ascii_offset)
                firstLine = False

        # add last case
        lines.append(line)

    # Terminate every trace with the end-of-case symbol '!'.
    lines = [trace + '!' for trace in lines]
    maxlen = max(len(trace) for trace in lines)

    chars = sorted(set().union(*lines))
    target_chars = list(chars)
    chars.remove('!')  # '!' can be predicted but is never an input feature
    print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    target_indices_char = dict((i, c) for i, c in enumerate(target_chars))
    print(indices_char)

    # Pass 2: the test log on its own, this time keeping the case ids.
    lastcase = ''
    line = ''
    firstLine = True
    lines = []
    caseids = []

    with open(test_log, 'r') as logfile:
        spamreader = csv.reader(logfile, delimiter=',', quotechar='|')
        next(spamreader, None)  # skip the headers
        for row in spamreader:
            if row[caseid_col] != lastcase:
                caseids.append(row[caseid_col])
                lastcase = row[caseid_col]
                if not firstLine:
                    lines.append(line)
                line = ''
            line += chr(int(row[task_col]) + ascii_offset)
            firstLine = False

    # add last case
    lines.append(line)

    # set parameters
    predict_size = 1

    # load model, set this to the model generated by train.py
    model = load_model(os.path.join(model_folder, model_file))

    # define helper functions
    def encode(sentence, maxlen=maxlen):
        """One-hot encode ``sentence``, right-aligned, plus a position feature."""
        num_features = len(chars)+1
        X = np.zeros((1, maxlen, num_features), dtype=np.float32)
        leftpad = maxlen-len(sentence)
        for t, char in enumerate(sentence):
            # Symbols outside the input alphabet leave the one-hot part at 0.
            if char in char_indices:
                X[0, t+leftpad, char_indices[char]] = 1
            X[0, t+leftpad, len(chars)] = t+1
        return X

    def getSymbol(predictions):
        """Return the target symbol with the highest score (last one on ties)."""
        maxPrediction = 0
        symbol = ''
        for i, prediction in enumerate(predictions):
            if prediction >= maxPrediction:
                maxPrediction = prediction
                symbol = target_indices_char[i]
        return symbol

    # make predictions
    # newline='' is what the csv module requires for files it writes to;
    # without it, platforms that translate '\n' emit an extra '\r' per row.
    with open(os.path.join(model_folder, "predictions.csv"), 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(["CaseID", "Prefix length", "Groud truth", "Predicted", "Levenshtein", "Damerau", "Jaccard", "Ground truth times", "Predicted times", "RMSE", "MAE"])
        for prefix_size in range(1, maxlen):
            print(prefix_size)
            for line, caseid in zip(lines, caseids):
                cropped_line = ''.join(line[:prefix_size])
                if '!' in cropped_line:
                    continue  # make no prediction for this case, since this case has ended already
                ground_truth = ''.join(line[prefix_size:prefix_size+predict_size])
                predicted = ''
                for i in range(predict_size):
                    if len(ground_truth) <= i:
                        continue  # nothing left to compare against at this step
                    enc = encode(cropped_line)
                    y = model.predict(enc, verbose=0)
                    y_char = y[0]
                    prediction = getSymbol(y_char)
                    cropped_line += prediction
                    if prediction == '!':  # end of case was just predicted, therefore, stop predicting further into the future
                        break
                    predicted += prediction
                if len(ground_truth) > 0:
                    dls = 1 - (damerau_levenshtein_distance(str(predicted), str(ground_truth)) / max(len(predicted), len(ground_truth)))
                    if dls < 0:
                        dls = 0  # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the default character encoding of the operating system caused it to be negative, this should never be the case
                    spamwriter.writerow([
                        caseid,
                        prefix_size,
                        str(ground_truth),
                        str(predicted),
                        1 - distance.nlevenshtein(predicted, ground_truth),
                        dls,
                        1 - distance.jaccard(predicted, ground_truth),
                        ' ',  # no time predictions are made here
                        ' ',
                        '',
                        '',
                        '',
                    ])
def run_experiments(log_identificator, formula_type):
    eventlog, path_to_model_file, beam_size, \
        prefix_size_pred_from, prefix_size_pred_to, formula = activateSettings(log_identificator, formula_type)

    current_path = os.path.abspath(getsourcefile(lambda: 0))
    current_dir = os.path.dirname(current_path)
    parent_dir = current_dir[:current_dir.rfind(os.path.sep)]

    sys.path.insert(0, parent_dir)

    start_time = time.time()

    lines, lines_t, lines_t2, lines_t3, maxlen, chars, char_indices, divisor, divisor2, \
        divisor3, predict_size, target_indices_char, target_char_indices = prepare_testing_data(eventlog)

    # find cycles and modify the probability functionality goes here
    stop_symbol_probability_amplifier_current = 1

    # modify to be able to get second best prediction
    def getSymbol(predictions, ith_best=0):
        predictions[
            0] = predictions[0] * stop_symbol_probability_amplifier_current
        i = np.argsort(predictions)[len(predictions) - ith_best - 1]
        return target_indices_char[i]

    one_ahead_gt = []
    one_ahead_pred = []

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)
    stop_symbol_probability_amplifier_current = 1
    # make predictions
    with open(
            'output_files/results/' + formula_type +
            '/suffix_and_remaining_time2_%s' % eventlog, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "Prefix length", "Groud truth", "Predicted", "Levenshtein",
            "Damerau", "Jaccard", "Ground truth times", "Predicted times",
            "RMSE", "MAE", "Median AE"
        ])
        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):
            # here we checkout the prefixes with formulas verified only on the suffix phase
            lines_s, lines_t_s, lines_t2_s, lines_t3_s = selectFormulaVerifiedTraces(
                lines, lines_t, lines_t2, lines_t3, formula, prefix_size)
            print("prefix size: " + str(prefix_size))
            print("formulas verifited: " + str(len(lines_s)) + " out of : " +
                  str(len(lines)))
            for line, times, times2, times3 in zip(lines_s, lines_t_s,
                                                   lines_t2_s, lines_t3_s):
                prediction_end_reached = False
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                if len(times2) < prefix_size:
                    continue  # make no prediction for this case, since this case has ended already

                # initialize root of the tree for beam search
                total_predicted_time_initialization = 0
                search_tree_root = MultileafTree(
                    beam_size,
                    encode(cropped_line, cropped_times, cropped_times3, maxlen,
                           chars, char_indices, divisor, divisor2),
                    cropped_line, total_predicted_time_initialization)

                prediction_end_reached = False

                ground_truth = ''.join(line[prefix_size:prefix_size +
                                            predict_size])
                ground_truth_t = times2[prefix_size - 1]
                case_end_time = times2[len(times2) - 1]
                ground_truth_t = case_end_time - ground_truth_t
                predicted = ''

                for i in range(predict_size):
                    # here we will take data from the node in the tree used to prun
                    enc = search_tree_root.data  # encode(cropped_line, cropped_times, cropped_times3)
                    y = model.predict(enc, verbose=0)  # make predictions
                    # split predictions into seperate activity and time predictions
                    y_char = y[0][0]
                    y_t = y[1][0][0]

                    stop_symbol_probability_amplifier_current, \
                        start_of_the_cycle_symbol = amplify(search_tree_root.cropped_line)

                    # cropped_line += prediction
                    if y_t < 0:
                        y_t = 0
                    # TOO not normalizing here seems like a bug
                    cropped_times.append(y_t)

                    ma = False
                    for i in range(beam_size):
                        prediction = getSymbolAmpl(
                            y_char, target_indices_char, target_char_indices,
                            start_of_the_cycle_symbol,
                            stop_symbol_probability_amplifier_current, i)
                        # end of case was just predicted, therefore, stop predicting further into the future
                        if prediction == '!':
                            if verify_formula_as_compliant(
                                    search_tree_root.cropped_line, formula,
                                    prefix_size):
                                one_ahead_pred.append(
                                    search_tree_root.total_predicted_time)
                                one_ahead_gt.append(ground_truth_t)
                                print('! predicted, end case')
                                ma = True
                                break

                            # else:
                            #     prediction_end_reached = True;
                    if ma:
                        break
                    # if the end of prediction was not reached we continue as always, and then function :choose_next_
                    # top_descendant: will earch for future prediction

                    # in not reached, function :choose_next_top_descendant: will backtrack
                    y_t = y_t * divisor3
                    if not prediction_end_reached:
                        cropped_times3.append(cropped_times3[-1] +
                                              timedelta(seconds=y_t))

                        for i in range(beam_size):
                            temp_prediction = getSymbolAmpl(
                                y_char, target_indices_char,
                                target_char_indices, start_of_the_cycle_symbol,
                                stop_symbol_probability_amplifier_current, i)
                            if temp_prediction == '!':
                                continue
                            temp_cropped_line = search_tree_root.cropped_line + temp_prediction

                            # this means that we found the end in one of the alternatives.
                            temp_total_predicted_time = search_tree_root.total_predicted_time + y_t

                            temp_state_data = encode(temp_cropped_line,
                                                     cropped_times,
                                                     cropped_times3, maxlen,
                                                     chars, char_indices,
                                                     divisor, divisor2)
                            search_tree_root.descendants[i] = MultileafTree(
                                beam_size, temp_state_data, temp_cropped_line,
                                temp_total_predicted_time, search_tree_root)

                    search_tree_root = search_tree_root.choose_next_top_descendant(
                    )
                    if prediction_end_reached:
                        prediction_end_reached = False
                    if search_tree_root is None:
                        print(
                            "Cannot find any trace that is compliant with formula given current beam size"
                        )
                        break

                output = []

                if search_tree_root is None:
                    predicted = u""
                    total_predicted_time = 0
                else:
                    predicted = (search_tree_root.cropped_line[prefix_size:])
                    total_predicted_time = search_tree_root.total_predicted_time

                if len(ground_truth) > 0:
                    output.append(prefix_size)
                    output.append(unicode(ground_truth).encode("utf-8"))
                    output.append(unicode(predicted).encode("utf-8"))
                    output.append(
                        1 - distance.nlevenshtein(predicted, ground_truth))
                    dls = 1 - (damerau_levenshtein_distance(
                        unicode(predicted), unicode(ground_truth)) /
                               max(len(predicted), len(ground_truth)))
                    if dls < 0:
                        dls = 0
                    # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the
                    # default character encoding of the operating system caused it to be negative, this should never
                    # be the case
                    output.append(dls)
                    output.append(1 -
                                  distance.jaccard(predicted, ground_truth))
                    output.append(ground_truth_t)
                    output.append(total_predicted_time)
                    output.append('')
                    output.append(
                        metrics.mean_absolute_error([ground_truth_t],
                                                    [total_predicted_time]))
                    output.append(
                        metrics.median_absolute_error([ground_truth_t],
                                                      [total_predicted_time]))
                    spamwriter.writerow(output)
    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
def fuzzy_value_scoring(values_list1, values_list2):
    """
    Pairwise fuzzy string matcher for two lists of values.

    Each value of the shorter list is compared with every value of the longer
    list and only the pairing with the highest Jaro-Winkler score is kept
    (best match per value, not all-by-all).

    Returns a dict with the 0.9 quantile, mean and standard deviation of the
    Jaro-Winkler, Damerau-Levenshtein and Hamming scores of those best
    matches. When either list is empty every entry is the string 'N.A.'.
    """
    metric_columns = ('jaro_distance', 'damerau_levenshtein_distance',
                      'hamming_distance')

    # 'N.A.' returned if one or both of the facets dont have any values.
    if len(values_list1) == 0 or len(values_list2) == 0:
        return {
            column + suffix: 'N.A.'
            for column in metric_columns
            for suffix in ('_quant', '_mean', '_std')
        }

    # The outer loop always walks the shorter list; on equal lengths the
    # first argument plays that role.
    if len(values_list1) > len(values_list2):
        outer_values, inner_values = values_list2, values_list1
    else:
        outer_values, inner_values = values_list1, values_list2

    # calculate the best fuzzy matches
    best_rows = []
    for outer in outer_values:
        scored = []
        for inner in inner_values:
            # Fall back to the pure-Python implementation when the default
            # one rejects the input with a ValueError.
            try:
                dl_score = jellyfish.damerau_levenshtein_distance(
                    outer, inner)
            except ValueError:
                dl_score = py_jellyfish.damerau_levenshtein_distance(
                    outer, inner)

            jw_score = jellyfish.jaro_winkler(outer, inner)
            hamming_score = jellyfish.hamming_distance(outer, inner)
            scored.append((outer, inner, jw_score, dl_score, hamming_score))
        # index 2 is the Jaro-Winkler score; ties keep the first candidate
        best_rows.append(max(scored, key=lambda entry: entry[2]))

    frame = pd.DataFrame(best_rows,
                         columns=['facet1', 'facet2'] + list(metric_columns))

    # so a good match will be a high mean, low std. The quantile is prob better than mean.
    results = {}
    for column in metric_columns:
        series = frame[column]
        results[column + '_quant'] = series.quantile(0.9)
        results[column + '_mean'] = series.mean()
        results[column + '_std'] = series.std()
    return results
Esempio n. 22
0
    def test(self, ):
        """
        Generates a file with predictions for next activities and time

        Loads the model stored at ``self.model_name`` and, for every prefix
        length from 2 up to (excluding) ``self.maxlen`` and every case,
        predicts the next activity together with a time value. One CSV row per
        prediction is written to
        ``Results/1hotnext_activity_and_time_<model dir name><eventlog>`` and
        the predicted / ground-truth times are appended to the
        ``self.*_ahead_pred`` / ``self.*_ahead_gt`` lists.

        The per-case time lists in ``self.lines_t`` are not modified: the
        trailing 0 is appended to a copy.

        ***Helper Variables***
        predict_size : int
            number of predictions
        model : tf.keras.models
            trained model loaded from self.model_name
        model_type: str
            name of the directory that contains the model file
        file_name: str
            name of the output file
        spamwriter: object
            csv writer object
        prefix_size: int
            size of eventlog prefix
        self.char_indices : dict
            ascii coded characters of the unique activities to integer indices
        self.indices_char: dict
            integer indices to ascii coded characters of the unique activities
        self.target_char_indices: dict
            ascii coded characters of the target unique activities to integer indices
            (target includes one excess activity '!' case end)
        self.target_indices_char: dict
            integer indices to ascii coded characters of the target unique activities
        self.lines: list
            these are all the activity sequences (ActivityIDs)
        self.lines_t: list
            differences between two events
        self.lines_t2: list
            differences between the current and first of test_set
        self.lines_t3 : list
            Midnight time
        self.lines_t4 : list
            Day of the week
        self.one_ahead_gt : list
            helper variable to predict one ahead
        self.one_ahead_pred : list
            helper variable to predict one ahead
        self.two_ahead_gt : list
            helper variable to predict two ahead
        self.two_ahead_pred : list
            helper variable to predict two ahead
        self.three_ahead_gt : list
            helper variable to predict three ahead
        self.three_ahead_pred : list
            helper variable to predict three ahead
        cropped_line: str
            running activities while predicting
        cropped_times: list
            running time differences while predicting
        cropped_times3: list
            running time difference from case start
        line: str
            activity items of one case
        times: list
            time differences between current and previous event of one case
        times3: list
            time differences between current and first event of one case
        ground_truth: str
            ground truth activity
        ground_truth_t: list
            ground truth time difference
        predicted: str
            predicted activity
        predicted_t: list
            predicted times
        y : list
            all predictions
        y_char : array
            numerical prediction for activities
        y_t: float
            direct time prediction
        output: list
            complete list of output
        """

        # set parameters
        predict_size = 1

        # load model, set this to the model generated by train.py
        model = tf.keras.models.load_model(
            self.model_name,
            compile=False,
            custom_objects={"TLSTM_layer": TLSTM_layer})

        # name of the output: directory holding the model file + event log name
        model_type = os.path.basename(os.path.dirname(self.model_name))
        file_name = model_type + self.eventlog

        # make predictions
        # newline='' is what the csv module requires for files handed to
        # csv.writer; without it rows end in "\r\r\n" on Windows.
        with open('Results/1hotnext_activity_and_time_%s' % file_name,
                  'w',
                  encoding="utf-8",
                  newline='') as csvfile:
            spamwriter = csv.writer(csvfile,
                                    delimiter=',',
                                    quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL)
            # NOTE(review): this header names 12 columns while every data row
            # below carries 13 values (an empty placeholder under "RMSE", the
            # mean absolute error under "MAE", then an unlabeled median
            # absolute error). Left unchanged so existing consumers of the
            # file keep working -- confirm before renaming or adding columns.
            spamwriter.writerow([
                "CaseID", "Prefix length", "Groud truth", "Predicted",
                "Confidence", "Levenshtein", "Damerau", "Jaccard",
                "Ground truth times", "Predicted times", "RMSE", "MAE"
            ])
            for prefix_size in range(2, self.maxlen):
                print(prefix_size)
                for line, caseid, times, times3 in zip(self.lines,
                                                       self.caseids,
                                                       self.lines_t,
                                                       self.lines_t3):
                    # Pad with one trailing 0 on a copy. Appending to `times`
                    # itself would grow the list stored in self.lines_t by
                    # one element on every prefix-size pass.
                    times = list(times) + [0]
                    cropped_line = ''.join(line[:prefix_size])

                    cropped_times = times[:prefix_size]
                    cropped_times3 = times3[:prefix_size]

                    if '!' in cropped_line:
                        continue  # make no prediction for this case, since this case has ended already
                    ground_truth = ''.join(line[prefix_size:prefix_size +
                                                predict_size])
                    ground_truth_t = times[prefix_size:prefix_size +
                                           predict_size]
                    predicted = ''
                    predicted_t = []
                    for i in range(predict_size):
                        # nothing left in the case to compare against
                        if len(ground_truth) <= i:
                            continue
                        enc = self.encode(cropped_line, cropped_times,
                                          cropped_times3, self.num_features)
                        y = model.predict(enc, verbose=0)
                        # split predictions into separate activity and time predictions
                        y_char = y[0][0]
                        y_t = y[1][0][0]
                        prediction = self.getSymbol(y_char)
                        confidence = np.round(np.max(y_char) * 100)

                        cropped_line += prediction
                        if y_t < 0:
                            y_t = 0
                        cropped_times.append(y_t)
                        # rescale the model output; the result is used as a
                        # number of seconds below
                        y_t = y_t * self.divisor
                        cropped_times3.append(cropped_times3[-1] +
                                              timedelta(seconds=y_t))
                        predicted_t.append(y_t)
                        if i == 0 and len(ground_truth_t) > 0:
                            self.one_ahead_pred.append(y_t)
                            self.one_ahead_gt.append(ground_truth_t[0])
                        if i == 1 and len(ground_truth_t) > 1:
                            self.two_ahead_pred.append(y_t)
                            self.two_ahead_gt.append(ground_truth_t[1])
                        if i == 2 and len(ground_truth_t) > 2:
                            self.three_ahead_pred.append(y_t)
                            self.three_ahead_gt.append(ground_truth_t[2])
                        if prediction == '!':  # end of case was just predicted, therefore, stop predicting further into the future
                            print('! predicted, end case')
                            break
                        predicted += prediction
                    output = []
                    if len(ground_truth) > 0:
                        output.append(caseid)
                        output.append(prefix_size)
                        output.append(str(ground_truth))
                        output.append(str(predicted))
                        output.append(confidence)
                        output.append(
                            1 - distance.nlevenshtein(predicted, ground_truth))
                        dls = 1 - (damerau_levenshtein_distance(
                            str(predicted), str(ground_truth)) /
                                   max(len(predicted), len(ground_truth)))
                        if dls < 0:
                            dls = 0  # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the default character encoding of the operating system caused it to be negative, this should never be the case
                        output.append(dls)
                        output.append(
                            1 - distance.jaccard(predicted, ground_truth))
                        output.append('; '.join(
                            str(x) for x in ground_truth_t))
                        output.append('; '.join(str(x) for x in predicted_t))
                        if len(predicted_t) > len(
                                ground_truth_t
                        ):  # if predicted more events than length of case, only use needed number of events for time evaluation
                            predicted_t = predicted_t[:len(ground_truth_t)]
                        if len(ground_truth_t) > len(
                                predicted_t
                        ):  # if predicted less events than length of case, put 0 as placeholder prediction
                            # zeros, not range(): range(k) would pad with
                            # 0, 1, 2, ... once predict_size exceeds 1
                            predicted_t.extend(
                                [0] * (len(ground_truth_t) - len(predicted_t)))
                        if len(ground_truth_t) > 0 and len(predicted_t) > 0:
                            output.append('')
                            output.append(
                                metrics.mean_absolute_error(
                                    [ground_truth_t[0]], [predicted_t[0]]))
                            output.append(
                                metrics.median_absolute_error(
                                    [ground_truth_t[0]], [predicted_t[0]]))
                        else:
                            output.append('')
                            output.append('')
                            output.append('')
                        spamwriter.writerow(output)
Esempio n. 23
0
def test_damerau_levenshtein_distance_type(jf):
    """Text input is accepted; byte strings must raise a TypeError.

    The error message is expected to contain the word 'expected'.
    """
    dl_distance = jf.damerau_levenshtein_distance

    # unicode text must go through without raising
    dl_distance(u'abc', u'abc')

    with pytest.raises(TypeError) as raised:
        dl_distance(b'abc', b'abc')

    message = str(raised.value)
    assert 'expected' in message
Esempio n. 24
0
def calc_lev_dist(a, b):
    """Return ``py_jellyfish.damerau_levenshtein_distance(a, b)``.

    Despite the "lev" in the name, this delegates to the Damerau-Levenshtein
    routine of ``py_jellyfish``, not to a plain Levenshtein one.
    """
    dl_distance = py_jellyfish.damerau_levenshtein_distance
    return dl_distance(a, b)
Esempio n. 25
0
def load_data(row):
    """Build the flat feature list for one pair of texts.

    ``row[0]`` and ``row[1]`` are converted with ``str`` and compared in
    three ways: edit-distance scores on the lower-cased texts, the length of
    the common leading / trailing token run, and ``feat`` applied to pairs of
    token sets (entities, dependency roles, parts of speech, SVO triples).

    Returns one list: all ``num_*`` features (with the three distance scores
    in between), then all ``val_*`` features, then all ``rate_*`` features.
    The ``dobj`` count is computed but, unlike its ``val``/``rate``
    counterparts, is not part of the returned list.
    """

    def _dep_tokens(doc, dep_label):
        # lower-cased texts of the tokens whose dependency label matches
        return {tok.lower_ for tok in doc if tok.dep_ == dep_label}

    def _pos_tokens(doc, pos_label):
        # lower-cased texts of the tokens whose part-of-speech tag matches
        return {tok.lower_ for tok in doc if tok.pos_ == pos_label}

    def _common_run(tokens1, tokens2):
        # Count position-wise matching tokens until the first mismatch and
        # add up the weights of the matched tokens of the first sequence.
        matched, weight_sum = 0, 0.
        for left, right in zip(tokens1, tokens2):
            if left != right and not match_rating_comparison(left, right):
                break
            matched += 1
            weight_sum += weights.get(left, 0)
        return matched, weight_sum

    def _apply3(func, triple):
        # apply func to the first three items of a triple
        return func(triple[0]), func(triple[1]), func(triple[2])

    text1 = str(row[0])
    text2 = str(row[1])
    lowered1 = text1.lower()
    lowered2 = text2.lower()

    lev_dist = Levenshtein.distance(lowered1, lowered2)
    jar_dist = jaro_distance(lowered1, lowered2)
    dam_dist = damerau_levenshtein_distance(lowered1, lowered2)

    q1 = parser(text1)
    q2 = parser(text2)

    # entities compared by (lower-cased) label
    ent_labels1 = {ent.label_.lower() for ent in q1.ents}
    ent_labels2 = {ent.label_.lower() for ent in q2.ents}
    num_ent, val_ent, rate_ent = feat(ent_labels1, ent_labels2)

    # entities compared by their space-joined token texts
    ent_texts1 = {' '.join(tok.orth_ for tok in ent) for ent in q1.ents}
    ent_texts2 = {' '.join(tok.orth_ for tok in ent) for ent in q2.ents}
    num_ent2, val_ent2, rate_ent2 = feat(ent_texts1, ent_texts2)

    # common run from the start, then from the end, ignoring punctuation
    words1 = [tok.lower_ for tok in q1 if tok.pos_ != 'PUNCT']
    words2 = [tok.lower_ for tok in q2 if tok.pos_ != 'PUNCT']
    num_for, val_for = _common_run(words1, words2)
    num_clean2_rev, val_clean2_rev = _common_run(reversed(words1),
                                                 reversed(words2))

    # dependency-role based sets
    num_sub, val_sub, rate_sub = feat(_dep_tokens(q1, 'nsubj'),
                                      _dep_tokens(q2, 'nsubj'))
    num_root, val_root, rate_root = feat(_dep_tokens(q1, 'ROOT'),
                                         _dep_tokens(q2, 'ROOT'))
    num_advmod, val_advmod, rate_advmod = feat(_dep_tokens(q1, 'advmod'),
                                               _dep_tokens(q2, 'advmod'))
    num_advcl, val_advcl, rate_advcl = feat(_dep_tokens(q1, 'advcl'),
                                            _dep_tokens(q2, 'advcl'))
    num_aux, val_aux, rate_aux = feat(_dep_tokens(q1, 'aux'),
                                      _dep_tokens(q2, 'aux'))
    # num_dobj is not included in the returned features
    num_dobj, val_dobj, rate_dobj = feat(_dep_tokens(q1, 'dobj'),
                                         _dep_tokens(q2, 'dobj'))

    # part-of-speech based sets
    num_noun, val_noun, rate_noun = feat(_pos_tokens(q1, 'NOUN'),
                                         _pos_tokens(q2, 'NOUN'))
    num_verb, val_verb, rate_verb = feat(_pos_tokens(q1, 'VERB'),
                                         _pos_tokens(q2, 'VERB'))
    num_adv, val_adv, rate_adv = feat(_pos_tokens(q1, 'ADV'),
                                      _pos_tokens(q2, 'ADV'))

    # subject-verb-object triples: lower-case first, then lemmatize
    lower_svo1 = {_apply3(lambda part: part.lower(), svo)
                  for svo in findSVOs(q1)}
    lower_svo2 = {_apply3(lambda part: part.lower(), svo)
                  for svo in findSVOs(q2)}
    set_svo1 = {_apply3(wnl.lemmatize, svo) for svo in lower_svo1}
    set_svo2 = {_apply3(wnl.lemmatize, svo) for svo in lower_svo2}
    num_svo, val_svo, rate_svo = feat(set_svo1, set_svo2)

    # the three components of the triples, compared separately
    subjects1 = {svo[0] for svo in set_svo1}
    verbs1 = {svo[1] for svo in set_svo1}
    objects1 = {svo[2] for svo in set_svo1}
    subjects2 = {svo[0] for svo in set_svo2}
    verbs2 = {svo[1] for svo in set_svo2}
    objects2 = {svo[2] for svo in set_svo2}

    num_s, val_s, rate_s = feat(subjects1, subjects2)
    num_v, val_v, rate_v = feat(verbs1, verbs2)
    num_o, val_o, rate_o = feat(objects1, objects2)

    count_features = [
        num_ent, num_ent2, num_clean2_rev, num_for,
        lev_dist, jar_dist, dam_dist,
        num_sub, num_root, num_advmod, num_advcl, num_aux,
        num_noun, num_verb, num_adv,
        num_svo, num_s, num_v, num_o,
    ]
    value_features = [
        val_ent, val_ent2, val_clean2_rev, val_for,
        val_sub, val_root, val_advmod, val_advcl, val_aux, val_dobj,
        val_noun, val_verb, val_adv,
        val_svo, val_s, val_v, val_o,
    ]
    rate_features = [
        rate_ent, rate_ent2,
        rate_sub, rate_root, rate_advmod, rate_advcl, rate_aux, rate_dobj,
        rate_noun, rate_verb, rate_adv,
        rate_svo, rate_s, rate_v, rate_o,
    ]

    return count_features + value_features + rate_features
Esempio n. 26
0
     if i==2:
         if len(ground_truth_t)>2:
             three_ahead_pred.append(y_t)
             three_ahead_gt.append(ground_truth_t[2])
     if prediction == '!': # end of case was just predicted, therefore, stop predicting further into the future
         print('! predicted, end case')
         break
     predicted += prediction
 output = []
 if len(ground_truth)>0:
     output.append(caseid)
     output.append(prefix_size)
     output.append(unicode(ground_truth).encode("utf-8"))
     output.append(unicode(predicted).encode("utf-8"))
     output.append(1 - distance.nlevenshtein(predicted, ground_truth))
     dls = 1 - (damerau_levenshtein_distance(unicode(predicted), unicode(ground_truth)) / max(len(predicted),len(ground_truth)))
     if dls<0:
         dls=0 # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the default character encoding of the operating system caused it to be negative, this should never be the case
     output.append(dls)
     output.append(1 - distance.jaccard(predicted, ground_truth))
     output.append('; '.join(str(x) for x in ground_truth_t))
     output.append('; '.join(str(x) for x in predicted_t))
     if len(predicted_t)>len(ground_truth_t): # if predicted more events than length of case, only use needed number of events for time evaluation
         predicted_t = predicted_t[:len(ground_truth_t)]
     if len(ground_truth_t)>len(predicted_t): # if predicted less events than length of case, put 0 as placeholder prediction
         predicted_t.extend(range(len(ground_truth_t)-len(predicted_t)))
     if len(ground_truth_t)>0 and len(predicted_t)>0:
         output.append('')
         output.append(metrics.mean_absolute_error([ground_truth_t[0]], [predicted_t[0]]))
         #output.append(metrics.median_absolute_error([ground_truth_t[0]], [predicted_t[0]]))
     else:
def run_experiments(log_identificator, formula_type, rnn_type):
    """Predict trace suffixes with a formula-guided beam search and log results.

    Settings (event log name, model paths, declare model path, beam size,
    prefix-size range and formula) are obtained from ``activate_settings``.
    For every prefix size in ``range(prefix_size_pred_from,
    prefix_size_pred_to)`` the traces returned by
    ``select_declare_verified_traces`` are cut to that prefix, the loaded
    model is used to extend the prefix step by step with a beam search, and
    one CSV row per trace is written to
    ``output_files/final_experiments/results/LTL/<eventlog>_<rnn_type>.csv``.

    Each row holds: prefix length, ground truth suffix, predicted suffix,
    Levenshtein / Damerau-Levenshtein / Jaccard similarities, ground truth
    remaining time, predicted time, an empty placeholder (the "RMSE" column),
    MAE, median AE, and the ground truth / predicted group suffixes with
    their Levenshtein similarity.

    Args:
        log_identificator: forwarded to ``activate_settings``.
        formula_type: forwarded to ``activate_settings``.
        rnn_type: ``"CF"`` or ``"CFR"``; selects which model file is loaded
            and is used in the output file name.

    Raises:
        ValueError: if ``rnn_type`` is neither ``"CF"`` nor ``"CFR"``.
    """
    eventlog, \
        path_to_model_file_cf, \
        path_to_model_file_cfr, \
        path_to_declare_model_file, \
        beam_size, \
        prefix_size_pred_from, \
        prefix_size_pred_to, \
        formula = activate_settings(log_identificator, formula_type)

    if rnn_type == "CF":
        path_to_model_file = path_to_model_file_cf
    elif rnn_type == "CFR":
        path_to_model_file = path_to_model_file_cfr
    else:
        # Previously this fell through and only failed later, at load_model(),
        # with an unbound-variable error after the data had been prepared.
        raise ValueError(
            "Unsupported rnn_type %r, expected 'CF' or 'CFR'" % (rnn_type,))

    start_time = time.time()

    # prepare the data
    lines, \
        lines_id, \
        lines_group, \
        lines_t, \
        lines_t2, \
        lines_t3, \
        lines_t4, \
        maxlen, \
        chars, \
        chars_group, \
        char_indices, \
        char_indices_group, \
        divisor, \
        divisor2, \
        divisor3, \
        predict_size, \
        target_indices_char, \
        target_indices_char_group,\
        target_char_indices, \
        target_char_indices_group = prepare_testing_data(eventlog)

    # find cycles and modify the probability functionality goes here
    stop_symbol_probability_amplifier_current = 1

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    # Get the predicted group symbol
    def get_symbol_group(predictions, vth_best=0):
        """Return the group symbol with the (vth_best + 1)-th highest score."""
        v = np.argsort(predictions)[len(predictions) - vth_best - 1]
        return target_indices_char_group[v]

    class NodePrediction:
        """One beam-search node: encoded input plus the trace predicted so far."""

        def __init__(self,
                     data,
                     trace_id,
                     crop_line,
                     crop_line_group,
                     crop_times,
                     tot_predicted_time,
                     probability_of=0):
            self.data = data  # encoded model input for this node
            self.trace_id = trace_id
            self.cropped_line = crop_line  # prefix + symbols predicted so far
            self.cropped_line_group = crop_line_group
            self.cropped_times = crop_times
            self.total_predicted_time = tot_predicted_time
            # accumulated log-probability of the predicted symbols
            self.probability_of = probability_of

    # make predictions
    # NOTE(review): 'wb' matches the Python 2 csv convention (this function
    # also relies on the Python 2 ``unicode`` builtin).
    with open(
            'output_files/final_experiments/results/LTL/%s_%s.csv' %
        (eventlog[:-4], rnn_type), 'wb') as csvfile:

        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        # headers for the new file
        spamwriter.writerow([
            "Prefix length", "Ground truth", "Predicted", "Levenshtein",
            "Damerau", "Jaccard", "Ground truth times", "Predicted times",
            "RMSE", "MAE", "Median AE", "Ground Truth Group",
            "Predicted Group", "Levenshtein Group"
        ])

        # make predictions for different prefix sizes as specified in 'shared variables'
        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):
            print(prefix_size)

            lines_s, \
                lines_id_s, \
                lines_group_s, \
                lines_t_s, \
                lines_t2_s, \
                lines_t3_s, \
                lines_t4_s = select_declare_verified_traces(path_to_declare_model_file,
                                                            lines,
                                                            lines_id,
                                                            lines_group,
                                                            lines_t,
                                                            lines_t2,
                                                            lines_t3,
                                                            lines_t4,
                                                            prefix_size)

            print("prefix size: " + str(prefix_size))
            print("formulas verified: " + str(len(lines_s)) + " out of : " +
                  str(len(lines)))
            counterr = 0
            for line, line_id, line_group, times, times2, times3, times4 in zip(
                    lines_s, lines_id_s, lines_group_s, lines_t_s, lines_t2_s,
                    lines_t3_s, lines_t4_s):
                times.append(0)
                cropped_line_id = line_id
                cropped_line = ''.join(line[:prefix_size])
                cropped_line_group = ''.join(line_group[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                cropped_times4 = times4[:prefix_size]

                if len(times2) < prefix_size:
                    continue  # make no prediction for this case, since this case has ended already

                # initialize root of the tree for beam search
                total_predicted_time_initialization = 0
                search_node_root = NodePrediction(
                    encode(cropped_line, cropped_line_group, cropped_times,
                           cropped_times3, maxlen, chars, chars_group,
                           char_indices, char_indices_group, divisor,
                           divisor2), cropped_line_id, cropped_line,
                    cropped_line_group, cropped_times4,
                    total_predicted_time_initialization)

                ground_truth = ''.join(line[prefix_size:prefix_size +
                                            predict_size])
                ground_truth_group = ''.join(
                    line_group[prefix_size:prefix_size + predict_size])
                # remaining time: last entry of times2 minus the entry at the
                # end of the prefix
                ground_truth_t = times2[prefix_size - 1]
                case_end_time = times2[len(times2) - 1]
                ground_truth_t = case_end_time - ground_truth_t

                # Queue entries are (-log_probability, node), so the most
                # probable node is popped first.
                # NOTE(review): on an exact probability tie the tuple
                # comparison falls through to the NodePrediction objects,
                # which define no ordering - confirm this is acceptable.
                queue_next_steps = PriorityQueue()
                queue_next_steps.put(
                    (-search_node_root.probability_of, search_node_root))

                queue_next_steps_future = PriorityQueue()
                start_of_the_cycle_symbol = " "
                found_sattisfying_constraint = False

                current_beam_size = beam_size
                current_prediction_premis = None

                for i in range(predict_size):
                    for k in range(current_beam_size):
                        if queue_next_steps.empty():
                            break

                        _, current_prediction_premis = queue_next_steps.get()

                        if not found_sattisfying_constraint:
                            if verify_formula_as_compliant(
                                    current_prediction_premis.cropped_line,
                                    formula, prefix_size):
                                # the formula verified and we can just finish the predictions
                                # beam size is 1 because predict only sequence of events
                                current_beam_size = 1
                                current_prediction_premis.probability_of = 0.0
                                # overwrite new queue
                                queue_next_steps_future = PriorityQueue()
                                found_sattisfying_constraint = True

                        enc = current_prediction_premis.data
                        temp_cropped_line = current_prediction_premis.cropped_line
                        y = model.predict(enc, verbose=0)  # make predictions
                        # split predictions into separate activity, group and time predictions
                        y_char = y[0][0]
                        y_group = y[1][0]
                        y_t = y[2][0][0]

                        if y_t < 0:
                            y_t = 0
                        # NOTE: cropped_times / cropped_times3 / cropped_times4
                        # are shared by all candidates of this trace, so every
                        # expanded node appends to the same lists.
                        cropped_times.append(y_t)

                        if not i == 0:
                            stop_symbol_probability_amplifier_current, start_of_the_cycle_symbol = \
                                amplify(temp_cropped_line)

                        # (legacy note) if not reached, function
                        # :choose_next_top_descendant: will backtrack
                        y_t = y_t * divisor3
                        cropped_times3.append(cropped_times3[-1] +
                                              timedelta(seconds=y_t))

                        for j in range(current_beam_size):
                            # j-th best activity symbol after amplification
                            temp_prediction = get_symbol_ampl(
                                y_char, target_indices_char,
                                target_char_indices, start_of_the_cycle_symbol,
                                stop_symbol_probability_amplifier_current, j)

                            temp_prediction_group = get_symbol_group(y_group)

                            # end of case was just predicted, therefore, stop predicting further into the future
                            if temp_prediction == '!':
                                if verify_formula_as_compliant(
                                        temp_cropped_line, formula,
                                        prefix_size):
                                    stop_symbol_probability_amplifier_current = 1
                                    print('! predicted, end case')
                                    queue_next_steps = PriorityQueue()
                                    break
                                else:
                                    continue

                            temp_cropped_line = current_prediction_premis.cropped_line + temp_prediction
                            temp_cropped_line_group = \
                                current_prediction_premis.cropped_line_group + temp_prediction_group

                            # adds a fake timestamp to the list
                            t = time.strptime(cropped_times4[-1],
                                              "%Y-%m-%d %H:%M:%S")
                            new_timestamp = datetime.fromtimestamp(
                                time.mktime(t)) + timedelta(0, 2000)
                            cropped_times4.append(
                                new_timestamp.strftime("%Y-%m-%d %H:%M:%S"))

                            temp_total_predicted_time = current_prediction_premis.total_predicted_time + y_t
                            temp_state_data = encode(
                                temp_cropped_line, temp_cropped_line_group,
                                cropped_times, cropped_times3, maxlen, chars,
                                chars_group, char_indices, char_indices_group,
                                divisor, divisor2)
                            # raw (non-amplified) score of the j-th best symbol
                            probability_this = np.sort(y_char)[len(y_char) -
                                                               1 - j]

                            temp = NodePrediction(
                                temp_state_data, cropped_line_id,
                                temp_cropped_line, temp_cropped_line_group,
                                cropped_times4, temp_total_predicted_time,
                                current_prediction_premis.probability_of +
                                np.log(probability_this))

                            queue_next_steps_future.put(
                                (-temp.probability_of, temp))
                            print('INFORMATION: ' + str(counterr) + ' ' +
                                  str(i) + ' ' + str(k) + ' ' + str(j) + ' ' +
                                  temp_cropped_line[prefix_size:] + "     " +
                                  str(temp.probability_of))

                    # candidates produced in this step become the next frontier
                    queue_next_steps = queue_next_steps_future
                    queue_next_steps_future = PriorityQueue()

                counterr += 1

                if current_prediction_premis is None:
                    print(
                        "Cannot find any trace that is compliant with formula given current beam size"
                    )
                    # stops processing the remaining traces of this prefix size
                    break

                output = []

                # The last node popped from the queue is taken as the
                # prediction. It cannot be None here because of the guard
                # above, so no empty-prediction fallback is needed.
                predicted = (
                    current_prediction_premis.cropped_line[prefix_size:])
                predicted_group = (current_prediction_premis.
                                   cropped_line_group[prefix_size:])
                total_predicted_time = current_prediction_premis.total_predicted_time

                if len(ground_truth) > 0:
                    output.append(prefix_size)
                    output.append(unicode(ground_truth).encode("utf-8"))
                    output.append(unicode(predicted).encode("utf-8"))
                    output.append(
                        1 - distance.nlevenshtein(predicted, ground_truth))
                    # float() forces true division: with plain ints and no
                    # ``from __future__ import division`` Python 2 would floor
                    # the ratio and the similarity would collapse to 0 or 1.
                    dls = 1 - (damerau_levenshtein_distance(
                        unicode(predicted), unicode(ground_truth)) /
                               float(max(len(predicted), len(ground_truth))))
                    # we encountered problems with Damerau-Levenshtein Similarity on some
                    # linux machines where the default character encoding of the operating system
                    # caused it to be negative, this should never be the case
                    if dls < 0:
                        dls = 0
                    output.append(dls)
                    output.append(1 -
                                  distance.jaccard(predicted, ground_truth))
                    output.append(ground_truth_t)
                    output.append(total_predicted_time)
                    output.append('')  # "RMSE" column is left empty
                    output.append(
                        metrics.mean_absolute_error([ground_truth_t],
                                                    [total_predicted_time]))
                    output.append(
                        metrics.median_absolute_error([ground_truth_t],
                                                      [total_predicted_time]))
                    output.append(unicode(ground_truth_group).encode("utf-8"))
                    output.append(unicode(predicted_group).encode("utf-8"))
                    output.append(1 - distance.nlevenshtein(
                        predicted_group, ground_truth_group))
                    spamwriter.writerow(output)

    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
Esempio n. 28
0
def test_damerau_levenshtein_distance(jf, s1, s2, value):
    value = int(value)
    assert jf.damerau_levenshtein_distance(s1, s2) == value
def _similarity_row(caseid, prefix_size, ground_truth, predicted):
    """Build one CSV result row comparing ``predicted`` with ``ground_truth``.

    The row matches the header written by ``test``: case id, prefix length,
    ground truth, prediction, then the Levenshtein, Damerau-Levenshtein and
    Jaccard similarities (each computed as ``1 - distance``).
    ``ground_truth`` must be non-empty so the Damerau-Levenshtein
    normalisation cannot divide by zero.
    """
    levenshtein_sim = 1 - distance.nlevenshtein(predicted, ground_truth)

    damerau_sim = 1 - (damerau_levenshtein_distance(predicted, ground_truth) /
                       max(len(predicted), len(ground_truth)))
    # Guard kept from the original code: never report a negative similarity.
    if damerau_sim < 0:
        damerau_sim = 0

    jaccard_sim = 1 - distance.jaccard(predicted, ground_truth)

    return [
        caseid,
        prefix_size,
        ground_truth,
        predicted,
        levenshtein_sim,
        damerau_sim,
        jaccard_sim,
    ]


def test(args, preprocess_manager):
    """Predict the next symbol for every test prefix and write a result CSV.

    The test set comes from ``preprocess_manager.create_test_set()`` and the
    model is loaded from
    ``<args.checkpoint_dir>model_<iteration_cross_validation>.h5``.
    For each prefix size from 2 up to (excluding) the maximum sequence
    length, every case whose prefix does not already contain the
    end-of-case symbol ``'!'`` and that still has a ground truth symbol is
    extended by ``predict_size`` (1) predicted symbols. One row per case is
    written to ``<args.result_dir><data set>__<args.task>_<fold>.csv``.

    Args:
        args: settings object; the attributes read here are
            ``batch_size_test``, ``result_dir``, ``task``, ``checkpoint_dir``
            and ``data_set``.
        preprocess_manager: provides the test set, the input encoding
            (``encode_test_set`` / ``encode_test_set_add``), ``getSymbol``
            and the current cross-validation fold.
    """
    batch_size = args.batch_size_test
    result_dir = args.result_dir
    task = args.task

    has_additional = preprocess_manager.num_features_additional > 0

    # create_test_set() returns one extra element (the additional attributes)
    # when the log has additional features. The two trailing feature counts
    # are not needed in this function.
    if has_additional:
        lines, caseids, lines_add, sequence_max_length, _, _ = \
            preprocess_manager.create_test_set()
    else:
        lines, caseids, sequence_max_length, _, _ = \
            preprocess_manager.create_test_set()
        lines_add = None

    model = keras.models.load_model(
        '%smodel_%s.h5' %
        (args.checkpoint_dir, preprocess_manager.iteration_cross_validation))

    predict_size = 1
    data_set_name = args.data_set.split('.csv')[0]
    result_path = (result_dir + data_set_name + "__" + task + "_%d%s" %
                   (preprocess_manager.iteration_cross_validation, ".csv"))

    def encode_prefix(cropped_line, cropped_line_add):
        """Encode the current prefix with the encoder matching the data."""
        if has_additional:
            input_vec, _, _ = preprocess_manager.encode_test_set_add(
                args, cropped_line, cropped_line_add, batch_size)
            return input_vec
        return preprocess_manager.encode_test_set(cropped_line, batch_size)

    # newline='' is what the csv module expects for files it writes to, and
    # an explicit encoding keeps the output independent of the locale.
    with open(result_path, 'w', newline='', encoding='utf-8') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=';',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "CaseID", "Prefix length", "Groud truth", "Predicted",
            "Levenshtein", "Damerau", "Jaccard"
        ])

        for prefix_size in range(2, sequence_max_length):
            util.llprint("\nPrefix size: %d\n" % prefix_size)

            # Same iteration as before for both data layouts; cases without
            # additional attributes simply carry None in the third slot.
            if has_additional:
                cases = zip(lines, caseids, lines_add)
            else:
                cases = ((line, caseid, None)
                         for line, caseid in zip(lines, caseids))

            for line, caseid, line_add in cases:
                cropped_line = ''.join(line[:prefix_size])

                # the case already ended inside the prefix
                if '!' in cropped_line:
                    continue

                ground_truth = ''.join(line[prefix_size:prefix_size +
                                            predict_size])
                # nothing left to predict, so nothing to evaluate or write
                if not ground_truth:
                    continue

                cropped_line_add = (line_add[:prefix_size]
                                    if has_additional else None)
                predicted = ''

                # predict at most as many symbols as the ground truth holds
                for _ in range(min(predict_size, len(ground_truth))):
                    input_vec = encode_prefix(cropped_line, cropped_line_add)
                    y = model.predict(input_vec, verbose=0)
                    y_char = y[0][:]
                    prediction = preprocess_manager.getSymbol(y_char)
                    cropped_line += prediction
                    predicted += prediction

                    if prediction == '!':
                        print('! predicted, end case')
                        break

                # Write the strings themselves: encoding them to bytes first
                # made the csv writer emit their "b'...'" repr on Python 3.
                spamwriter.writerow(
                    _similarity_row(caseid, prefix_size, ground_truth,
                                    predicted))