Example #1
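# --- added note, not part of the original snippets ---
# These examples come from a Python 2 / Keras 1.x-era project and omit their
# imports.  A best-guess import block for the standard libraries they rely on
# (project-specific names such as get_unicode_from_int, shared_variables,
# create_checkpoints_path, TrainCF/TrainCFRT and the globals eventlog and
# folds are defined elsewhere in that project):
#
# import copy
# import csv
# import time
# from collections import Counter
# from datetime import datetime
# from itertools import izip
#
# import numpy as np
# import tensorflow as tf
# from keras.models import Model
# from keras.layers import Input, Dense, LSTM
# from keras.layers.normalization import BatchNormalization
# from keras.optimizers import Nadam
# from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
# from keras.backend.tensorflow_backend import set_session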
def prepare_testing_data(eventlog):
    csvfile = open('../data/%s' % eventlog, 'r')
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    next(spamreader, None)  # skip the headers

    lastcase = ''
    line = ''
    first_line = True
    lines = []
    timeseqs = []  # relative time since previous event
    timeseqs2 = []  # relative time since case start
    timeseqs3 = []  # absolute time of previous event
    times = []
    times2 = []
    times3 = []
    numlines = 0
    casestarttime = None
    lasteventtime = None

    for row in spamreader:
        t = time.strptime(row[2], "%Y/%m/%d %H:%M:%S")
        if row[0] != lastcase:  # check if new case is starting
            casestarttime = t
            lasteventtime = t
            lastcase = row[0]
            if not first_line:  # add case to list of cases
                lines.append(line)
                timeseqs.append(times)
                timeseqs2.append(times2)
                timeseqs3.append(times3)
            line = ''  # reinitialize case variables, because new case is starting
            times = []
            times2 = []
            times3 = []
            numlines += 1
        line += get_unicode_from_int(row[1])  # add unicode representation to case variable
        timesincelastevent = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(lasteventtime))
        timesincecasestart = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(casestarttime))
        # midnight = datetime.fromtimestamp(time.mktime(t)).replace(hour=0, minute=0, second=0, microsecond=0)
        # timesincemidnight = datetime.fromtimestamp(time.mktime(t)) - midnight
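        # 86400 * days + seconds converts a timedelta to whole seconds
        # (equivalent to int(td.total_seconds()), ignoring microseconds)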
        timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
        timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
        times.append(timediff)
        times2.append(timediff2)
        times3.append(datetime.fromtimestamp(time.mktime(t)))
        lasteventtime = t
        first_line = False

    # add last case
    lines.append(line)
    timeseqs.append(times)
    timeseqs2.append(times2)
    timeseqs3.append(times3)
    numlines += 1

    divisor = np.mean([item for sublist in timeseqs for item in sublist])
    print('divisor: {}'.format(divisor))
    divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])
    print('divisor2: {}'.format(divisor2))
    divisor3 = np.mean(map(lambda x: np.mean(map(lambda y: x[len(x) - 1] - y, x)), timeseqs2))
    print('divisor3: {}'.format(divisor3))

    elems_per_fold = int(round(numlines / 3))
    fold1and2lines = lines[:2 * elems_per_fold]
    fold1and2lines = map(lambda x: x + '!', fold1and2lines)
    maxlen = max(map(lambda x: len(x), fold1and2lines))

    chars = map(lambda x: set(x), fold1and2lines)
    chars = list(set().union(*chars))
    chars.sort()
    target_chars = copy.copy(chars)
    chars.remove('!')
    print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    target_char_indices = dict((c, i) for i, c in enumerate(target_chars))
    target_indices_char = dict((i, c) for i, c in enumerate(target_chars))
    print(indices_char)

    # we only need the third fold, because first two were used for training
    fold3 = lines[2 * elems_per_fold:]
    fold3_t = timeseqs[2 * elems_per_fold:]
    fold3_t2 = timeseqs2[2 * elems_per_fold:]
    fold3_t3 = timeseqs3[2 * elems_per_fold:]

    lines = fold3
    lines_t = fold3_t
    lines_t2 = fold3_t2
    lines_t3 = fold3_t3

    # set parameters
    predict_size = maxlen

    return lines, lines_t, lines_t2, lines_t3, maxlen, chars, char_indices, divisor, divisor2, divisor3, predict_size,\
        target_indices_char, target_char_indices
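# --- added sketch, not part of the original examples ---
# get_unicode_from_int() is used throughout but not defined in these snippets.
# A minimal Python 2 sketch of what such a helper could look like, assuming it
# only has to turn a small integer activity/resource id into one printable
# unicode character so that a whole case becomes a short string of symbols
# (the +161 offset is an assumption, chosen to skip ASCII control characters):
def get_unicode_from_int(value):
    # one character per event id; fine for ids far below the unicode range
    return unichr(int(value) + 161)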
Example #2
def train_with_data():

    lines = []
    lines_group = []
    timeseqs = []
    timeseqs2 = []
    lastcase = ''
    line = ''
    line_group = ''
    first_line = True
    times = []
    times2 = []
    numlines = 0
    casestarttime = None
    lasteventtime = None

    csvfile = open('../data/final_experiments/%s' % eventlog, 'r')
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    next(spamreader, None)  # skip the headers

    for row in spamreader:
        t = time.strptime(row[2], "%Y/%m/%d %H:%M:%S")
        if row[0] != lastcase:
            casestarttime = t
            lasteventtime = t
            lastcase = row[0]
            if not first_line:
                lines.append(line)
                lines_group.append(line_group)
                timeseqs.append(times)
                timeseqs2.append(times2)
            line = ''
            line_group = ''
            times = []
            times2 = []
            numlines += 1
        line += get_unicode_from_int(row[1])
        line_group += get_unicode_from_int(row[3])
        timesincelastevent = datetime.fromtimestamp(
            time.mktime(t)) - datetime.fromtimestamp(
                time.mktime(lasteventtime))
        timesincecasestart = datetime.fromtimestamp(
            time.mktime(t)) - datetime.fromtimestamp(
                time.mktime(casestarttime))
        timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
        timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
        times.append(timediff)
        times2.append(timediff2)
        lasteventtime = t
        first_line = False

    # add last case
    lines.append(line)
    lines_group.append(line_group)
    timeseqs.append(times)
    timeseqs2.append(times2)
    numlines += 1

    divisor = np.mean([item for sublist in timeseqs for item in sublist])
    print('divisor: {}'.format(divisor))
    divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])
    print('divisor2: {}'.format(divisor2))

    elems_per_fold = int(round(numlines / 3))
    fold1 = lines[:elems_per_fold]
    fold1_group = lines_group[:elems_per_fold]
    fold1_t = timeseqs[:elems_per_fold]
    fold1_t2 = timeseqs2[:elems_per_fold]

    fold2 = lines[elems_per_fold:2 * elems_per_fold]
    fold2_group = lines_group[elems_per_fold:2 * elems_per_fold]
    fold2_t = timeseqs[elems_per_fold:2 * elems_per_fold]
    fold2_t2 = timeseqs2[elems_per_fold:2 * elems_per_fold]

    fold3 = lines[2 * elems_per_fold:]
    fold3_group = lines_group[2 * elems_per_fold:]
    fold3_t = timeseqs[2 * elems_per_fold:]
    fold3_t2 = timeseqs2[2 * elems_per_fold:]

    lines = fold1 + fold2
    lines_group = fold1_group + fold2_group
    lines_t = fold1_t + fold2_t
    lines_t2 = fold1_t2 + fold2_t2

    lines = map(lambda x: x + '!', lines)
    maxlen = max(map(lambda x: len(x), lines))

    chars = map(lambda x: set(x), lines)
    chars = list(set().union(*chars))
    chars.sort()
    target_chars = copy.copy(chars)
    chars.remove('!')
    print('total chars: {}, target chars: {}'.format(len(chars),
                                                     len(target_chars)))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    target_char_indices = dict((c, i) for i, c in enumerate(target_chars))

    # lines_group = map(lambda x: x+'!', lines_group)

    chars_group = map(lambda x: set(x), lines_group)
    chars_group = list(set().union(*chars_group))
    chars_group.sort()
    target_chars_group = copy.copy(chars_group)
    # chars_group.remove('!')
    print('total groups: {}, target groups: {}'.format(
        len(chars_group), len(target_chars_group)))
    char_indices_group = dict((c, i) for i, c in enumerate(chars_group))
    target_char_indices_group = dict(
        (c, i) for i, c in enumerate(target_chars_group))

    csvfile = open('../data/final_experiments/%s' % eventlog, 'r')
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    next(spamreader, None)  # skip the headers
    lastcase = ''
    line = ''
    line_group = ''
    first_line = True
    lines = []
    lines_group = []
    timeseqs = []
    timeseqs2 = []
    timeseqs3 = []
    timeseqs4 = []
    times = []
    times2 = []
    times3 = []
    times4 = []
    numlines = 0
    casestarttime = None
    lasteventtime = None
    for row in spamreader:
        t = time.strptime(row[2], "%Y/%m/%d %H:%M:%S")
        if row[0] != lastcase:
            casestarttime = t
            lasteventtime = t
            lastcase = row[0]
            if not first_line:
                lines.append(line)
                lines_group.append(line_group)
                timeseqs.append(times)
                timeseqs2.append(times2)
                timeseqs3.append(times3)
                timeseqs4.append(times4)
            line = ''
            line_group = ''
            times = []
            times2 = []
            times3 = []
            times4 = []
            numlines += 1
        line += get_unicode_from_int(row[1])
        line_group += get_unicode_from_int(row[3])
        timesincelastevent = datetime.fromtimestamp(
            time.mktime(t)) - datetime.fromtimestamp(
                time.mktime(lasteventtime))
        timesincecasestart = datetime.fromtimestamp(
            time.mktime(t)) - datetime.fromtimestamp(
                time.mktime(casestarttime))
        midnight = datetime.fromtimestamp(time.mktime(t)).replace(
            hour=0, minute=0, second=0, microsecond=0)
        timesincemidnight = datetime.fromtimestamp(time.mktime(t)) - midnight
        timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
        timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
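        # the next two features: seconds since midnight and day of the week (0 = Monday)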
        timediff3 = timesincemidnight.seconds
        timediff4 = datetime.fromtimestamp(time.mktime(t)).weekday()
        times.append(timediff)
        times2.append(timediff2)
        times3.append(timediff3)
        times4.append(timediff4)
        lasteventtime = t
        first_line = False

    # add last case
    lines.append(line)
    lines_group.append(line_group)
    timeseqs.append(times)
    timeseqs2.append(times2)
    timeseqs3.append(times3)
    timeseqs4.append(times4)
    numlines += 1

    elems_per_fold = int(round(numlines / 3))
    fold1 = lines[:elems_per_fold]
    fold1_group = lines_group[:elems_per_fold]
    fold1_t = timeseqs[:elems_per_fold]
    fold1_t2 = timeseqs2[:elems_per_fold]
    fold1_t3 = timeseqs3[:elems_per_fold]
    fold1_t4 = timeseqs4[:elems_per_fold]
    with open('output_files/folds/fold1.csv', 'wb') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        for row, timeseq in zip(fold1, fold1_t):
            spamwriter.writerow([
                unicode(s).encode("utf-8") + '#{}'.format(t)
                for s, t in zip(row, timeseq)
            ])

    fold2 = lines[elems_per_fold:2 * elems_per_fold]
    fold2_group = lines_group[elems_per_fold:2 * elems_per_fold]
    fold2_t = timeseqs[elems_per_fold:2 * elems_per_fold]
    fold2_t2 = timeseqs2[elems_per_fold:2 * elems_per_fold]
    fold2_t3 = timeseqs3[elems_per_fold:2 * elems_per_fold]
    fold2_t4 = timeseqs4[elems_per_fold:2 * elems_per_fold]
    with open('output_files/folds/fold2.csv', 'wb') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        for row, timeseq in zip(fold2, fold2_t):
            spamwriter.writerow([
                unicode(s).encode("utf-8") + '#{}'.format(t)
                for s, t in zip(row, timeseq)
            ])

    fold3 = lines[2 * elems_per_fold:]
    fold3_group = lines_group[2 * elems_per_fold:]
    fold3_t = timeseqs[2 * elems_per_fold:]
    fold3_t2 = timeseqs2[2 * elems_per_fold:]
    fold3_t3 = timeseqs3[2 * elems_per_fold:]
    fold3_t4 = timeseqs4[2 * elems_per_fold:]
    with open('output_files/folds/fold3.csv', 'wb') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        for row, timeseq in zip(fold3, fold3_t):
            spamwriter.writerow([
                unicode(s).encode("utf-8") + '#{}'.format(t)
                for s, t in zip(row, timeseq)
            ])

    lines = fold1 + fold2
    lines_group = fold1_group + fold2_group
    lines_t = fold1_t + fold2_t
    lines_t2 = fold1_t2 + fold2_t2
    lines_t3 = fold1_t3 + fold2_t3
    lines_t4 = fold1_t4 + fold2_t4

    step = 1
    sentences = []
    sentences_group = []
    softness = 0
    next_chars = []
    next_chars_group = []
    lines = map(lambda x: x + '!', lines)
    lines_group = map(lambda x: x + '!', lines_group)

    sentences_t = []
    sentences_t2 = []
    sentences_t3 = []
    sentences_t4 = []
    next_chars_t = []
    next_chars_t2 = []
    next_chars_t3 = []
    next_chars_t4 = []
    for line, line_group, line_t, line_t2, line_t3, line_t4 in zip(
            lines, lines_group, lines_t, lines_t2, lines_t3, lines_t4):
        for i in range(0, len(line), step):
            if i == 0:
                continue
            sentences.append(line[0:i])
            sentences_group.append(line_group[0:i])
            sentences_t.append(line_t[0:i])
            sentences_t2.append(line_t2[0:i])
            sentences_t3.append(line_t3[0:i])
            sentences_t4.append(line_t4[0:i])
            next_chars.append(line[i])
            next_chars_group.append(line_group[i])
            if i == len(line) - 1:  # special case: the end-of-case character has no time attributes
                next_chars_t.append(0)
                next_chars_t2.append(0)
                next_chars_t3.append(0)
                next_chars_t4.append(0)
            else:
                next_chars_t.append(line_t[i])
                next_chars_t2.append(line_t2[i])
                next_chars_t3.append(line_t3[i])
                next_chars_t4.append(line_t4[i])
    print('nb sequences:', len(sentences))

    print('Vectorization...')
    num_features = len(chars) + len(chars_group) + 5
    print('num features: {}'.format(num_features))
    print('MaxLen: ', maxlen)
    X = np.zeros((len(sentences), maxlen, num_features), dtype=np.float32)
    y_a = np.zeros((len(sentences), len(target_chars)), dtype=np.float32)
    y_g = np.zeros((len(sentences), len(target_chars_group)), dtype=np.float32)
    y_t = np.zeros((len(sentences)), dtype=np.float32)
    for i, sentence in enumerate(sentences):
        leftpad = maxlen - len(sentence)
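        # prefixes shorter than maxlen are left-padded: the prefix is written into
        # the last len(sentence) time steps, so every sequence ends at index maxlen - 1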
        next_t = next_chars_t[i]
        sentence_group = sentences_group[i]
        sentence_t = sentences_t[i]
        sentence_t2 = sentences_t2[i]
        sentence_t3 = sentences_t3[i]
        sentence_t4 = sentences_t4[i]
        for t, char in enumerate(sentence):
            multiset_abstraction = Counter(sentence[:t + 1])  # computed but not used further
            for c in chars:
                if c == char:
                    X[i, t + leftpad, char_indices[c]] = 1
            for g in chars_group:
                if g == sentence_group[t]:
                    X[i, t + leftpad, len(chars) + char_indices_group[g]] = 1
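            # after the two one-hot blocks come five numeric features: the position
            # in the prefix (t + 1) and four time features scaled to roughly [0, 1]:
            # time since previous event, time since case start, time since midnight,
            # and day of the week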
            X[i, t + leftpad, len(chars) + len(chars_group)] = t + 1
            X[i, t + leftpad, len(chars) + len(chars_group) + 1] = sentence_t[t] / divisor
            X[i, t + leftpad, len(chars) + len(chars_group) + 2] = sentence_t2[t] / divisor2
            X[i, t + leftpad, len(chars) + len(chars_group) + 3] = sentence_t3[t] / 86400
            X[i, t + leftpad, len(chars) + len(chars_group) + 4] = sentence_t4[t] / 7
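        # targets use label smoothing: the true next activity/group gets probability
        # 1 - softness and every other class softness / (num_classes - 1); with
        # softness = 0 this reduces to a plain one-hot target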
        for c in target_chars:
            if c == next_chars[i]:
                y_a[i, target_char_indices[c]] = 1 - softness
            else:
                y_a[i, target_char_indices[c]] = softness / (len(target_chars) - 1)
        for g in target_chars_group:
            if g == next_chars_group[i]:
                y_g[i, target_char_indices_group[g]] = 1 - softness
            else:
                y_g[i, target_char_indices_group[g]] = softness / (len(target_chars_group) - 1)
        y_t[i] = next_t / divisor
        np.set_printoptions(threshold=np.nan)  # debug print setting; does not affect training

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    set_session(tf.Session(config=config))

    # build the model:
    print('Build model...')
    main_input = Input(shape=(maxlen, num_features), name='main_input')
    # train a 2-layer LSTM with one shared layer
    # the shared layer
    l1 = LSTM(100,
              consume_less='gpu',
              init='glorot_uniform',
              return_sequences=True,
              dropout_W=0.2)(main_input)
    b1 = BatchNormalization()(l1)

    # the layer specialized in activity prediction
    l2_1 = LSTM(100,
                consume_less='gpu',
                init='glorot_uniform',
                return_sequences=False,
                dropout_W=0.2)(b1)
    b2_1 = BatchNormalization()(l2_1)

    # the layer specialized in time prediction
    l2_2 = LSTM(100,
                consume_less='gpu',
                init='glorot_uniform',
                return_sequences=False,
                dropout_W=0.2)(b1)
    b2_2 = BatchNormalization()(l2_2)

    # the layer specialized in resource prediction
    l2_3 = LSTM(100,
                consume_less='gpu',
                init='glorot_uniform',
                return_sequences=False,
                dropout_W=0.2)(b1)
    b2_3 = BatchNormalization()(l2_3)

    act_output = Dense(len(target_chars),
                       activation='softmax',
                       init='glorot_uniform',
                       name='act_output')(b2_1)
    group_output = Dense(len(target_chars_group),
                         activation='softmax',
                         init='glorot_uniform',
                         name='group_output')(b2_3)
    time_output = Dense(1, init='glorot_uniform', name='time_output')(b2_2)

    model = Model(input=[main_input],
                  output=[act_output, group_output, time_output])
    opt = Nadam(lr=0.002,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=1e-08,
                schedule_decay=0.004,
                clipvalue=3)
    model.compile(loss={
        'act_output': 'categorical_crossentropy',
        'group_output': 'categorical_crossentropy',
        'time_output': 'mae'
    },
                  optimizer=opt)
    early_stopping = EarlyStopping(monitor='val_loss', patience=42)
    path_to_model = 'output_files/final_experiments/models/CFR/' + eventlog[:-4] + \
                    '/model_{epoch:02d}-{val_loss:.2f}.h5'
    model_checkpoint = ModelCheckpoint(path_to_model,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   epsilon=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    model.fit(X, {
        'act_output': y_a,
        'time_output': y_t,
        'group_output': y_g
    },
              validation_split=0.2,
              verbose=2,
              callbacks=[early_stopping, model_checkpoint, lr_reducer],
              batch_size=maxlen,
              nb_epoch=300)
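# --- added sketch, not part of the original examples ---
# The nested loop above turns every case into one training sample per prefix:
# the case string is closed with '!', each proper prefix becomes an input and
# the symbol right after it is the target.  A tiny illustration of the idea:
def _toy_prefixes(case):
    case += '!'  # end-of-case delimiter, as in the snippets above
    return [(case[:i], case[i]) for i in range(1, len(case))]

# _toy_prefixes('abc') -> [('a', 'b'), ('ab', 'c'), ('abc', '!')]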
Example #3
def prepare_testing_data(eventlog):
    csvfile = open('../data/final_experiments/%s' % eventlog, 'r')
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    next(spamreader, None)  # skip the headers

    lastcase = ''
    line = ''
    line_group = ''
    first_line = True
    lines_id = []
    lines = []
    lines_group = []
    timeseqs = []  # relative time since previous event
    timeseqs2 = []  # relative time since case start
    timeseqs3 = []  # absolute time of previous event
    timeseqs4 = []  # absolute time of event as a string
    times = []
    times2 = []
    times3 = []
    times4 = []
    numlines = 0
    casestarttime = None
    lasteventtime = None

    for row in spamreader:
        t = time.strptime(row[2], "%Y-%m-%d %H:%M:%S")
        if row[0] != lastcase:
            casestarttime = t
            lasteventtime = t
            lastcase = row[0]
            if not first_line:
                lines.append(line)
                lines_group.append(line_group)
                timeseqs.append(times)
                timeseqs2.append(times2)
                timeseqs3.append(times3)
                timeseqs4.append(times4)
            lines_id.append(lastcase)
            line = ''
            line_group = ''
            times = []
            times2 = []
            times3 = []
            times4 = []
            numlines += 1
        line += get_unicode_from_int(row[1])
        line_group += get_unicode_from_int(row[3])
        timesincelastevent = datetime.fromtimestamp(
            time.mktime(t)) - datetime.fromtimestamp(
                time.mktime(lasteventtime))
        timesincecasestart = datetime.fromtimestamp(
            time.mktime(t)) - datetime.fromtimestamp(
                time.mktime(casestarttime))
        timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
        timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
        times.append(timediff)
        times2.append(timediff2)
        times3.append(datetime.fromtimestamp(time.mktime(t)))
        times4.append(row[2])
        lasteventtime = t
        first_line = False

    # add last case
    lines.append(line)
    lines_group.append(line_group)
    timeseqs.append(times)
    timeseqs2.append(times2)
    timeseqs3.append(times3)
    timeseqs4.append(times4)
    numlines += 1

    divisor = np.mean([item for sublist in timeseqs for item in sublist])
    divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])
    divisor3 = np.mean(
        map(lambda x: np.mean(map(lambda y: x[len(x) - 1] - y, x)), timeseqs2))

    elems_per_fold = int(round(numlines / 3))

    fold1and2lines = lines[:2 * elems_per_fold]
    fold1and2lines = map(lambda x: x + '!', fold1and2lines)
    maxlen = max(map(lambda x: len(x), fold1and2lines))
    chars = map(lambda x: set(x), fold1and2lines)
    chars = list(set().union(*chars))
    chars.sort()
    target_chars = copy.copy(chars)
    chars.remove('!')
    char_indices = dict((c, i) for i, c in enumerate(chars))
    target_char_indices = dict((c, i) for i, c in enumerate(target_chars))
    target_indices_char = dict((i, c) for i, c in enumerate(target_chars))

    fold1and2lines_group = lines_group[:2 * elems_per_fold]
    # fold1and2lines_group = map(lambda x: x + '!', fold1and2lines_group)
    chars_group = map(lambda x: set(x), fold1and2lines_group)
    chars_group = list(set().union(*chars_group))
    chars_group.sort()
    target_chars_group = copy.copy(chars_group)
    # chars_group.remove('!')
    char_indices_group = dict((c, i) for i, c in enumerate(chars_group))
    target_char_indices_group = dict(
        (c, i) for i, c in enumerate(target_chars_group))
    target_indices_char_group = dict(
        (i, c) for i, c in enumerate(target_chars_group))

    # we only need the third fold, because first two were used for training
    fold3 = lines[2 * elems_per_fold:]
    fold3_id = lines_id[2 * elems_per_fold:]
    fold3_group = lines_group[2 * elems_per_fold:]
    fold3_t = timeseqs[2 * elems_per_fold:]
    fold3_t2 = timeseqs2[2 * elems_per_fold:]
    fold3_t3 = timeseqs3[2 * elems_per_fold:]
    fold3_t4 = timeseqs4[2 * elems_per_fold:]

    lines = fold3
    lines_id = fold3_id
    lines_group = fold3_group
    lines_t = fold3_t
    lines_t2 = fold3_t2
    lines_t3 = fold3_t3
    lines_t4 = fold3_t4

    # set parameters
    predict_size = maxlen

    return lines, \
        lines_id, \
        lines_group, \
        lines_t, \
        lines_t2, \
        lines_t3, \
        lines_t4, \
        maxlen, \
        chars, \
        chars_group, \
        char_indices, \
        char_indices_group, \
        divisor, \
        divisor2, \
        divisor3, \
        predict_size, \
        target_indices_char, \
        target_indices_char_group, \
        target_char_indices, \
        target_char_indices_group
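# --- added sketch, not part of the original examples ---
# divisor3 above is computed with nested map/lambda calls; an equivalent, more
# readable sketch of the same quantity (the remaining time until case end,
# averaged first within each case and then across cases):
def _mean_remaining_time(timeseqs2):
    per_case = [np.mean([case[-1] - t for t in case]) for case in timeseqs2]
    return np.mean(per_case)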
Example #4
    def train(log_name, models_folder, use_old_model):
        lines = []
        lines_group = []
        lines_time = []
        timeseqs = []
        timeseqs2 = []
        lastcase = ''
        line = ''
        line_group = ''
        line_time = ''
        first_line = True
        times = []
        times2 = []
        difflist = []
        numlines = 0
        casestarttime = None
        lasteventtime = None

        r = 3

        csvfile = open(shared_variables.data_folder + '%s.csv' % log_name, 'r')
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(spamreader, None)  # skip the headers

        for row in spamreader:
            t1 = time.strptime(row[2], "%Y-%m-%d %H:%M:%S")
            if row[0] != lastcase:
                lastevent = t1
                lastcase = row[0]
            if row[1] != '0':
                t2 = datetime.fromtimestamp(time.mktime(t1)) - datetime.fromtimestamp(time.mktime(lastevent))
                tdiff = 86400 * t2.days + t2.seconds
            else:
                tdiff = 0
            difflist.append(tdiff)
            lastevent = t1

        difflist = [int(i) for i in difflist]
        maxdiff = max(difflist)
        difflist[np.argmax(difflist)] -= 1e-8
        diff = maxdiff / r
        # difflist.sort()
        # mediandiff = np.percentile(difflist, 50)
        # diff = mediandiff / r

        # print(maxdiff)
        # print(mediandiff)
        csvfile.seek(0)
        next(spamreader, None)  # skip the headers
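        # Note (added): diff = maxdiff / r, so int(difflist[line_index] / diff)
        # below is meant to place every inter-event gap into one of r coarse
        # buckets; the largest gap was nudged slightly below maxdiff above so
        # that it does not end up in an extra bucket of its own.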

        line_index = 0
        for row in spamreader:
            t = time.strptime(row[2], "%Y-%m-%d %H:%M:%S")
            if row[0] != lastcase:
                casestarttime = t
                lasteventtime = t
                lastcase = row[0]
                if not first_line:
                    lines.append(line)
                    lines_group.append(line_group)
                    lines_time.append(line_time)
                    timeseqs.append(times)
                    timeseqs2.append(times2)
                line = ''
                line_group = ''
                line_time = ''
                times = []
                times2 = []
                numlines += 1
            line += get_unicode_from_int(row[1])
            line_group += get_unicode_from_int(row[3])

            # if (difflist[line_index] / diff) <= r:
            #     line_time += get_unicode_from_int(int(int(row[4]) / diff))
            # else:
            line_time += get_unicode_from_int(int(difflist[line_index] / diff))
            timesincelastevent = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(
                time.mktime(lasteventtime))
            timesincecasestart = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(
                time.mktime(casestarttime))
            timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
            timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
            times.append(timediff)
            times2.append(timediff2)
            lasteventtime = t
            first_line = False
            line_index += 1
        # add last case
        lines.append(line)
        lines_group.append(line_group)
        lines_time.append(line_time)
        timeseqs.append(times)
        timeseqs2.append(times2)
        numlines += 1

        divisor = np.max([item for sublist in timeseqs for item in sublist])
        print('divisor: {}'.format(divisor))
        divisor2 = np.max([item for sublist in timeseqs2 for item in sublist])
        print('divisor2: {}'.format(divisor2))

        elements_per_fold = int(round(numlines / 3))
        fold1 = lines[:elements_per_fold]
        fold1_group = lines_group[:elements_per_fold]
        fold1_time = lines_time[:elements_per_fold]

        fold2 = lines[elements_per_fold:2 * elements_per_fold]
        fold2_group = lines_group[elements_per_fold:2 * elements_per_fold]
        fold2_time = lines_time[elements_per_fold:2 * elements_per_fold]

        lines = fold1 + fold2
        lines_group = fold1_group + fold2_group
        lines_time = fold1_time + fold2_time

        lines = map(lambda x: x + '!', lines)
        maxlen = max(map(lambda x: len(x), lines))

        chars = map(lambda x: set(x), lines)
        chars = list(set().union(*chars))
        chars.sort()
        target_chars = copy.copy(chars)
        chars.remove('!')
        print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))
        char_indices = dict((c, i) for i, c in enumerate(chars))
        target_char_indices = dict((c, i) for i, c in enumerate(target_chars))

        # lines_group = map(lambda x: x+'!', lines_group)

        chars_group = map(lambda x: set(x), lines_group)
        chars_group = list(set().union(*chars_group))
        chars_group.sort()
        target_chars_group = copy.copy(chars_group)
        # chars_group.remove('!')
        print('total groups: {}, target groups: {}'.format(len(chars_group), len(target_chars_group)))
        char_indices_group = dict((c, i) for i, c in enumerate(chars_group))
        target_char_indices_group = dict((c, i) for i, c in enumerate(target_chars_group))

        chars_time = map(lambda x: set(x), lines_time)
        chars_time = list(set().union(*chars_time))
        chars_time.sort()
        target_chars_time = copy.copy(chars_time)

        print('total times: {}, target times: {}'.format(len(chars_time), len(target_chars_time)))
        char_indices_time = dict((c, i) for i, c in enumerate(chars_time))
        target_char_indices_time = dict((c, i) for i, c in enumerate(target_chars_time))

        csvfile = open(shared_variables.data_folder + '%s.csv' % log_name, 'r')
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(spamreader, None)  # skip the headers
        lastcase = ''
        line = ''
        line_group = ''
        line_time = ''
        first_line = True
        lines_id = []
        lines = []
        lines_group = []
        lines_time = []
        timeseqs = []  # relative time since previous event
        timeseqs2 = []  # relative time since case start
        timeseqs3 = []  # seconds since midnight
        timeseqs4 = []  # day of the week
        times = []
        times2 = []
        times3 = []
        times4 = []
        numlines = 0
        casestarttime = None
        lasteventtime = None
        line_index = 0
        for row in spamreader:
            t = time.strptime(row[2], "%Y-%m-%d %H:%M:%S")
            if row[0] != lastcase:
                casestarttime = t
                lasteventtime = t
                lastcase = row[0]
                if not first_line:
                    lines.append(line)
                    lines_group.append(line_group)
                    lines_time.append(line_time)
                    timeseqs.append(times)
                    timeseqs2.append(times2)
                    timeseqs3.append(times3)
                    timeseqs4.append(times4)
                lines_id.append(lastcase)
                line = ''
                line_group = ''
                line_time = ''  # fix: also reset the time-bucket string for the new case (as in the first pass above)
                times = []
                times2 = []
                times3 = []
                times4 = []
                numlines += 1
            line += get_unicode_from_int(row[1])
            line_group += get_unicode_from_int(row[3])
            line_time += get_unicode_from_int(int(difflist[line_index] / diff))
            timesincelastevent = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(
                time.mktime(lasteventtime))
            timesincecasestart = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(
                time.mktime(casestarttime))
            midnight = datetime.fromtimestamp(time.mktime(t)).replace(hour=0, minute=0, second=0, microsecond=0)
            timesincemidnight = datetime.fromtimestamp(time.mktime(t)) - midnight
            timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
            timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
            timediff3 = timesincemidnight.seconds
            timediff4 = datetime.fromtimestamp(time.mktime(t)).weekday()
            times.append(timediff)
            times2.append(timediff2)
            times3.append(timediff3)
            times4.append(timediff4)
            lasteventtime = t
            first_line = False
            line_index += 1

        # add last case
        lines.append(line)
        lines_group.append(line_group)
        lines_time.append(line_time)
        timeseqs.append(times)
        timeseqs2.append(times2)
        timeseqs3.append(times3)
        timeseqs4.append(times4)
        numlines += 1

        elements_per_fold = int(round(numlines / 3))

        lines = lines[:-elements_per_fold]
        lines_group = lines_group[:-elements_per_fold]
        lines_time = lines_time[:-elements_per_fold]
        lines_t = timeseqs[:-elements_per_fold]
        lines_t2 = timeseqs2[:-elements_per_fold]
        lines_t3 = timeseqs3[:-elements_per_fold]
        lines_t4 = timeseqs4[:-elements_per_fold]

        step = 1
        sentences = []
        sentences_group = []
        sentences_time = []
        softness = 0
        next_chars = []
        next_chars_group = []
        next_chars_time = []
        lines = map(lambda x: x + '!', lines)
        lines_group = map(lambda x: x + '!', lines_group)
        lines_time = map(lambda x: x + '!', lines_time)

        sentences_t = []
        sentences_t2 = []
        sentences_t3 = []
        sentences_t4 = []
        next_chars_t = []
        for line, line_group, line_time, line_t, line_t2, line_t3, line_t4 in izip(lines, lines_group, lines_time,
                                                                                   lines_t, lines_t2, lines_t3,
                                                                                   lines_t4):
            for i in range(0, len(line), step):
                if i == 0:
                    continue
                sentences.append(line[0: i])
                sentences_group.append(line_group[0:i])
                sentences_time.append(line_time[0:i])
                sentences_t.append(line_t[0:i])
                sentences_t2.append(line_t2[0:i])
                sentences_t3.append(line_t3[0:i])
                sentences_t4.append(line_t4[0:i])
                next_chars.append(line[i])
                next_chars_group.append(line_group[i])
                next_chars_time.append(line_time[i])
                if i == len(line) - 1:  # special case: the end-of-case character has no time attributes
                    next_chars_t.append(0)
                else:
                    next_chars_t.append(line_t[i])
        print('nb sequences:', len(sentences))

        print('Vectorization...')
        num_features = len(chars) + len(chars_group) + len(chars_time) + 5
        print('num features: {}'.format(num_features))
        print('MaxLen: ', maxlen)
        X = np.zeros((len(sentences), maxlen, num_features), dtype=np.float32)
        y_a = np.zeros((len(sentences), len(target_chars)), dtype=np.float32)
        y_g = np.zeros((len(sentences), len(target_chars_group)), dtype=np.float32)
        y_t = np.zeros((len(sentences)), dtype=np.float32)
        y_y = np.zeros((len(sentences), len(target_chars_time)), dtype=np.float32)
        for i, sentence in enumerate(sentences):
            leftpad = maxlen - len(sentence)
            next_t = next_chars_t[i]
            sentence_group = sentences_group[i]
            sentence_time = sentences_time[i]
            sentence_t = sentences_t[i]
            sentence_t2 = sentences_t2[i]
            sentence_t3 = sentences_t3[i]
            sentence_t4 = sentences_t4[i]
            for t, char in enumerate(sentence):
                for c in chars:
                    if c == char:
                        X[i, t + leftpad, char_indices[c]] = 1
                for g in chars_group:
                    if g == sentence_group[t]:
                        X[i, t + leftpad, len(chars) + char_indices_group[g]] = 1
                for y in chars_time:
                    if y == sentence_time[t]:
                        X[i, t + leftpad, len(chars) + len(chars_group) + char_indices_time[y]] = 1
                X[i, t + leftpad, len(chars) + len(chars_group) + len(chars_time)] = t + 1
                X[i, t + leftpad, len(chars) + len(chars_group) + len(chars_time) + 1] = sentence_t[t] / divisor
                X[i, t + leftpad, len(chars) + len(chars_group) + len(chars_time) + 2] = sentence_t2[t] / divisor2
                X[i, t + leftpad, len(chars) + len(chars_group) + len(chars_time) + 3] = sentence_t3[t] / 86400
                X[i, t + leftpad, len(chars) + len(chars_group) + len(chars_time) + 4] = sentence_t4[t] / 7
            for c in target_chars:
                if c == next_chars[i]:
                    y_a[i, target_char_indices[c]] = 1 - softness
                else:
                    y_a[i, target_char_indices[c]] = softness / (len(target_chars) - 1)
            for g in target_chars_group:
                if g == next_chars_group[i]:
                    y_g[i, target_char_indices_group[g]] = 1 - softness
                else:
                    y_g[i, target_char_indices_group[g]] = softness / (len(target_chars_group) - 1)
            for y in target_chars_time:
                if y == next_chars_time[i]:
                    y_y[i, target_char_indices_time[y]] = 1 - softness
                else:
                    y_y[i, target_char_indices_time[y]] = softness / (len(target_chars_time) - 1)
            y_t[i] = next_t / divisor

        for fold in range(folds):
            model = TrainCFRT._build_model(maxlen, num_features, target_chars, target_chars_time, target_chars_group,
                                           use_old_model)
            checkpoint_name = create_checkpoints_path(log_name, models_folder, fold, 'CFRT')
            TrainCFRT._train_model(model, checkpoint_name, X, y_a, y_t, y_y, y_g)
Example #5
    def train(log_name, models_folder, use_old_model):
        # TrainCF._load_dataset(log_name)

        lines = []  # list of all the activity sequences
        timeseqs = []  # time sequences (differences between consecutive events)
        timeseqs2 = []  # time sequences (differences between the current event and the first event of the case)

        # helper variables
        last_case = ''
        line = ''  # sequence of activities for one case
        first_line = True
        times = []
        times2 = []
        num_lines = 0
        case_start_time = None
        last_event_time = None

        csvfile = open(shared_variables.data_folder + '%s.csv' % log_name, 'r')
        csv_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(csv_reader, None)  # skip the headers

        for row in csv_reader:  # each row is "CaseID,ActivityID,CompleteTimestamp"
            t = time.strptime(row[2], "%Y-%m-%d %H:%M:%S")  # parse row[2] into a time.struct_time
            if row[0] != last_case:  # 'last_case' holds the case id of the previous row, so this detects a new case
                case_start_time = t
                last_event_time = t
                last_case = row[0]
                if not first_line:  # here we actually add the sequences to the lists
                    lines.append(line)
                    timeseqs.append(times)
                    timeseqs2.append(times2)
                line = ''
                times = []
                times2 = []
                num_lines += 1
            line += get_unicode_from_int(row[1])
            time_since_last_event = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(
                time.mktime(last_event_time))
            time_since_case_start = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(
                time.mktime(case_start_time))
            time_diff = 86400 * time_since_last_event.days + time_since_last_event.seconds
            time_diff2 = 86400 * time_since_case_start.days + time_since_case_start.seconds
            times.append(time_diff)
            times2.append(time_diff2)
            last_event_time = t
            first_line = False

        # add last case
        lines.append(line)
        timeseqs.append(times)
        timeseqs2.append(times2)
        num_lines += 1

        divisor = 1.0 * np.max([item for sublist in timeseqs for item in sublist])  # maximum time between events
        print('divisor: {}'.format(divisor))
        divisor2 = 1.0 * np.max([item for sublist in timeseqs2 for item in sublist])  # maximum time between the
        # current event and the start of its case
        print('divisor2: {}'.format(divisor2))

        # separate training data into 2(out of 3) parts
        elements_per_fold = int(round(num_lines / 3))

        many = sum(len(case) for case in lines)  # total number of events (computed but not used further)

        get_lengths = lambda x: [len(a) for a in x]
        print("mean length of the trace: ", np.mean(get_lengths(lines)))
        print("median length of the trace: ", np.median(get_lengths(lines)))
        print("number of traces: ", len(lines))

        fold1 = lines[:elements_per_fold]
        fold2 = lines[elements_per_fold:2 * elements_per_fold]
        lines = fold1 + fold2

        lines = map(lambda x: x + '!', lines)  # put delimiter symbol
        maxlen = max(map(lambda x: len(x), lines))  # find maximum line size

        # the next lines collect all distinct event characters and map them to integer indices
        chars = map(lambda x: set(x), lines)  # remove duplicate activities from each separate case
        chars = list(set().union(*chars))  # creates a list of all the unique activities in the data set
        chars.sort()  # sorts the chars in alphabetical order
        target_chars = copy.copy(chars)
        chars.remove('!')
        print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))
        char_indices = dict((c, i) for i, c in enumerate(chars))
        target_char_indices = dict((c, i) for i, c in enumerate(target_chars))

        csvfile = open(shared_variables.data_folder + '%s.csv' % log_name, 'r')
        csv_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(csv_reader, None)  # skip the headers
        last_case = ''
        line = ''
        first_line = True
        lines = []
        timeseqs = []
        timeseqs2 = []
        timeseqs3 = []
        timeseqs4 = []
        times = []
        times2 = []
        times3 = []
        times4 = []
        num_lines = 0
        case_start_time = None
        last_event_time = None
        for row in csv_reader:
            t = time.strptime(row[2], "%Y-%m-%d %H:%M:%S")
            # new case starts
            if row[0] != last_case:
                case_start_time = t
                last_event_time = t
                last_case = row[0]
                if not first_line:
                    lines.append(line)
                    timeseqs.append(times)
                    timeseqs2.append(times2)
                    timeseqs3.append(times3)
                    timeseqs4.append(times4)
                line = ''
                times = []
                times2 = []
                times3 = []
                times4 = []
                num_lines += 1
            line += get_unicode_from_int(row[1])
            time_since_last_event = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(
                time.mktime(last_event_time))
            time_since_case_start = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(
                time.mktime(case_start_time))
            midnight = datetime.fromtimestamp(time.mktime(t)).replace(hour=0, minute=0, second=0, microsecond=0)
            timesincemidnight = datetime.fromtimestamp(time.mktime(t)) - midnight
            time_diff = 86400 * time_since_last_event.days + time_since_last_event.seconds
            time_diff2 = 86400 * time_since_case_start.days + time_since_case_start.seconds
            timediff3 = timesincemidnight.seconds  # seconds elapsed since midnight
            timediff4 = datetime.fromtimestamp(time.mktime(t)).weekday()  # day of the week
            times.append(time_diff)
            times2.append(time_diff2)
            times3.append(timediff3)
            times4.append(timediff4)
            last_event_time = t
            first_line = False

        # add last case
        lines.append(line)
        timeseqs.append(times)
        timeseqs2.append(times2)
        timeseqs3.append(times3)
        timeseqs4.append(times4)
        num_lines += 1

        elements_per_fold = int(round(num_lines / 3))

        lines = lines[:-elements_per_fold]
        lines_t = timeseqs[:-elements_per_fold]
        lines_t2 = timeseqs2[:-elements_per_fold]
        lines_t3 = timeseqs3[:-elements_per_fold]
        lines_t4 = timeseqs4[:-elements_per_fold]

        step = 1
        sentences = []
        softness = 0
        next_chars = []
        lines = map(lambda x: x + '!', lines)

        sentences_t = []
        sentences_t2 = []
        sentences_t3 = []
        sentences_t4 = []
        next_chars_t = []
        for line, line_t, line_t2, line_t3, line_t4 in izip(lines, lines_t, lines_t2, lines_t3, lines_t4):
            for i in range(0, len(line), step):
                if i == 0:
                    continue

                # prefixes of increasing length: first symbol, then the first two, three, ...
                sentences.append(line[0: i])
                sentences_t.append(line_t[0:i])
                sentences_t2.append(line_t2[0:i])
                sentences_t3.append(line_t3[0:i])
                sentences_t4.append(line_t4[0:i])
                next_chars.append(line[i])
                if i == len(line) - 1:  # special case: the end-of-case character has no time attributes
                    next_chars_t.append(0)
                else:
                    next_chars_t.append(line_t[i])
        print('nb sequences:', len(sentences))
        print('Vectorization...')
        num_features = len(chars) + 5
        print('num features: {}'.format(num_features))
        X = np.zeros((len(sentences), maxlen, num_features), dtype=np.float32)
        y_a = np.zeros((len(sentences), len(target_chars)), dtype=np.float32)
        y_t = np.zeros((len(sentences)), dtype=np.float32)
        for i, sentence in enumerate(sentences):
            leftpad = maxlen - len(sentence)
            next_t = next_chars_t[i]
            sentence_t = sentences_t[i]
            sentence_t2 = sentences_t2[i]
            sentence_t3 = sentences_t3[i]
            sentence_t4 = sentences_t4[i]
            for t, char in enumerate(sentence):
                # multiset_abstraction = Counter(sentence[:t+1])
                for c in chars:
                    if c == char:  # this will encode present events to the right places
                        X[i, t + leftpad, char_indices[c]] = 1
                X[i, t + leftpad, len(chars)] = t + 1
                X[i, t + leftpad, len(chars) + 1] = sentence_t[t] / divisor
                X[i, t + leftpad, len(chars) + 2] = sentence_t2[t] / divisor2
                X[i, t + leftpad, len(chars) + 3] = sentence_t3[t] / 86400
                X[i, t + leftpad, len(chars) + 4] = sentence_t4[t] / 7
            for c in target_chars:
                if c == next_chars[i]:
                    y_a[i, target_char_indices[c]] = 1 - softness
                else:
                    y_a[i, target_char_indices[c]] = softness / (len(target_chars) - 1)
            y_t[i] = next_t / divisor

        for fold in range(folds):
            # model = build_model(max_length, num_features, max_activity_id)
            model = TrainCF._build_model(maxlen, num_features, target_chars, use_old_model)
            checkpoint_name = create_checkpoints_path(log_name, models_folder, fold, 'CF')
            TrainCF._train_model(model, checkpoint_name, X, y_a, y_t)
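        # --- added note, not part of the original examples ---
        # y_t holds the time until the next event divided by 'divisor', so a
        # prediction for this output would typically be rescaled by the same
        # divisor to get back to seconds.  The loop above trains 'folds'
        # independent models on the same training data, one checkpoint path each.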