def prepare_testing_data(eventlog):
    """Load an event log and return the held-out (third) fold plus encodings.

    Reads ``'../data/<eventlog>'`` as CSV (``,`` delimiter, ``|`` quote char),
    skipping the header row.  Per row, column 0 is used as the case id,
    column 1 is passed to ``get_unicode_from_int`` to obtain the activity
    symbol and column 2 is parsed as a ``%Y/%m/%d %H:%M:%S`` timestamp.
    Consecutive rows sharing a case id are folded into one case.

    Args:
        eventlog: File name of the log, relative to ``../data/``.

    Returns:
        A 13-tuple ``(lines, lines_t, lines_t2, lines_t3, maxlen, chars,
        char_indices, divisor, divisor2, divisor3, predict_size,
        target_indices_char, target_char_indices)`` where the first four
        entries are restricted to the cases after the first
        ``2 * elems_per_fold`` ones, and the alphabet / ``maxlen`` are derived
        from those first ``2 * elems_per_fold`` cases (with the ``'!'`` end
        marker appended).
    """
    lines = []      # one string of activity symbols per case
    timeseqs = []   # per case: seconds since the previous event
    timeseqs2 = []  # per case: seconds since the first event of the case
    timeseqs3 = []  # per case: event timestamps as datetime objects
    lastcase = ''
    line = ''
    times, times2, times3 = [], [], []
    first_line = True
    numlines = 0
    case_start = None
    last_event = None

    # The context manager guarantees the handle is released even if a row
    # fails to parse.
    with open('../data/%s' % eventlog, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(spamreader, None)  # skip the headers
        for row in spamreader:
            t = time.strptime(row[2], "%Y/%m/%d %H:%M:%S")
            event_time = datetime.fromtimestamp(time.mktime(t))
            if row[0] != lastcase:  # a new case starts on this row
                case_start = event_time
                last_event = event_time
                lastcase = row[0]
                if not first_line:  # flush the case that just ended
                    lines.append(line)
                    timeseqs.append(times)
                    timeseqs2.append(times2)
                    timeseqs3.append(times3)
                line = ''
                times, times2, times3 = [], [], []
                numlines += 1
            line += get_unicode_from_int(row[1])
            since_last = event_time - last_event
            since_start = event_time - case_start
            times.append(86400 * since_last.days + since_last.seconds)
            times2.append(86400 * since_start.days + since_start.seconds)
            times3.append(event_time)
            last_event = event_time
            first_line = False

    # Flush the last case (done unconditionally, as before).
    lines.append(line)
    timeseqs.append(times)
    timeseqs2.append(times2)
    timeseqs3.append(times3)
    # NOTE: numlines is incremented once per case and once more here, so it is
    # one larger than len(lines).  Kept on purpose: the fold boundaries below
    # are derived from it.
    numlines += 1

    divisor = np.mean([item for sublist in timeseqs for item in sublist])
    print('divisor: {}'.format(divisor))
    divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])
    print('divisor2: {}'.format(divisor2))
    # Mean over cases of the mean "time until the last event of the case".
    divisor3 = np.mean(
        [np.mean([seq[-1] - y for y in seq]) for seq in timeseqs2])
    print('divisor3: {}'.format(divisor3))

    elems_per_fold = int(round(numlines / 3))

    # Alphabet and maximum length come from the first two folds only.
    fold1and2lines = [x + '!' for x in lines[:2 * elems_per_fold]]
    maxlen = max(len(x) for x in fold1and2lines)
    chars = sorted(set().union(*fold1and2lines))
    target_chars = list(chars)  # targets keep the '!' end marker
    chars.remove('!')
    print('total chars: {}, target chars: {}'.format(
        len(chars), len(target_chars)))
    char_indices = {c: i for i, c in enumerate(chars)}
    indices_char = {i: c for i, c in enumerate(chars)}
    target_char_indices = {c: i for i, c in enumerate(target_chars)}
    target_indices_char = {i: c for i, c in enumerate(target_chars)}
    print(indices_char)

    # Only the remaining cases (third fold) are returned.
    lines_t = timeseqs[2 * elems_per_fold:]
    lines_t2 = timeseqs2[2 * elems_per_fold:]
    lines_t3 = timeseqs3[2 * elems_per_fold:]
    lines = lines[2 * elems_per_fold:]

    predict_size = maxlen

    return lines, lines_t, lines_t2, lines_t3, maxlen, chars, char_indices, \
        divisor, divisor2, divisor3, predict_size, \
        target_indices_char, target_char_indices
def train_with_data():
    """Vectorise the event log and fit the three-headed LSTM model.

    Reads ``'../data/final_experiments/<eventlog>'`` (``eventlog`` is a
    module-level name, not a parameter).  Per CSV row, column 0 is the case
    id, columns 1 and 3 are turned into activity / group symbols through
    ``get_unicode_from_int`` and column 2 is parsed as a
    ``%Y/%m/%d %H:%M:%S`` timestamp.

    Side effects:
        * writes ``output_files/folds/fold{1,2,3}.csv`` (one row per case,
          cells of the form ``<symbol>#<seconds since previous event>``);
        * configures a TensorFlow session limited to half of the GPU memory;
        * trains the model and lets ``ModelCheckpoint`` write ``.h5`` files
          below ``output_files/final_experiments/models/CFR/``.

    Returns:
        None.
    """
    import sys  # local import: only needed for np.set_printoptions below

    lines = []        # activity symbols, one string per case
    lines_group = []  # group symbols, one string per case
    timeseqs = []     # seconds since the previous event
    timeseqs2 = []    # seconds since the case start
    timeseqs3 = []    # seconds since midnight
    timeseqs4 = []    # weekday (datetime.weekday())
    lastcase = ''
    line = ''
    line_group = ''
    times, times2, times3, times4 = [], [], [], []
    first_line = True
    numlines = 0
    case_start = None
    last_event = None

    # Single pass over the log (the file used to be opened and parsed twice,
    # producing the same sequences both times, and never closed).
    with open('../data/final_experiments/%s' % eventlog, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(spamreader, None)  # skip the headers
        for row in spamreader:
            t = time.strptime(row[2], "%Y/%m/%d %H:%M:%S")
            event_time = datetime.fromtimestamp(time.mktime(t))
            if row[0] != lastcase:  # a new case starts on this row
                case_start = event_time
                last_event = event_time
                lastcase = row[0]
                if not first_line:  # flush the case that just ended
                    lines.append(line)
                    lines_group.append(line_group)
                    timeseqs.append(times)
                    timeseqs2.append(times2)
                    timeseqs3.append(times3)
                    timeseqs4.append(times4)
                line = ''
                line_group = ''
                times, times2, times3, times4 = [], [], [], []
                numlines += 1
            line += get_unicode_from_int(row[1])
            line_group += get_unicode_from_int(row[3])
            since_last = event_time - last_event
            since_start = event_time - case_start
            midnight = event_time.replace(
                hour=0, minute=0, second=0, microsecond=0)
            times.append(86400 * since_last.days + since_last.seconds)
            times2.append(86400 * since_start.days + since_start.seconds)
            times3.append((event_time - midnight).seconds)
            times4.append(event_time.weekday())
            last_event = event_time
            first_line = False

    # Flush the last case (done unconditionally, as before).
    lines.append(line)
    lines_group.append(line_group)
    timeseqs.append(times)
    timeseqs2.append(times2)
    timeseqs3.append(times3)
    timeseqs4.append(times4)
    # NOTE: numlines ends up one larger than len(lines); kept because the
    # fold boundaries below are derived from it.
    numlines += 1

    divisor = np.mean([item for sublist in timeseqs for item in sublist])
    print('divisor: {}'.format(divisor))
    divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])
    print('divisor2: {}'.format(divisor2))

    elems_per_fold = int(round(numlines / 3))
    train_end = 2 * elems_per_fold  # folds 1 and 2 are used for training

    # Activity alphabet: inputs exclude the '!' end marker, targets keep it.
    train_lines = [x + '!' for x in lines[:train_end]]
    maxlen = max(len(x) for x in train_lines)
    chars = sorted(set().union(*train_lines))
    target_chars = list(chars)
    chars.remove('!')
    print('total chars: {}, target chars: {}'.format(
        len(chars), len(target_chars)))
    char_indices = {c: i for i, c in enumerate(chars)}
    target_char_indices = {c: i for i, c in enumerate(target_chars)}

    # Group alphabet is built without the end marker on both sides.
    chars_group = sorted(set().union(*lines_group[:train_end]))
    target_chars_group = list(chars_group)
    print('total groups: {}, target groups: {}'.format(
        len(chars_group), len(target_chars_group)))
    char_indices_group = {c: i for i, c in enumerate(chars_group)}
    target_char_indices_group = {
        c: i for i, c in enumerate(target_chars_group)}

    # Dump every fold to disk.
    fold_bounds = (
        ('fold1', 0, elems_per_fold),
        ('fold2', elems_per_fold, train_end),
        ('fold3', train_end, None),
    )
    for fold_name, start, stop in fold_bounds:
        with open('output_files/folds/%s.csv' % fold_name, 'wb') as csvfile:
            spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL)
            for row, timeseq in zip(lines[start:stop], timeseqs[start:stop]):
                spamwriter.writerow([
                    unicode(s).encode("utf-8") + '#{}'.format(gap)
                    for s, gap in zip(row, timeseq)
                ])

    # Training material: folds 1 + 2, sequences terminated with '!'.
    train_groups = [g + '!' for g in lines_group[:train_end]]
    lines_t = timeseqs[:train_end]
    lines_t2 = timeseqs2[:train_end]
    lines_t3 = timeseqs3[:train_end]
    lines_t4 = timeseqs4[:train_end]

    step = 1
    softness = 0
    sentences = []
    sentences_group = []
    sentences_t = []
    sentences_t2 = []
    sentences_t3 = []
    sentences_t4 = []
    next_chars = []
    next_chars_group = []
    next_chars_t = []
    for line, line_group, line_t, line_t2, line_t3, line_t4 in zip(
            train_lines, train_groups, lines_t, lines_t2, lines_t3, lines_t4):
        # Every non-empty proper prefix is a sample; the symbol that follows
        # the prefix is its target.
        for i in range(step, len(line), step):
            sentences.append(line[0:i])
            sentences_group.append(line_group[0:i])
            sentences_t.append(line_t[0:i])
            sentences_t2.append(line_t2[0:i])
            sentences_t3.append(line_t3[0:i])
            sentences_t4.append(line_t4[0:i])
            next_chars.append(line[i])
            next_chars_group.append(line_group[i])
            # The '!' end marker has no timestamp: its time target is 0.
            next_chars_t.append(0 if i == len(line) - 1 else line_t[i])
    print('nb sequences:', len(sentences))

    print('Vectorization...')
    n_act = len(chars)
    n_feat_base = n_act + len(chars_group)
    num_features = n_feat_base + 5
    print('num features: {}'.format(num_features))
    print('MaxLen: ', maxlen)
    X = np.zeros((len(sentences), maxlen, num_features), dtype=np.float32)
    y_a = np.zeros((len(sentences), len(target_chars)), dtype=np.float32)
    y_g = np.zeros((len(sentences), len(target_chars_group)),
                   dtype=np.float32)
    y_t = np.zeros((len(sentences)), dtype=np.float32)

    def _fill_target(target_row, symbol, index_of):
        """Write the (optionally smoothed) one-hot target for ``symbol``."""
        n_classes = len(index_of)
        # Guarding on n_classes avoids the 0 / 0 the old per-class loop hit
        # when an alphabet had a single symbol and the target was '!'.
        if softness and n_classes > 1:
            target_row[:] = softness / (n_classes - 1)
        hit = index_of.get(symbol)
        if hit is not None:
            target_row[hit] = 1 - softness

    for i, sentence in enumerate(sentences):
        leftpad = maxlen - len(sentence)  # samples are right-aligned in X
        sentence_group = sentences_group[i]
        sentence_t = sentences_t[i]
        sentence_t2 = sentences_t2[i]
        sentence_t3 = sentences_t3[i]
        sentence_t4 = sentences_t4[i]
        for t, char in enumerate(sentence):
            pos = t + leftpad
            act_idx = char_indices.get(char)
            if act_idx is not None:
                X[i, pos, act_idx] = 1
            group_idx = char_indices_group.get(sentence_group[t])
            if group_idx is not None:
                X[i, pos, n_act + group_idx] = 1
            X[i, pos, n_feat_base] = t + 1
            X[i, pos, n_feat_base + 1] = sentence_t[t] / divisor
            X[i, pos, n_feat_base + 2] = sentence_t2[t] / divisor2
            X[i, pos, n_feat_base + 3] = sentence_t3[t] / 86400
            X[i, pos, n_feat_base + 4] = sentence_t4[t] / 7
        _fill_target(y_a[i], next_chars[i], target_char_indices)
        _fill_target(y_g[i], next_chars_group[i], target_char_indices_group)
        y_t[i] = next_chars_t[i] / divisor

    # sys.maxsize is the documented way to disable summarisation; NaN is
    # rejected by current NumPy releases.
    np.set_printoptions(threshold=sys.maxsize)

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    set_session(tf.Session(config=config))

    # build the model:
    print('Build model...')
    main_input = Input(shape=(maxlen, num_features), name='main_input')
    # one shared LSTM layer followed by one specialised LSTM layer per output
    l1 = LSTM(100, consume_less='gpu', init='glorot_uniform',
              return_sequences=True, dropout_W=0.2)(main_input)
    b1 = BatchNormalization()(l1)
    # the layer specialized in activity prediction
    l2_1 = LSTM(100, consume_less='gpu', init='glorot_uniform',
                return_sequences=False, dropout_W=0.2)(b1)
    b2_1 = BatchNormalization()(l2_1)
    # the layer specialized in time prediction
    l2_2 = LSTM(100, consume_less='gpu', init='glorot_uniform',
                return_sequences=False, dropout_W=0.2)(b1)
    b2_2 = BatchNormalization()(l2_2)
    # the layer specialized in resource prediction
    l2_3 = LSTM(100, consume_less='gpu', init='glorot_uniform',
                return_sequences=False, dropout_W=0.2)(b1)
    b2_3 = BatchNormalization()(l2_3)

    act_output = Dense(len(target_chars), activation='softmax',
                       init='glorot_uniform', name='act_output')(b2_1)
    group_output = Dense(len(target_chars_group), activation='softmax',
                         init='glorot_uniform', name='group_output')(b2_3)
    time_output = Dense(1, init='glorot_uniform', name='time_output')(b2_2)

    model = Model(input=[main_input],
                  output=[act_output, group_output, time_output])

    opt = Nadam(lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08,
                schedule_decay=0.004, clipvalue=3)
    model.compile(loss={
        'act_output': 'categorical_crossentropy',
        'group_output': 'categorical_crossentropy',
        'time_output': 'mae'
    }, optimizer=opt)

    early_stopping = EarlyStopping(monitor='val_loss', patience=42)
    path_to_model = 'output_files/final_experiments/models/CFR/' + eventlog[:-4] + \
                    '/model_{epoch:02d}-{val_loss:.2f}.h5'
    model_checkpoint = ModelCheckpoint(path_to_model, monitor='val_loss',
                                       verbose=0, save_best_only=True,
                                       save_weights_only=False, mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                                   patience=10, verbose=0, mode='auto',
                                   epsilon=0.0001, cooldown=0, min_lr=0)

    model.fit(X, {
        'act_output': y_a,
        'time_output': y_t,
        'group_output': y_g
    },
              validation_split=0.2,
              verbose=2,
              callbacks=[early_stopping, model_checkpoint, lr_reducer],
              batch_size=maxlen,
              nb_epoch=300)
def prepare_testing_data(eventlog):
    """Load an event log and return the held-out fold with activity/group data.

    Reads ``'../data/final_experiments/<eventlog>'`` as CSV (``,`` delimiter,
    ``|`` quote char), skipping the header row.  Per row, column 0 is the case
    id, columns 1 and 3 are turned into activity / group symbols through
    ``get_unicode_from_int`` and column 2 is parsed as a
    ``%Y-%m-%d %H:%M:%S`` timestamp.  Consecutive rows sharing a case id are
    folded into one case.

    Args:
        eventlog: File name of the log, relative to
            ``../data/final_experiments/``.

    Returns:
        A 20-tuple ``(lines, lines_id, lines_group, lines_t, lines_t2,
        lines_t3, lines_t4, maxlen, chars, chars_group, char_indices,
        char_indices_group, divisor, divisor2, divisor3, predict_size,
        target_indices_char, target_indices_char_group, target_char_indices,
        target_char_indices_group)``.  The first seven entries only cover the
        cases after the first ``2 * elems_per_fold`` ones; the alphabets and
        ``maxlen`` are derived from those first ``2 * elems_per_fold`` cases.
    """
    lines_id = []     # case ids, index-aligned with ``lines``
    lines = []        # activity symbols, one string per case
    lines_group = []  # group symbols, one string per case
    timeseqs = []     # seconds since the previous event
    timeseqs2 = []    # seconds since the case start
    timeseqs3 = []    # event timestamps as datetime objects
    timeseqs4 = []    # event timestamps as the raw strings from the log
    lastcase = ''
    line = ''
    line_group = ''
    times, times2, times3, times4 = [], [], [], []
    first_line = True
    numlines = 0
    case_start = None
    last_event = None

    # The context manager guarantees the handle is released even if a row
    # fails to parse.
    with open('../data/final_experiments/%s' % eventlog, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(spamreader, None)  # skip the headers
        for row in spamreader:
            t = time.strptime(row[2], "%Y-%m-%d %H:%M:%S")
            event_time = datetime.fromtimestamp(time.mktime(t))
            if row[0] != lastcase:  # a new case starts on this row
                case_start = event_time
                last_event = event_time
                lastcase = row[0]
                if not first_line:  # flush the case that just ended
                    lines.append(line)
                    lines_group.append(line_group)
                    timeseqs.append(times)
                    timeseqs2.append(times2)
                    timeseqs3.append(times3)
                    timeseqs4.append(times4)
                # The id is recorded when its case starts, so that lines_id
                # gets exactly one entry per case, in the same order as
                # ``lines`` (including the last case, which is flushed after
                # the loop).
                lines_id.append(lastcase)
                line = ''
                line_group = ''
                times, times2, times3, times4 = [], [], [], []
                numlines += 1
            line += get_unicode_from_int(row[1])
            line_group += get_unicode_from_int(row[3])
            since_last = event_time - last_event
            since_start = event_time - case_start
            times.append(86400 * since_last.days + since_last.seconds)
            times2.append(86400 * since_start.days + since_start.seconds)
            times3.append(event_time)
            times4.append(row[2])
            last_event = event_time
            first_line = False

    # Flush the last case (done unconditionally, as before).
    lines.append(line)
    lines_group.append(line_group)
    timeseqs.append(times)
    timeseqs2.append(times2)
    timeseqs3.append(times3)
    timeseqs4.append(times4)
    # NOTE: numlines ends up one larger than len(lines); kept because the
    # fold boundaries below are derived from it.
    numlines += 1

    divisor = np.mean([item for sublist in timeseqs for item in sublist])
    divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])
    # Mean over cases of the mean "time until the last event of the case".
    divisor3 = np.mean(
        [np.mean([seq[-1] - y for y in seq]) for seq in timeseqs2])

    elems_per_fold = int(round(numlines / 3))
    test_start = 2 * elems_per_fold

    # Activity alphabet: inputs exclude the '!' end marker, targets keep it.
    fold1and2lines = [x + '!' for x in lines[:test_start]]
    maxlen = max(len(x) for x in fold1and2lines)
    chars = sorted(set().union(*fold1and2lines))
    target_chars = list(chars)
    chars.remove('!')
    char_indices = {c: i for i, c in enumerate(chars)}
    target_char_indices = {c: i for i, c in enumerate(target_chars)}
    target_indices_char = {i: c for i, c in enumerate(target_chars)}

    # Group alphabet is built without the end marker on both sides.
    chars_group = sorted(set().union(*lines_group[:test_start]))
    target_chars_group = list(chars_group)
    char_indices_group = {c: i for i, c in enumerate(chars_group)}
    target_char_indices_group = {
        c: i for i, c in enumerate(target_chars_group)}
    target_indices_char_group = {
        i: c for i, c in enumerate(target_chars_group)}

    # Only the remaining cases (third fold) are returned.
    lines = lines[test_start:]
    lines_id = lines_id[test_start:]
    lines_group = lines_group[test_start:]
    lines_t = timeseqs[test_start:]
    lines_t2 = timeseqs2[test_start:]
    lines_t3 = timeseqs3[test_start:]
    lines_t4 = timeseqs4[test_start:]

    predict_size = maxlen

    return lines, \
        lines_id, \
        lines_group, \
        lines_t, \
        lines_t2, \
        lines_t3, \
        lines_t4, \
        maxlen, \
        chars, \
        chars_group, \
        char_indices, \
        char_indices_group, \
        divisor, \
        divisor2, \
        divisor3, \
        predict_size, \
        target_indices_char, \
        target_indices_char_group, \
        target_char_indices, \
        target_char_indices_group
def train(log_name, models_folder, use_old_model):
    """Vectorise an event log and train the CFRT models, one per fold index.

    Reads ``shared_variables.data_folder + '<log_name>.csv'``.  Per CSV row,
    column 0 is the case id, columns 1 and 3 are turned into activity / group
    symbols through ``get_unicode_from_int`` and column 2 is parsed as a
    ``%Y-%m-%d %H:%M:%S`` timestamp.  In addition every event gets a "time
    symbol": the gap to the previous event of the same case, divided by
    ``max_gap / r`` and truncated to an int, passed to
    ``get_unicode_from_int``.

    Args:
        log_name: Log file name without the ``.csv`` extension.
        models_folder: Forwarded to ``create_checkpoints_path``.
        use_old_model: Forwarded to ``TrainCFRT._build_model``.

    Returns:
        None.  For every ``fold`` in ``range(folds)`` a model is built with
        ``TrainCFRT._build_model`` and trained with
        ``TrainCFRT._train_model``.
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    r = 3  # the largest gap is divided by r to obtain the bucket width

    # Read the log once; it used to be opened twice (never closed) and
    # parsed three times.
    with open(shared_variables.data_folder + '%s.csv' % log_name,
              'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(spamreader, None)  # skip the headers
        rows = list(spamreader)

    # ---- pass 1: gap (in seconds) of every event to its predecessor -------
    difflist = []
    lastcase = ''
    lastevent = None
    for row in rows:
        t1 = datetime.fromtimestamp(
            time.mktime(time.strptime(row[2], time_format)))
        if row[0] != lastcase:
            lastevent = t1
            lastcase = row[0]
        if row[1] != '0':
            gap = t1 - lastevent
            tdiff = 86400 * gap.days + gap.seconds
        else:
            # events whose activity column is '0' always get a zero gap
            tdiff = 0
        difflist.append(tdiff)
        lastevent = t1

    difflist = [int(i) for i in difflist]
    maxdiff = max(difflist)
    # Nudge the (first) maximum just below maxdiff so that it truncates into
    # bucket r - 1 rather than bucket r.
    difflist[np.argmax(difflist)] -= 1e-8
    diff = maxdiff / r

    # ---- pass 2: group events into cases ----------------------------------
    lines = []        # activity symbols, one string per case
    lines_group = []  # group symbols, one string per case
    lines_time = []   # time symbols, one string per case
    timeseqs = []     # seconds since the previous event
    timeseqs2 = []    # seconds since the case start
    timeseqs3 = []    # seconds since midnight
    timeseqs4 = []    # weekday (datetime.weekday())
    # lastcase must start empty again: it still holds the id of the last
    # case from pass 1, which made a single-case log skip the case-start
    # branch and crash on mktime(None).
    lastcase = ''
    line = ''
    line_group = ''
    line_time = ''
    times, times2, times3, times4 = [], [], [], []
    first_line = True
    numlines = 0
    case_start = None
    last_event = None
    for line_index, row in enumerate(rows):
        event_time = datetime.fromtimestamp(
            time.mktime(time.strptime(row[2], time_format)))
        if row[0] != lastcase:  # a new case starts on this row
            case_start = event_time
            last_event = event_time
            lastcase = row[0]
            if not first_line:  # flush the case that just ended
                lines.append(line)
                lines_group.append(line_group)
                lines_time.append(line_time)
                timeseqs.append(times)
                timeseqs2.append(times2)
                timeseqs3.append(times3)
                timeseqs4.append(times4)
            line = ''
            line_group = ''
            # line_time has to be reset together with the other per-case
            # buffers; the final parsing pass used to skip this, so the time
            # symbols of all earlier cases leaked into every later case.
            line_time = ''
            times, times2, times3, times4 = [], [], [], []
            numlines += 1
        line += get_unicode_from_int(row[1])
        line_group += get_unicode_from_int(row[3])
        line_time += get_unicode_from_int(int(difflist[line_index] / diff))
        since_last = event_time - last_event
        since_start = event_time - case_start
        midnight = event_time.replace(
            hour=0, minute=0, second=0, microsecond=0)
        times.append(86400 * since_last.days + since_last.seconds)
        times2.append(86400 * since_start.days + since_start.seconds)
        times3.append((event_time - midnight).seconds)
        times4.append(event_time.weekday())
        last_event = event_time
        first_line = False

    # Flush the last case (done unconditionally, as before).
    lines.append(line)
    lines_group.append(line_group)
    lines_time.append(line_time)
    timeseqs.append(times)
    timeseqs2.append(times2)
    timeseqs3.append(times3)
    timeseqs4.append(times4)
    # NOTE: numlines ends up one larger than len(lines); kept because the
    # fold boundaries below are derived from it.
    numlines += 1

    divisor = np.max([item for sublist in timeseqs for item in sublist])
    print('divisor: {}'.format(divisor))
    divisor2 = np.max([item for sublist in timeseqs2 for item in sublist])
    print('divisor2: {}'.format(divisor2))

    elements_per_fold = int(round(numlines / 3))
    vocab_end = 2 * elements_per_fold

    # Activity alphabet: inputs exclude the '!' end marker, targets keep it.
    vocab_lines = [x + '!' for x in lines[:vocab_end]]
    maxlen = max(len(x) for x in vocab_lines)
    chars = sorted(set().union(*vocab_lines))
    target_chars = list(chars)
    chars.remove('!')
    print('total chars: {}, target chars: {}'.format(
        len(chars), len(target_chars)))
    char_indices = {c: i for i, c in enumerate(chars)}
    target_char_indices = {c: i for i, c in enumerate(target_chars)}

    # Group and time alphabets are built without the end marker.
    chars_group = sorted(set().union(*lines_group[:vocab_end]))
    target_chars_group = list(chars_group)
    print('total groups: {}, target groups: {}'.format(
        len(chars_group), len(target_chars_group)))
    char_indices_group = {c: i for i, c in enumerate(chars_group)}
    target_char_indices_group = {
        c: i for i, c in enumerate(target_chars_group)}

    chars_time = sorted(set().union(*lines_time[:vocab_end]))
    target_chars_time = list(chars_time)
    print('total times: {}, target times: {}'.format(
        len(chars_time), len(target_chars_time)))
    char_indices_time = {c: i for i, c in enumerate(chars_time)}
    target_char_indices_time = {
        c: i for i, c in enumerate(target_chars_time)}

    # NOTE(review): the alphabets above come from the first
    # 2 * elements_per_fold cases, whereas the training subset below drops
    # the last elements_per_fold cases; the two slices need not coincide.
    # Kept exactly as in the original.
    lines = [x + '!' for x in lines[:-elements_per_fold]]
    lines_group = [x + '!' for x in lines_group[:-elements_per_fold]]
    lines_time = [x + '!' for x in lines_time[:-elements_per_fold]]
    lines_t = timeseqs[:-elements_per_fold]
    lines_t2 = timeseqs2[:-elements_per_fold]
    lines_t3 = timeseqs3[:-elements_per_fold]
    lines_t4 = timeseqs4[:-elements_per_fold]

    step = 1
    softness = 0
    sentences = []
    sentences_group = []
    sentences_time = []
    sentences_t = []
    sentences_t2 = []
    sentences_t3 = []
    sentences_t4 = []
    next_chars = []
    next_chars_group = []
    next_chars_time = []
    next_chars_t = []
    for line, line_group, line_time, line_t, line_t2, line_t3, line_t4 in zip(
            lines, lines_group, lines_time,
            lines_t, lines_t2, lines_t3, lines_t4):
        # Every non-empty proper prefix is a sample; the symbol that follows
        # the prefix is its target.
        for i in range(step, len(line), step):
            sentences.append(line[0:i])
            sentences_group.append(line_group[0:i])
            sentences_time.append(line_time[0:i])
            sentences_t.append(line_t[0:i])
            sentences_t2.append(line_t2[0:i])
            sentences_t3.append(line_t3[0:i])
            sentences_t4.append(line_t4[0:i])
            next_chars.append(line[i])
            next_chars_group.append(line_group[i])
            next_chars_time.append(line_time[i])
            # The '!' end marker has no timestamp: its time target is 0.
            next_chars_t.append(0 if i == len(line) - 1 else line_t[i])
    print('nb sequences:', len(sentences))

    print('Vectorization...')
    n_act = len(chars)
    n_act_grp = n_act + len(chars_group)
    n_feat_base = n_act_grp + len(chars_time)
    num_features = n_feat_base + 5
    print('num features: {}'.format(num_features))
    print('MaxLen: ', maxlen)
    X = np.zeros((len(sentences), maxlen, num_features), dtype=np.float32)
    y_a = np.zeros((len(sentences), len(target_chars)), dtype=np.float32)
    y_g = np.zeros((len(sentences), len(target_chars_group)),
                   dtype=np.float32)
    y_t = np.zeros((len(sentences)), dtype=np.float32)
    y_y = np.zeros((len(sentences), len(target_chars_time)),
                   dtype=np.float32)

    def _fill_target(target_row, symbol, index_of):
        """Write the (optionally smoothed) one-hot target for ``symbol``."""
        n_classes = len(index_of)
        # Guarding on n_classes avoids the 0 / 0 the old per-class loop hit
        # when an alphabet had a single symbol and the target was '!'.
        if softness and n_classes > 1:
            target_row[:] = softness / (n_classes - 1)
        hit = index_of.get(symbol)
        if hit is not None:
            target_row[hit] = 1 - softness

    for i, sentence in enumerate(sentences):
        leftpad = maxlen - len(sentence)  # samples are right-aligned in X
        sentence_group = sentences_group[i]
        sentence_time = sentences_time[i]
        sentence_t = sentences_t[i]
        sentence_t2 = sentences_t2[i]
        sentence_t3 = sentences_t3[i]
        sentence_t4 = sentences_t4[i]
        for t, char in enumerate(sentence):
            pos = t + leftpad
            act_idx = char_indices.get(char)
            if act_idx is not None:
                X[i, pos, act_idx] = 1
            group_idx = char_indices_group.get(sentence_group[t])
            if group_idx is not None:
                X[i, pos, n_act + group_idx] = 1
            time_idx = char_indices_time.get(sentence_time[t])
            if time_idx is not None:
                X[i, pos, n_act_grp + time_idx] = 1
            X[i, pos, n_feat_base] = t + 1
            X[i, pos, n_feat_base + 1] = sentence_t[t] / divisor
            X[i, pos, n_feat_base + 2] = sentence_t2[t] / divisor2
            X[i, pos, n_feat_base + 3] = sentence_t3[t] / 86400
            X[i, pos, n_feat_base + 4] = sentence_t4[t] / 7
        _fill_target(y_a[i], next_chars[i], target_char_indices)
        _fill_target(y_g[i], next_chars_group[i], target_char_indices_group)
        _fill_target(y_y[i], next_chars_time[i], target_char_indices_time)
        y_t[i] = next_chars_t[i] / divisor

    for fold in range(folds):
        model = TrainCFRT._build_model(maxlen, num_features, target_chars,
                                       target_chars_time, target_chars_group,
                                       use_old_model)
        checkpoint_name = create_checkpoints_path(log_name, models_folder,
                                                  fold, 'CFRT')
        TrainCFRT._train_model(model, checkpoint_name, X, y_a, y_t, y_y, y_g)
def train(log_name, models_folder, use_old_model):
    """Build the training tensors from an event log and train one model per fold.

    The log ``shared_variables.data_folder + '<log_name>.csv'`` is read once.
    Its rows are "CaseID,ActivityID,CompleteTimestamp" (the header row is
    skipped); consecutive rows that share a case id form one trace.  For
    every event four time features are derived: seconds since the previous
    event of the case, seconds since the first event of the case, seconds
    since midnight and the weekday.  Every proper prefix of each training
    trace becomes one sample whose targets are the next activity symbol and
    the (scaled) time to the next event.

    Args:
        log_name: name of the CSV file, without the ``.csv`` suffix.
        models_folder: passed through to ``create_checkpoints_path``.
        use_old_model: passed through to ``TrainCF._build_model``.

    Raises:
        ValueError: if a timestamp does not match ``%Y-%m-%d %H:%M:%S``
            (raised by ``time.strptime``), or if no training trace is
            available (``max``/``np.max`` of an empty sequence).
    """
    lines = []      # one string of activity symbols per case
    timeseqs = []   # per case: seconds since the previous event of the case
    timeseqs2 = []  # per case: seconds since the first event of the case
    timeseqs3 = []  # per case: seconds since midnight of the event's day
    timeseqs4 = []  # per case: weekday of the event (datetime.weekday())

    # accumulators for the case currently being read
    last_case = ''
    line = ''
    times = []
    times2 = []
    times3 = []
    times4 = []
    first_line = True
    num_lines = 0
    case_start = None
    last_event = None

    # The context manager guarantees the file is closed even if a row fails
    # to parse.  One pass collects everything the function needs.
    with open(shared_variables.data_folder + '%s.csv' % log_name, 'r') as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(csv_reader, None)  # skip the headers
        for row in csv_reader:
            t = time.strptime(row[2], "%Y-%m-%d %H:%M:%S")
            event_time = datetime.fromtimestamp(time.mktime(t))

            if row[0] != last_case:  # a new case starts with this row
                case_start = event_time
                last_event = event_time
                last_case = row[0]
                if not first_line:  # flush the case that just ended
                    lines.append(line)
                    timeseqs.append(times)
                    timeseqs2.append(times2)
                    timeseqs3.append(times3)
                    timeseqs4.append(times4)
                line = ''
                times = []
                times2 = []
                times3 = []
                times4 = []
                num_lines += 1

            line += get_unicode_from_int(row[1])

            since_last_event = event_time - last_event
            since_case_start = event_time - case_start
            midnight = event_time.replace(hour=0, minute=0, second=0, microsecond=0)

            times.append(86400 * since_last_event.days + since_last_event.seconds)
            times2.append(86400 * since_case_start.days + since_case_start.seconds)
            times3.append((event_time - midnight).seconds)
            times4.append(event_time.weekday())

            last_event = event_time
            first_line = False

    # add last case
    lines.append(line)
    timeseqs.append(times)
    timeseqs2.append(times2)
    timeseqs3.append(times3)
    timeseqs4.append(times4)
    # NOTE(review): together with the per-case increment above this makes
    # num_lines one larger than len(lines).  Kept as-is because the value
    # drives the fold split below - confirm whether other code relies on it.
    num_lines += 1

    # Scaling constants: the LARGEST observed value over all cases
    # (np.max, not a mean).
    divisor = 1.0 * np.max([item for sublist in timeseqs for item in sublist])
    print('divisor: {}'.format(divisor))
    divisor2 = 1.0 * np.max([item for sublist in timeseqs2 for item in sublist])
    print('divisor2: {}'.format(divisor2))

    # separate training data into 2 (out of 3) parts
    elements_per_fold = int(round(num_lines / 3))

    trace_lengths = [len(a) for a in lines]
    print("mean length of the trace: ", np.mean(trace_lengths))
    print("median length of the trace: ", np.median(trace_lengths))
    print("number of traces: ", len(lines))

    # Vocabulary and maximum length come from the first two folds, each trace
    # terminated with the delimiter symbol '!'.
    # NOTE(review): the samples below are taken from lines[:-elements_per_fold],
    # which can contain more traces than lines[:2 * elements_per_fold]; a
    # longer trace there would yield a negative left padding.  Preserved.
    vocab_lines = [x + '!' for x in lines[:2 * elements_per_fold]]
    maxlen = max(len(x) for x in vocab_lines)  # find maximum line size

    # all unique symbols in alphabetical order; '!' is a target only
    target_chars = sorted(set().union(*(set(x) for x in vocab_lines)))
    chars = list(target_chars)
    chars.remove('!')
    print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))
    char_indices = {c: i for i, c in enumerate(chars)}
    target_char_indices = {c: i for i, c in enumerate(target_chars)}

    # training portion: everything except the last fold
    lines = [x + '!' for x in lines[:-elements_per_fold]]
    lines_t = timeseqs[:-elements_per_fold]
    lines_t2 = timeseqs2[:-elements_per_fold]
    lines_t3 = timeseqs3[:-elements_per_fold]
    lines_t4 = timeseqs4[:-elements_per_fold]

    step = 1
    softness = 0
    sentences = []
    sentences_t = []
    sentences_t2 = []
    sentences_t3 = []
    sentences_t4 = []
    next_chars = []
    next_chars_t = []
    for line, line_t, line_t2, line_t3, line_t4 in izip(lines, lines_t, lines_t2, lines_t3, lines_t4):
        # we add iteratively: first symbol of the line, then two first, three...
        for i in range(step, len(line), step):
            sentences.append(line[0:i])
            sentences_t.append(line_t[0:i])
            sentences_t2.append(line_t2[0:i])
            sentences_t3.append(line_t3[0:i])
            sentences_t4.append(line_t4[0:i])
            next_chars.append(line[i])
            if i == len(line) - 1:
                # the end-of-trace symbol has no timestamp of its own
                next_chars_t.append(0)
            else:
                next_chars_t.append(line_t[i])
    print('nb sequences:', len(sentences))

    print('Vectorization...')
    n_chars = len(chars)
    num_features = n_chars + 5
    print('num features: {}'.format(num_features))
    X = np.zeros((len(sentences), maxlen, num_features), dtype=np.float32)
    y_a = np.zeros((len(sentences), len(target_chars)), dtype=np.float32)
    y_t = np.zeros((len(sentences)), dtype=np.float32)
    for i, sentence in enumerate(sentences):
        leftpad = maxlen - len(sentence)  # prefixes are right-aligned in X
        next_t = next_chars_t[i]
        sentence_t = sentences_t[i]
        sentence_t2 = sentences_t2[i]
        sentence_t3 = sentences_t3[i]
        sentence_t4 = sentences_t4[i]
        for t, char in enumerate(sentence):
            pos = t + leftpad
            # one-hot encode the activity; symbols outside the vocabulary
            # leave the one-hot part all zero
            char_index = char_indices.get(char)
            if char_index is not None:
                X[i, pos, char_index] = 1
            X[i, pos, n_chars] = t + 1
            X[i, pos, n_chars + 1] = sentence_t[t] / divisor
            X[i, pos, n_chars + 2] = sentence_t2[t] / divisor2
            X[i, pos, n_chars + 3] = sentence_t3[t] / 86400
            X[i, pos, n_chars + 4] = sentence_t4[t] / 7
        for c in target_chars:
            if c == next_chars[i]:
                y_a[i, target_char_indices[c]] = 1 - softness
            else:
                y_a[i, target_char_indices[c]] = softness / (len(target_chars) - 1)
        y_t[i] = next_t / divisor

    for fold in range(folds):
        model = TrainCF._build_model(maxlen, num_features, target_chars, use_old_model)
        checkpoint_name = create_checkpoints_path(log_name, models_folder, fold, 'CF')
        TrainCF._train_model(model, checkpoint_name, X, y_a, y_t)