def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job configuration, loading from cache if possible
    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame
    """
    print('\tGetting Dataset')
    if use_cache:
        # Cache layer 1: fully labelled/encoded DataFrames for this exact
        # (split, encoding, labelling) combination.
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            training_df, test_df = get_labelled_logs(job)
        else:
            # Cache layer 2: raw train/test event logs already loaded for this split.
            if job.split.train_log is not None and \
               job.split.test_log is not None and \
               LoadedLog.objects.filter(train_log=job.split.train_log.path,
                                        test_log=job.split.test_log.path).exists():
                training_log, test_log, additional_columns = get_loaded_logs(
                    job.split)
            else:
                # Cache miss on both layers: split the source log from scratch.
                training_log, test_log, additional_columns = prepare_logs(
                    job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    # Materialize the single split as a double split so the two
                    # halves can be cached as separate logs.
                    # NOTE(review): duplicate_orm_row presumably clones the Split
                    # row so the original single split stays untouched — confirm.
                    job.split = duplicate_orm_row(job.split)
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    # Train file covers percentages 0..(100 - test_size*100).
                    train_name = '0-' + str(
                        int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(EventLog(training_log),
                                                     train_name + '.xes')
                    # Test file covers the remaining (100 - test_size*100)..100.
                    test_name = str(int(100 - (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(EventLog(test_log),
                                                    test_name + '.xes')
                    job.split.additional_columns = str(
                        train_name + test_name)  # TODO: find better naming policy
                    job.save()
                # Populate cache layer 2 for the freshly prepared logs.
                put_loaded_logs(job.split, training_log, test_log,
                                additional_columns)
            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            # Populate cache layer 1 with the encoded result.
            put_labelled_logs(job, training_df, test_df)
    else:
        # Caching disabled: always split and encode from scratch, and do not
        # write anything back to the cache tables.
        training_log, test_log, additional_columns = prepare_logs(job.split)
        training_df, test_df = encode_label_logs(
            training_log,
            test_log,
            job,
            additional_columns=additional_columns)
    return training_df, test_df
def test_random(self):
    """Two preparations of the same random split should order traces differently."""
    random_split = split_single(
        split_ordering=SplitOrderingMethods.SPLIT_RANDOM.value)
    first_run, _, _ = prepare_logs(random_split)
    second_run, _, _ = prepare_logs(random_split)
    # Identical trace ordering across both runs would mean the shuffle is not random.
    self.assertNotEqual(trace_names(first_run), trace_names(second_run))
def test_sequential(self):
    """Sequential ordering must produce a deterministic train/test partition."""
    ordered_split = split_single(
        split_ordering=SplitOrderingMethods.SPLIT_SEQUENTIAL.value)
    train_part, test_part, _ = prepare_logs(ordered_split)
    # Exact order matters here: sequential splitting keeps the log's own ordering.
    self.assertListEqual(['3', '2', '1', '6'], trace_names(train_part))
    self.assertListEqual(['5', '4'], trace_names(test_part))
def test_strict_temporal(self):
    """Strict temporal ordering: only membership matters, so compare sorted names."""
    temporal_split = split_single(
        split_ordering=SplitOrderingMethods.SPLIT_STRICT_TEMPORAL.value)
    train_part, test_part, _ = prepare_logs(temporal_split)
    # Modified log to have only one trace here
    self.assertListEqual(['1'], sorted(trace_names(train_part)))
    self.assertListEqual(sorted(['6', '4']), sorted(trace_names(test_part)))
def test_temporal(self):
    """Temporal ordering: check set membership of traces via sorted name lists."""
    temporal_split = split_single(
        split_ordering=SplitOrderingMethods.SPLIT_TEMPORAL.value)
    train_part, test_part, _ = prepare_logs(temporal_split)
    self.assertListEqual(sorted(['1', '2', '3', '5']),
                         sorted(trace_names(train_part)))
    self.assertListEqual(sorted(['6', '4']), sorted(trace_names(test_part)))
def test_size(self):
    """A test_size of 0.5 splits the six traces into two equal halves."""
    train_part, test_part, _ = prepare_logs(split_single(test_size=0.5))
    self.assertEqual(3, len(train_part))
    self.assertEqual(3, len(test_part))
def test_split_double(self):
    """A pre-defined double split keeps its fixed 4/2 train/test trace counts."""
    train_part, test_part, _ = prepare_logs(split_double())
    self.assertEqual(4, len(train_part))
    self.assertEqual(2, len(test_part))