def test_next_activity_zero_padding_elapsed_time(self):
    """Simple-index encoding with zero padding, elapsed time and a next-activity label.

    Encodes the test log against itself with ``prefix_length=10`` and
    ``padding=True``, then checks the resulting shape, the presence of the
    ``elapsed_time`` column and the encoded rows of traces ``'5'`` and ``'4'``.
    """
    labelling = create_test_labelling(
        label_type=LabelTypes.NEXT_ACTIVITY.value)
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.SIMPLE_INDEX.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=10,
        padding=True)
    job = create_test_job(
        encoding=encoding,
        labelling=labelling,
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value))

    df, _ = encode_label_logs(self.test_log, self.test_log, job)

    self.assertEqual(df.shape, (2, 13))
    # assertIn prints the actual column list on failure, unlike
    # assertTrue(x in y) which only reports "False is not true".
    self.assertIn('elapsed_time', df.columns.values.tolist())

    trace_5 = df[df.trace_id == '5'].iloc[0].values.tolist()
    # NOTE(review): this expected row has 12 values, while the shape assertion
    # above requires 13 columns and the trace '4' row below lists 13 values.
    # Confirm the intended expected values for trace '5'.
    self.assertListEqual(trace_5,
                         ['5', 1, 3, 2, 2, 2, 0, 0, 0, 0, 1296240.0, 2])

    trace_4 = df[df.trace_id == '4'].iloc[0].values.tolist()
    self.assertListEqual(trace_4, [
        '4', 52903968, 32171502, 17803069, 1149821, 72523760, 0, 0, 0, 0,
        0, 520920.0, 0
    ])
def test_label_remaining_time_with_elapsed_time_custom_threshold(self):
    """Simple-index encoding (prefix 2) with a remaining-time label and custom threshold.

    The label uses ``THRESHOLD_CUSTOM`` with a threshold of 40000. The test
    checks the frame shape, the exact column names and the rows of traces
    ``'5'`` and ``'4'``.
    """
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.SIMPLE_INDEX.value,
        add_elapsed_time=True,
        add_remaining_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=2)
    labelling = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value,
        threshold_type=ThresholdTypes.THRESHOLD_CUSTOM.value,
        threshold=40000)
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=encoding, labelling=labelling, predictive_model=model)

    # Only the first frame returned by encode_label_logs is inspected.
    encoded, _ = encode_label_logs(self.test_log, self.test_log, job)

    self.assertEqual(encoded.shape, (2, 5))
    self.assertListEqual(
        encoded.columns.values.tolist(),
        ['trace_id', 'prefix_1', 'prefix_2', 'elapsed_time', 'label'])

    expected_rows = {
        '5': ['5', 1, 2, 0, 0],
        '4': ['4', 1, 1, 0, 0],
    }
    for trace_id, expected in expected_rows.items():
        actual = encoded[encoded.trace_id == trace_id].iloc[0].values.tolist()
        self.assertListEqual(actual, expected)
def test_attribute_number(self):
    """Boolean encoding (prefix 2) labelled with the numeric attribute ``AMOUNT``.

    Inspects the second frame returned by ``encode_label_logs`` and checks
    its shape and the rows of traces ``'5'`` and ``'4'``.
    """
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.BOOLEAN.value,
        prefix_length=2,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value)
    labelling = create_test_labelling(
        label_type=LabelTypes.ATTRIBUTE_NUMBER.value,
        attribute_name='AMOUNT')
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=encoding, labelling=labelling, predictive_model=model)

    _, encoded = encode_label_logs(self.test_log, self.test_log, job)

    self.assertEqual(encoded.shape, (2, 9))

    expected_rows = {
        '5': ['5', True, True, False, False, False, False, False, False],
        '4': ['4', True, False, True, False, False, False, False, True],
    }
    for trace_id, expected in expected_rows.items():
        actual = encoded[encoded.trace_id == trace_id].iloc[0].values.tolist()
        self.assertListEqual(actual, expected)
def test_hyperopt(self):
    """Hyperopt must leave the job with a non-default random-forest configuration.

    Creates a classification job with a HYPEROPT optimizer (``max_evals=2``),
    runs ``prediction_task`` on it, reloads the job and asserts that the
    stored random-forest configuration differs from the defaults returned by
    ``classification_random_forest()``.
    """
    job = Job.objects.create(
        split=create_test_split(
            split_type=SplitTypes.SPLIT_SINGLE.value,
            original_log=create_test_log(
                log_name=general_example_filename,
                log_path=general_example_filepath)),
        encoding=create_test_encoding(
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            prefix_length=3,
            padding=False),
        labelling=create_test_labelling(
            label_type=LabelTypes.REMAINING_TIME.value),
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.KMEANS.value),
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.RANDOM_FOREST.value),
        hyperparameter_optimizer=create_test_hyperparameter_optimizer(
            hyperoptim_type=HyperparameterOptimizationMethods.HYPEROPT.value,
            performance_metric=HyperOptLosses.ACC.value,
            max_evals=2))

    prediction_task(job.pk)

    # Reload the job that was just processed instead of assuming its primary
    # key is 1; primary keys are not guaranteed to restart between tests.
    job = Job.objects.get(pk=job.pk)

    default_config = classification_random_forest()
    stored_config = getattr(
        job.predictive_model.classification,
        ClassificationMethods.RANDOM_FOREST.value.lower()).to_dict()
    # assertNotEqual shows both dictionaries when the check fails.
    self.assertNotEqual(default_config, stored_config)
def test_ValueError_decode(self):
    """Expect ``Encoder.decode`` to raise ``ValueError`` for data encoding ``'None'``."""
    encoder = Encoder(df=self.df, encoding=self.encoding)
    unsupported_encoding = create_test_encoding(data_encoding='None')

    # The previous try/except/pass form passed even when no exception was
    # raised; assertRaises fails the test in that case.
    with self.assertRaises(ValueError):
        encoder.decode(df=self.df, encoding=unsupported_encoding)
def test_rnn_time_series_predictor_complex_no_exceptions(self):
    """Smoke test: fit and predict an RNN time-series predictor on complex-encoded logs.

    There are no assertions; the test passes when neither ``fit`` nor
    ``predict`` raises.
    """
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.COMPLEX.value,
        prefix_length=5,
        padding=True)
    labelling = create_test_labelling(
        label_type=LabelTypes.DURATION.value,
        threshold_type=ThresholdTypes.THRESHOLD_MEAN.value)

    encoded_train = complex(
        self.train_log, labelling, encoding, self.train_add_col)
    encoded_test = complex(
        self.test_log, labelling, encoding, self.test_add_col)

    # Only the first element of each split is used by this predictor.
    train_features, _ = self._drop_columns_and_split(encoded_train)
    test_features, _ = self._drop_columns_and_split(encoded_test)

    predictor = RNNTimeSeriesPredictor(
        **self._get_rnn_default_config(
            encoding=ValueEncodings.COMPLEX.value))
    predictor.fit(train_features)
    predictor.predict(test_features)
def test_NotImplementedException_init_encoder(self):
    """Expect ``Encoder`` construction to raise ``NotImplementedError`` for one-hot encoding."""
    one_hot_encoding = create_test_encoding(
        data_encoding=DataEncodings.ONE_HOT_ENCODER.value)

    # The previous try/except/pass form passed even when no exception was
    # raised; assertRaises fails the test in that case.
    with self.assertRaises(NotImplementedError):
        Encoder(df=self.df, encoding=one_hot_encoding)
def test_nn_regressor_complex_no_exceptions(self):
    """Smoke test: fit and predict an NN regressor on complex-encoded logs.

    There are no assertions; the test passes when neither ``fit`` nor
    ``predict`` raises.
    """
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.COMPLEX.value,
        prefix_length=2,
        padding=True)
    labelling = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value)

    encoded_train = complex(
        self.train_log, labelling, encoding, self.train_add_col)
    encoded_test = complex(
        self.test_log, labelling, encoding, self.test_add_col)

    train_features, train_targets = self._drop_columns_and_split(
        encoded_train)
    # Flatten the targets before handing them to fit().
    flat_targets = train_targets.values.ravel()
    test_features, _ = self._drop_columns_and_split(encoded_test)

    regressor = NNRegressor(
        **self._get_nn_default_config(
            encoding=ValueEncodings.COMPLEX.value))
    regressor.fit(train_features, flat_targets)
    regressor.predict(test_features)
def test_next_activity_zero_padding_elapsed_time(self):
    """Boolean encoding (prefix 3) with elapsed time and a next-activity label.

    Inspects the second frame returned by ``encode_label_logs`` and checks
    its shape, the presence of the ``elapsed_time`` column and the rows of
    traces ``'5'`` and ``'4'``.
    """
    labelling = create_test_labelling(
        label_type=LabelTypes.NEXT_ACTIVITY.value)
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.BOOLEAN.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=3)
    job = create_test_job(
        encoding=encoding,
        labelling=labelling,
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value))

    _, df = encode_label_logs(self.test_log, self.test_log, job)

    self.assertEqual(df.shape, (2, 10))
    # assertIn prints the actual column list on failure, unlike
    # assertTrue(x in y) which only reports "False is not true".
    self.assertIn('elapsed_time', df.columns.values.tolist())

    trace_5 = df[df.trace_id == '5'].iloc[0].values.tolist()
    self.assertListEqual(trace_5, [
        '5', True, True, True, False, False, False, False, 181200.0,
        'decide'
    ])

    trace_4 = df[df.trace_id == '4'].iloc[0].values.tolist()
    self.assertListEqual(trace_4, [
        '4', True, False, True, False, False, False, True, 171660.0,
        'decide'
    ])
def test_label_remaining_time_with_elapsed_time_custom_threshold(self):
    """Boolean encoding (prefix 3) with a remaining-time label and custom threshold.

    The label uses ``THRESHOLD_CUSTOM`` with a threshold of 40000. The test
    inspects the second frame returned by ``encode_label_logs`` and checks
    its shape and the rows of traces ``'5'`` and ``'4'``.
    """
    labelling = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value,
        threshold_type=ThresholdTypes.THRESHOLD_CUSTOM.value,
        threshold=40000)
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.BOOLEAN.value,
        prefix_length=3,
        add_elapsed_time=True,
        add_remaining_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value)
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=encoding, labelling=labelling, predictive_model=model)

    _, encoded = encode_label_logs(self.test_log, self.test_log, job)

    self.assertEqual(encoded.shape, (2, 10))

    expected_rows = {
        '5': ['5', True, True, True, False, False, False, False,
              181200.0, False],
        '4': ['4', True, False, True, False, False, False, True,
              171660.0, False],
    }
    for trace_id, expected in expected_rows.items():
        actual = encoded[encoded.trace_id == trace_id].iloc[0].values.tolist()
        self.assertListEqual(actual, expected)
def test_next_activity_kmeans(self):
    """Random forest with k-means clustering and a next-activity label on the repair example.

    Runs ``calculate`` on the job, drops the ``elapsed_time`` entry from the
    result and compares the remaining metrics with the expected values.
    """
    # unittest reads the camelCase attribute ``maxDiff``; the previous
    # ``max_diff`` assignment had no effect on diff truncation.
    self.maxDiff = None
    job = create_test_job(
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.KMEANS.value),
        split=repair_example(),
        encoding=create_test_encoding(prefix_length=8, padding=True),
        labelling=create_test_labelling(
            label_type=LabelTypes.NEXT_ACTIVITY.value),
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.RANDOM_FOREST.value))

    result, _ = calculate(job)
    # elapsed_time is removed before the comparison, so it is not checked.
    del result['elapsed_time']

    self.assertDictEqual(
        result, {
            'f1score': 0.54239884582595577,
            'acc': 0.80995475113122173,
            'true_positive': '--',
            'true_negative': '--',
            'false_negative': '--',
            'false_positive': '--',
            'precision': 0.62344720496894401,
            'recall': 0.5224945442336747,
            'auc': 0.4730604801339352
        })
def test_class_no_cluster(self):
    """Random forest without clustering on the repair example.

    Runs ``calculate`` on the job, drops the ``elapsed_time`` entry from the
    result and compares the remaining metrics with the expected values.
    """
    # unittest reads the camelCase attribute ``maxDiff``; the previous
    # ``max_diff`` assignment had no effect on diff truncation.
    self.maxDiff = None
    job = create_test_job(
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.NO_CLUSTER.value),
        split=repair_example(),
        encoding=create_test_encoding(
            prefix_length=5, padding=True, add_elapsed_time=True),
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.RANDOM_FOREST.value))

    result, _ = calculate(job)
    # elapsed_time is removed before the comparison, so it is not checked.
    del result['elapsed_time']

    self.assertDictEqual(
        result, {
            'f1score': 1.0,
            'acc': 1.0,
            'true_positive': '--',
            'true_negative': '--',
            'false_negative': '--',
            'false_positive': '--',
            'precision': 1.0,
            'recall': 1.0,
            'auc': 0.0
        })
def test_no_exceptions(self):
    """Smoke test: run ``calculate`` for every encoding/padding/method/label combination.

    The combinations are the Cartesian product of all ``ValueEncodings``,
    both padding settings, all ``ClassificationMethods`` and all
    ``LabelTypes``. Combinations using the ``'nn'`` method without padding,
    or with an ``ATTRIBUTE_STRING`` label, are skipped.
    """
    choices = [
        [enum.value for enum in ValueEncodings],
        [True, False],
        [enum.value for enum in ClassificationMethods],
        [enum.value for enum in LabelTypes],
    ]

    for encoding, padding, method, label in itertools.product(*choices):
        print(encoding, padding, method, label)
        if method == 'nn' and (
                not padding or label == LabelTypes.ATTRIBUTE_STRING.value):
            # The original guard ended in ``pass``, which is a no-op, so
            # these combinations were never actually skipped.
            # NOTE(review): presumably they are unsupported by the 'nn'
            # method — confirm.
            continue
        job = create_test_job(
            predictive_model=create_test_predictive_model(
                prediction_method=method),
            encoding=create_test_encoding(
                value_encoding=encoding, padding=padding),
            labelling=create_test_labelling(label_type=label))
        calculate(job)
def test_explain(self):
    """Run a SHAP explanation and its temporal-stability variant on a decision-tree job.

    Trains a decision-tree classifier on the explainability train/test logs,
    then checks that both ``explain`` and ``shap_temporal_stability`` return
    a plain ``dict``.
    """
    split = create_test_split(
        split_type=SplitTypes.SPLIT_DOUBLE.value,
        split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
        test_size=0.2,
        original_log=None,
        train_log=create_test_log(
            log_name='train_explainability.xes',
            log_path='cache/log_cache/test_logs/train_explainability.xes'),
        test_log=create_test_log(
            log_name='test_explainability.xes',
            log_path='cache/log_cache/test_logs/test_explainability.xes'))

    predictive_model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value,
        prediction_method=ClassificationMethods.DECISION_TREE.value)

    job = create_test_job(
        split=split,
        encoding=create_test_encoding(
            prefix_length=4,
            padding=True,
            value_encoding=ValueEncodings.SIMPLE_INDEX.value),
        labelling=create_test_labelling(
            label_type=LabelTypes.ATTRIBUTE_STRING.value,
            attribute_name='label'),
        clustering=None,
        create_models=True,
        predictive_model=predictive_model,
        job_type=JobTypes.PREDICTION.value,
        hyperparameter_optimizer=None,
        incremental_train=None)

    prediction_task(job.id, do_publish_result=False)
    job.refresh_from_db()

    # get_or_create returns (object, created); only the object is needed.
    exp, _ = Explanation.objects.get_or_create(
        type=ExplanationTypes.SHAP.value,
        split=split,
        predictive_model=predictive_model,
        job=job,
        results={})

    explanation_target = '2_101'
    prefix_target = 'prefix_1'

    train_frame, test_frame = get_encoded_logs(job)
    explanation = explain(
        exp, train_frame, test_frame, explanation_target, prefix_target)

    # The encoded logs are fetched again before the second call, as in the
    # original test.
    train_frame, test_frame = get_encoded_logs(job)
    explanation_temp = shap_temporal_stability(
        exp, train_frame, test_frame, explanation_target)

    # Exact type identity is required, not isinstance.
    self.assertIs(type(explanation), dict)
    self.assertIs(type(explanation_temp), dict)
def test_prefix1_no_elapsed_time(self):
    """Frequency encoding (prefix 1) without elapsed time yields no ``elapsed_time`` column."""
    frequency_encoding = create_test_encoding(
        value_encoding=ValueEncodings.FREQUENCY.value,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=1)

    encoded = frequency(
        self.log, self.event_names, self.labelling, frequency_encoding)

    self.assertEqual(encoded.shape, (2, 9))
    column_names = encoded.columns.values.tolist()
    self.assertNotIn('elapsed_time', column_names)
def setUp(self):
    """Load the general example test log and build the default labelling and encoding.

    The labelling is remaining-time; the encoding is simple-index with
    elapsed time, ``ONLY_THIS`` task generation and prefix length 1.
    """
    test_log = create_test_log(
        log_name=general_example_test_filename,
        log_path=general_example_test_filepath)
    self.log = get_log(test_log)
    self.labelling = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value)
    self.encoding = create_test_encoding(
        value_encoding=ValueEncodings.SIMPLE_INDEX.value,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        add_elapsed_time=True,
        prefix_length=1)
def test_prefix10(self):
    """Last-payload encoding with prefix length 10: expect one row, 17 columns, no nulls."""
    payload_encoding = create_test_encoding(
        value_encoding=ValueEncodings.LAST_PAYLOAD.value,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        add_elapsed_time=True,
        prefix_length=10)

    encoded = last_payload(
        self.log, self.labelling, payload_encoding, self.add_col)

    self.assertEqual(encoded.shape, (1, 17))
    has_nulls = encoded.isnull().values.any()
    self.assertFalse(has_nulls)
def test_prefix10_zero_padding_all_in_one(self):
    """Last-payload encoding, prefix 10, padded, ``ALL_IN_ONE``: expect 15 rows, 17 columns, no nulls."""
    payload_encoding = create_test_encoding(
        value_encoding=ValueEncodings.LAST_PAYLOAD.value,
        task_generation_type=TaskGenerationTypes.ALL_IN_ONE.value,
        add_elapsed_time=True,
        prefix_length=10,
        padding=True)

    encoded = last_payload(
        self.log, self.labelling, payload_encoding, self.add_col)

    self.assertEqual(encoded.shape, (15, 17))
    has_nulls = encoded.isnull().values.any()
    self.assertFalse(has_nulls)
def setUp(self):
    """Load the general example train and test logs and build the default encoding.

    The encoding is simple-index with elapsed time, ``ONLY_THIS`` task
    generation and prefix length 1.
    """
    train_source = create_test_log(
        log_name=general_example_train_filename,
        log_path=general_example_train_filepath)
    self.train_log = get_log(train_source)

    test_source = create_test_log(
        log_name=general_example_test_filename,
        log_path=general_example_test_filepath_xes)
    self.test_log = get_log(test_source)

    self.encoding = create_test_encoding(
        value_encoding=ValueEncodings.SIMPLE_INDEX.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=1)
def test_duration(self):
    """Duration is a trace attribute: with zero padding the prefix length has no effect.

    Calculates the job once with its initial encoding and once with a padded
    encoding of prefix length 22, and expects identical results.
    """
    job = self.get_job()
    job.labelling = create_test_labelling(
        label_type=LabelTypes.DURATION.value)
    job.save()
    baseline = calculate(job)[0]

    job.encoding = create_test_encoding(prefix_length=22, padding=True)
    job.save()
    with_long_prefix = calculate(job)[0]

    self.assertEqual(baseline, with_long_prefix)
def get_classification_job(predictive_model: str, prediction_method: str, metric: HyperOptLosses = HyperOptLosses.ACC.value):
    """Build a test job for the given predictive model and prediction method.

    The job uses a padded encoding with prefix length 8 and a hyperparameter
    optimizer whose performance metric is ``metric``.

    NOTE(review): ``metric`` is annotated as ``HyperOptLosses`` but its
    default is ``HyperOptLosses.ACC.value`` — confirm which type callers pass.
    """
    # Keyword order mirrors the order in which the fixtures were originally
    # created: encoding, predictive model, then optimizer.
    return create_test_job(
        encoding=create_test_encoding(prefix_length=8, padding=True),
        predictive_model=create_test_predictive_model(
            predictive_model=predictive_model,
            prediction_method=prediction_method),
        hyperparameter_optimizer=create_test_hyperparameter_optimizer(
            performance_metric=metric))
def setUp(self):
    """Load the train/test logs, additional columns and two complex encodings.

    ``self.encoding`` uses prefix length 2 without padding;
    ``self.encodingPadding`` uses prefix length 10 with padding.
    """
    train_source = create_test_log(
        log_name=general_example_train_filename,
        log_path=general_example_train_filepath)
    self.train_log = get_log(train_source)

    test_source = create_test_log(
        log_name=general_example_test_filename,
        log_path=general_example_test_filepath)
    self.test_log = get_log(test_source)

    # Additional columns are derived from the training log only.
    self.add_col = get_additional_columns(self.train_log)

    self.encoding = create_test_encoding(
        value_encoding=ValueEncodings.COMPLEX.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=2)
    self.encodingPadding = create_test_encoding(
        value_encoding=ValueEncodings.COMPLEX.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=10,
        padding=True)
def test_prefix10_all_in_one(self):
    """Complex encoding, prefix 10, ``ALL_IN_ONE``: expect 10 rows, 55 columns, no nulls."""
    complex_encoding = create_test_encoding(
        value_encoding=ValueEncodings.COMPLEX.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ALL_IN_ONE.value,
        prefix_length=10)

    encoded = complex(
        self.log, self.labelling, complex_encoding, self.add_col)

    self.assertEqual(encoded.shape, (10, 55))
    has_nulls = encoded.isnull().values.any()
    self.assertFalse(has_nulls)
def test_prefix5(self):
    """Complex encoding, prefix 5, ``ONLY_THIS``: expect 2 rows, 30 columns, no nulls."""
    complex_encoding = create_test_encoding(
        value_encoding=ValueEncodings.COMPLEX.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=5)

    encoded = complex(
        self.log, self.labelling, complex_encoding, self.add_col)

    self.assertEqual(encoded.shape, (2, 30))
    has_nulls = encoded.isnull().values.any()
    self.assertFalse(has_nulls)
def setUp(self):
    """Load the general example test log and derive the shared fixtures.

    Sets the log, its unique event names, a remaining-time labelling, the
    additional columns, and a last-payload encoding with elapsed time,
    ``ONLY_THIS`` task generation and prefix length 1.
    """
    test_source = create_test_log(
        log_name=general_example_test_filename,
        log_path=general_example_test_filepath)
    self.log = get_log(test_source)
    self.event_names = unique_events(self.log)
    self.labelling = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value)
    self.add_col = get_additional_columns(self.log)
    self.encoding = create_test_encoding(
        value_encoding=ValueEncodings.LAST_PAYLOAD.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=1)
def test_prefix2(self):
    """``simple_index`` with prefix length 2: check shape and rows for traces '5' and '4'."""
    # NOTE(review): the encoding is created with ValueEncodings.FREQUENCY
    # although it is passed to simple_index — confirm this is intended.
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.FREQUENCY.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=2)

    encoded = simple_index(self.log, self.labelling, encoding)

    self.assertEqual(encoded.shape, (2, 5))

    expected_rows = {
        '5': ['5', 'register request', 'examine casually',
              90840.0, 1485600.0],
        '4': ['4', 'register request', 'check ticket',
              75840.0, 445080.0],
    }
    for trace_id, expected in expected_rows.items():
        row = encoded[encoded.trace_id == trace_id].iloc[0]
        self.assertListEqual(expected, row.values.tolist())
def test_prefix10_padding(self):
    """Boolean encoding, prefix 10, padded: check the shape and the row of trace '4'."""
    boolean_encoding = create_test_encoding(
        value_encoding=ValueEncodings.BOOLEAN.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=10,
        padding=True)

    encoded = boolean(
        self.log, self.event_names, self.labelling, boolean_encoding)

    self.assertEqual(encoded.shape, (2, 10))

    expected_trace_4 = [
        '4', True, False, True, True, False, True, True, 520920.0, 0.0
    ]
    trace_4_row = encoded[encoded.trace_id == '4'].iloc[0]
    self.assertListEqual(expected_trace_4, trace_4_row.values.tolist())
def test_prefix10_padding_all_in_one(self):
    """Boolean encoding, prefix 10, padded, ``ALL_IN_ONE`` task generation.

    Checks the frame shape, the fifth row belonging to trace ``'4'`` and
    that the frame contains no null values.
    """
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.BOOLEAN.value,
        prefix_length=10,
        add_elapsed_time=True,
        padding=True,
        # Use the enum value like the other tests in this file, instead of
        # a bare ALL_IN_ONE name.
        task_generation_type=TaskGenerationTypes.ALL_IN_ONE.value)

    df = boolean(self.log, self.event_names, self.labelling, encoding)

    self.assertEqual(df.shape, (15, 10))
    # Index 4 selects the fifth row generated for trace '4'.
    row1 = df[df.trace_id == '4'].iloc[4]
    self.assertListEqual(
        ['4', True, False, True, True, False, True, True, 520920.0, 0.0],
        row1.values.tolist())
    self.assertFalse(df.isnull().values.any())
def setUp(self):
    """Build the raw string-valued frame, the expected encoded frame and a default encoding."""
    raw_columns = {
        'literal_feature': ['a', 'b', None],
        'numeric_feature': [.1, 1, -.99],
        'misc_feature': ['a', None, -.99],
    }
    # Every raw value, including None and the numbers, is stored as its
    # str() form.
    self.df = DataFrame({
        name: list(map(str, values))
        for name, values in raw_columns.items()
    })
    self.how_it_should_be = DataFrame({
        'literal_feature': [2, 3, 1],
        'numeric_feature': [2, 3, 0],
        'misc_feature': [3, 2, 0]
    })
    self.encoding = create_test_encoding()
def test_shape(self):
    """Last-payload encoding with prefix length 2: check the shape and exact column headers."""
    payload_encoding = create_test_encoding(
        value_encoding=ValueEncodings.LAST_PAYLOAD.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=2)

    encoded = last_payload(
        self.log, self.labelling, payload_encoding, self.add_col)

    self.assertEqual((2, 9), encoded.shape)

    expected_headers = [
        'trace_id', 'prefix_1', 'prefix_2', 'Activity_2', 'Costs_2',
        'Resource_2', 'org:resource_2', 'elapsed_time', 'label'
    ]
    self.assertListEqual(expected_headers, encoded.columns.values.tolist())