Exemple #1
0
    def test_next_activity_zero_padding_elapsed_time(self):
        """Check simple-index encoding with padding to prefix length 10.

        Encodes the test log with elapsed time added and a next-activity
        label, then verifies the frame shape, the presence of the
        ``elapsed_time`` column and the rows of traces '5' and '4'.
        """
        lab = create_test_labelling(
            label_type=LabelTypes.NEXT_ACTIVITY.value)
        enc = create_test_encoding(
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            add_elapsed_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=10,
            padding=True)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=enc, labelling=lab, predictive_model=model)

        df, _ = encode_label_logs(self.test_log, self.test_log, job)

        self.assertEqual(df.shape, (2, 13))
        self.assertIn('elapsed_time', df.columns.values.tolist())

        expected_rows = {
            # NOTE(review): this row lists 12 values while the shape
            # assertion above expects 13 columns - confirm intended values.
            '5': ['5', 1, 3, 2, 2, 2, 0, 0, 0, 0, 1296240.0, 2],
            '4': [
                '4', 52903968, 32171502, 17803069, 1149821, 72523760,
                0, 0, 0, 0, 0, 520920.0, 0
            ],
        }
        for trace_id, expected in expected_rows.items():
            row = df[df.trace_id == trace_id].iloc[0].values.tolist()
            self.assertListEqual(row, expected)
Exemple #2
0
    def test_label_remaining_time_with_elapsed_time_custom_threshold(self):
        """Check simple-index encoding (prefix 2) with a remaining-time label.

        The labelling uses a custom threshold of 40000; the test verifies
        the frame shape, the exact column names and the rows of traces
        '5' and '4'.
        """
        enc = create_test_encoding(
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            add_elapsed_time=True,
            add_remaining_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=2)
        lab = create_test_labelling(
            label_type=LabelTypes.REMAINING_TIME.value,
            threshold_type=ThresholdTypes.THRESHOLD_CUSTOM.value,
            threshold=40000)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=enc, labelling=lab, predictive_model=model)

        df, _ = encode_label_logs(self.test_log, self.test_log, job)

        self.assertEqual(df.shape, (2, 5))
        expected_columns = [
            'trace_id', 'prefix_1', 'prefix_2', 'elapsed_time', 'label'
        ]
        self.assertListEqual(df.columns.values.tolist(), expected_columns)

        expected_rows = {
            '5': ['5', 1, 2, 0, 0],
            '4': ['4', 1, 1, 0, 0],
        }
        for trace_id, expected in expected_rows.items():
            row = df[df.trace_id == trace_id].iloc[0].values.tolist()
            self.assertListEqual(row, expected)
    def test_attribute_number(self):
        """Check boolean encoding (prefix 2) labelled by the 'AMOUNT' attribute.

        Uses the second frame returned by ``encode_label_logs`` and verifies
        its shape and the rows of traces '5' and '4'.
        """
        enc = create_test_encoding(
            value_encoding=ValueEncodings.BOOLEAN.value,
            prefix_length=2,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value)
        lab = create_test_labelling(
            label_type=LabelTypes.ATTRIBUTE_NUMBER.value,
            attribute_name='AMOUNT')
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=enc, labelling=lab, predictive_model=model)

        _, df = encode_label_logs(self.test_log, self.test_log, job)

        self.assertEqual(df.shape, (2, 9))
        expected_rows = {
            '5': ['5', True, True, False, False, False, False, False, False],
            '4': ['4', True, False, True, False, False, False, False, True],
        }
        for trace_id, expected in expected_rows.items():
            row = df[df.trace_id == trace_id].iloc[0].values.tolist()
            self.assertListEqual(row, expected)
Exemple #4
0
 def test_hyperopt(self):
     """Run a hyperopt-driven prediction task and check the model config changed.

     After ``prediction_task`` runs, the job is re-read from the database
     and its random-forest configuration is expected to differ from the
     defaults returned by ``classification_random_forest()``.
     """
     job = Job.objects.create(
         split=create_test_split(split_type=SplitTypes.SPLIT_SINGLE.value,
                                 original_log=create_test_log(
                                     log_name=general_example_filename,
                                     log_path=general_example_filepath)),
         encoding=create_test_encoding(
             value_encoding=ValueEncodings.SIMPLE_INDEX.value,
             prefix_length=3,
             padding=False),
         labelling=create_test_labelling(
             label_type=LabelTypes.REMAINING_TIME.value),
         clustering=create_test_clustering(
             clustering_type=ClusteringMethods.KMEANS.value),
         predictive_model=create_test_predictive_model(
             predictive_model=PredictiveModels.CLASSIFICATION.value,
             prediction_method=ClassificationMethods.RANDOM_FOREST.value),
         hyperparameter_optimizer=create_test_hyperparameter_optimizer(
             hyperoptim_type=HyperparameterOptimizationMethods.HYPEROPT.
             value,
             performance_metric=HyperOptLosses.ACC.value,
             max_evals=2))
     prediction_task(job.pk)

     # Re-fetch by the job's own primary key rather than a hard-coded 1, so
     # the test does not depend on this being the first Job row ever created.
     job = Job.objects.get(pk=job.pk)
     tuned_model = getattr(job.predictive_model.classification,
                           ClassificationMethods.RANDOM_FOREST.value.lower())
     self.assertNotEqual(classification_random_forest(),
                         tuned_model.to_dict())
 def test_ValueError_decode(self):
     """Decoding with ``data_encoding='None'`` must raise ``ValueError``.

     ``assertRaises`` makes the test fail when no exception is raised; a
     bare ``try/except ...: pass`` would pass in that case as well.
     """
     encoder = Encoder(df=self.df, encoding=self.encoding)
     with self.assertRaises(ValueError):
         encoder.decode(df=self.df,
                        encoding=create_test_encoding(data_encoding='None'))
    def test_rnn_time_series_predictor_complex_no_exceptions(self):
        """Smoke test: fit and predict an RNN time-series predictor on complex encoding.

        The test only requires that ``fit`` and ``predict`` complete; it
        makes no assertion on the predictions themselves.
        """
        enc = create_test_encoding(
            value_encoding=ValueEncodings.COMPLEX.value,
            prefix_length=5,
            padding=True)
        lab = create_test_labelling(
            label_type=LabelTypes.DURATION.value,
            threshold_type=ThresholdTypes.THRESHOLD_MEAN.value)

        encoded_train = complex(self.train_log, lab, enc, self.train_add_col)
        encoded_test = complex(self.test_log, lab, enc, self.test_add_col)

        # Only the feature frames are needed; the target halves are discarded.
        train_features, _ = self._drop_columns_and_split(encoded_train)
        test_features, _ = self._drop_columns_and_split(encoded_test)

        predictor = RNNTimeSeriesPredictor(
            **self._get_rnn_default_config(
                encoding=ValueEncodings.COMPLEX.value))

        predictor.fit(train_features)
        predictor.predict(test_features)
 def test_NotImplementedException_init_encoder(self):
     """Building an ``Encoder`` with one-hot data encoding must raise ``NotImplementedError``.

     ``assertRaises`` makes the test fail when no exception is raised; a
     bare ``try/except ...: pass`` would pass in that case as well.
     """
     with self.assertRaises(NotImplementedError):
         Encoder(df=self.df,
                 encoding=create_test_encoding(
                     data_encoding=DataEncodings.ONE_HOT_ENCODER.value))
    def test_nn_regressor_complex_no_exceptions(self):
        """Smoke test: fit and predict an NN regressor on complex encoding.

        The test only requires that ``fit`` and ``predict`` complete; it
        makes no assertion on the predictions themselves.
        """
        enc = create_test_encoding(
            value_encoding=ValueEncodings.COMPLEX.value,
            prefix_length=2,
            padding=True)
        lab = create_test_labelling(
            label_type=LabelTypes.REMAINING_TIME.value)

        encoded_train = complex(self.train_log, lab, enc, self.train_add_col)
        encoded_test = complex(self.test_log, lab, enc, self.test_add_col)

        train_features, train_targets = self._drop_columns_and_split(
            encoded_train)
        # Flatten the targets frame into a 1-D array before fitting.
        train_targets = train_targets.values.ravel()

        test_features, _ = self._drop_columns_and_split(encoded_test)

        regressor = NNRegressor(
            **self._get_nn_default_config(
                encoding=ValueEncodings.COMPLEX.value))

        regressor.fit(train_features, train_targets)
        regressor.predict(test_features)
    def test_next_activity_zero_padding_elapsed_time(self):
        """Check boolean encoding (prefix 3) with elapsed time and next-activity label.

        Uses the second frame returned by ``encode_label_logs`` and verifies
        its shape, the ``elapsed_time`` column and the rows of traces
        '5' and '4'.
        """
        lab = create_test_labelling(
            label_type=LabelTypes.NEXT_ACTIVITY.value)
        enc = create_test_encoding(
            value_encoding=ValueEncodings.BOOLEAN.value,
            add_elapsed_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=3)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=enc, labelling=lab, predictive_model=model)

        _, df = encode_label_logs(self.test_log, self.test_log, job)

        self.assertEqual(df.shape, (2, 10))
        self.assertIn('elapsed_time', df.columns.values.tolist())

        expected_rows = {
            '5': [
                '5', True, True, True, False, False, False, False, 181200.0,
                'decide'
            ],
            '4': [
                '4', True, False, True, False, False, False, True, 171660.0,
                'decide'
            ],
        }
        for trace_id, expected in expected_rows.items():
            row = df[df.trace_id == trace_id].iloc[0].values.tolist()
            self.assertListEqual(row, expected)
    def test_label_remaining_time_with_elapsed_time_custom_threshold(self):
        """Check boolean encoding (prefix 3) with a remaining-time label.

        The labelling uses a custom threshold of 40000. The second frame
        returned by ``encode_label_logs`` is checked for its shape and for
        the rows of traces '5' and '4'.
        """
        lab = create_test_labelling(
            label_type=LabelTypes.REMAINING_TIME.value,
            threshold_type=ThresholdTypes.THRESHOLD_CUSTOM.value,
            threshold=40000)
        enc = create_test_encoding(
            value_encoding=ValueEncodings.BOOLEAN.value,
            prefix_length=3,
            add_elapsed_time=True,
            add_remaining_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=enc, labelling=lab, predictive_model=model)

        _, df = encode_label_logs(self.test_log, self.test_log, job)

        self.assertEqual(df.shape, (2, 10))
        expected_rows = {
            '5': [
                '5', True, True, True, False, False, False, False, 181200.0,
                False
            ],
            '4': [
                '4', True, False, True, False, False, False, True, 171660.0,
                False
            ],
        }
        for trace_id, expected in expected_rows.items():
            row = df[df.trace_id == trace_id].iloc[0].values.tolist()
            self.assertListEqual(row, expected)
 def test_next_activity_kmeans(self):
     """Check random-forest metrics for next-activity labelling with k-means clustering.

     The ``elapsed_time`` entry is removed from the result before the
     comparison, and the remaining metrics are compared exactly.
     """
     # unittest reads ``maxDiff`` (camelCase); ``max_diff`` has no effect.
     self.maxDiff = None
     job = create_test_job(
         clustering=create_test_clustering(
             clustering_type=ClusteringMethods.KMEANS.value),
         split=repair_example(),
         encoding=create_test_encoding(prefix_length=8, padding=True),
         labelling=create_test_labelling(
             label_type=LabelTypes.NEXT_ACTIVITY.value),
         predictive_model=create_test_predictive_model(
             predictive_model=PredictiveModels.CLASSIFICATION.value,
             prediction_method=ClassificationMethods.RANDOM_FOREST.value))
     result, _ = calculate(job)
     del result['elapsed_time']
     self.assertDictEqual(
         result, {
             'f1score': 0.54239884582595577,
             'acc': 0.80995475113122173,
             'true_positive': '--',
             'true_negative': '--',
             'false_negative': '--',
             'false_positive': '--',
             'precision': 0.62344720496894401,
             'recall': 0.5224945442336747,
             'auc': 0.4730604801339352
         })
 def test_class_no_cluster(self):
     """Check random-forest metrics without clustering on the repair example.

     The ``elapsed_time`` entry is removed from the result before the
     comparison, and the remaining metrics are compared exactly.
     """
     # unittest reads ``maxDiff`` (camelCase); ``max_diff`` has no effect.
     self.maxDiff = None
     job = create_test_job(
         clustering=create_test_clustering(
             clustering_type=ClusteringMethods.NO_CLUSTER.value),
         split=repair_example(),
         encoding=create_test_encoding(prefix_length=5,
                                       padding=True,
                                       add_elapsed_time=True),
         predictive_model=create_test_predictive_model(
             predictive_model=PredictiveModels.CLASSIFICATION.value,
             prediction_method=ClassificationMethods.RANDOM_FOREST.value))
     result, _ = calculate(job)
     del result['elapsed_time']
     self.assertDictEqual(
         result, {
             'f1score': 1.0,
             'acc': 1.0,
             'true_positive': '--',
             'true_negative': '--',
             'false_negative': '--',
             'false_positive': '--',
             'precision': 1.0,
             'recall': 1.0,
             'auc': 0.0
         })
    def test_no_exceptions(self):
        """Smoke test: run ``calculate`` for every encoding/padding/method/label combination.

        Combinations using the 'nn' method without padding, or with the
        ATTRIBUTE_STRING label, are skipped. No assertion is made on the
        results; the test only requires that ``calculate`` completes.
        """
        choices = [
            [enum.value for enum in ValueEncodings],
            [True, False],
            [enum.value for enum in ClassificationMethods],
            [enum.value for enum in LabelTypes],
        ]

        for encoding, padding, method, label in itertools.product(*choices):
            print(encoding, padding, method, label)

            # Skip these combinations (presumably unsupported by 'nn' -
            # TODO confirm). ``continue`` is required here: with ``pass`` the
            # guard does nothing and the job below is built and run anyway.
            if method == 'nn' and (
                    not padding
                    or label == LabelTypes.ATTRIBUTE_STRING.value):
                continue

            job = create_test_job(
                predictive_model=create_test_predictive_model(
                    prediction_method=method),
                encoding=create_test_encoding(value_encoding=encoding,
                                              padding=padding),
                labelling=create_test_labelling(label_type=label))
            calculate(job)
Exemple #14
0
    def test_explain(self):
        """Train a decision tree on the explainability logs and request SHAP explanations.

        Runs the prediction task, creates (or fetches) a SHAP ``Explanation``
        for the job, then checks that both ``explain`` and
        ``shap_temporal_stability`` return dictionaries.
        """
        split = create_test_split(
            split_type=SplitTypes.SPLIT_DOUBLE.value,
            split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
            test_size=0.2,
            original_log=None,
            train_log=create_test_log(
                log_name='train_explainability.xes',
                log_path='cache/log_cache/test_logs/train_explainability.xes'
            ),
            test_log=create_test_log(
                log_name='test_explainability.xes',
                log_path='cache/log_cache/test_logs/test_explainability.xes'
            )
        )

        predictive_model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.DECISION_TREE.value
        )

        job = create_test_job(
            split=split,
            encoding=create_test_encoding(
                prefix_length=4,
                padding=True,
                value_encoding=ValueEncodings.SIMPLE_INDEX.value
            ),
            labelling=create_test_labelling(label_type=LabelTypes.ATTRIBUTE_STRING.value, attribute_name='label'),
            clustering=None,
            create_models=True,
            predictive_model=predictive_model,
            job_type=JobTypes.PREDICTION.value,
            hyperparameter_optimizer=None,
            incremental_train=None
        )

        prediction_task(job.id, do_publish_result=False)
        job.refresh_from_db()

        exp, _ = Explanation.objects.get_or_create(
            type=ExplanationTypes.SHAP.value,
            split=split,
            predictive_model=predictive_model,
            job=job,
            results={}
        )

        explanation_target = '2_101'
        prefix_target = 'prefix_1'

        training_df, test_df = get_encoded_logs(job)
        explanation = explain(exp, training_df, test_df, explanation_target, prefix_target)

        # The encoded logs are fetched again before the second call, exactly
        # as the first call received them (presumably because the frames may
        # be modified by ``explain`` - TODO confirm).
        training_df, test_df = get_encoded_logs(job)
        explanation_temp = shap_temporal_stability(exp, training_df, test_df, explanation_target)

        # assertIsInstance reports the actual type when the check fails.
        self.assertIsInstance(explanation, dict)
        self.assertIsInstance(explanation_temp, dict)
    def test_prefix1_no_elapsed_time(self):
        """Frequency encoding at prefix length 1 must not add an ``elapsed_time`` column."""
        enc = create_test_encoding(
            value_encoding=ValueEncodings.FREQUENCY.value,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=1)

        encoded = frequency(self.log, self.event_names, self.labelling, enc)

        self.assertEqual(encoded.shape, (2, 9))
        column_names = encoded.columns.values.tolist()
        self.assertNotIn('elapsed_time', column_names)
 def setUp(self):
     """Load the general example test log and build the default labelling and encoding."""
     log_record = create_test_log(log_name=general_example_test_filename,
                                  log_path=general_example_test_filepath)
     self.log = get_log(log_record)
     self.labelling = create_test_labelling(
         label_type=LabelTypes.REMAINING_TIME.value)
     # Simple-index encoding of a single-event prefix with elapsed time.
     self.encoding = create_test_encoding(
         value_encoding=ValueEncodings.SIMPLE_INDEX.value,
         task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
         add_elapsed_time=True,
         prefix_length=1)
Exemple #17
0
    def test_prefix10(self):
        """Last-payload encoding at prefix length 10 yields one row with no nulls."""
        enc = create_test_encoding(
            value_encoding=ValueEncodings.LAST_PAYLOAD.value,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            add_elapsed_time=True,
            prefix_length=10)

        encoded = last_payload(self.log, self.labelling, enc, self.add_col)

        self.assertEqual(encoded.shape, (1, 17))
        has_nulls = encoded.isnull().values.any()
        self.assertFalse(has_nulls)
Exemple #18
0
    def test_prefix10_zero_padding_all_in_one(self):
        """Last-payload encoding, padded, all-in-one up to prefix 10: 15 rows and no nulls."""
        enc = create_test_encoding(
            value_encoding=ValueEncodings.LAST_PAYLOAD.value,
            task_generation_type=TaskGenerationTypes.ALL_IN_ONE.value,
            add_elapsed_time=True,
            prefix_length=10,
            padding=True)

        encoded = last_payload(self.log, self.labelling, enc, self.add_col)

        self.assertEqual(encoded.shape, (15, 17))
        has_nulls = encoded.isnull().values.any()
        self.assertFalse(has_nulls)
 def setUp(self):
     """Load the general example train and test logs and build the default encoding."""
     train_record = create_test_log(
         log_name=general_example_train_filename,
         log_path=general_example_train_filepath)
     self.train_log = get_log(train_record)

     test_record = create_test_log(
         log_name=general_example_test_filename,
         log_path=general_example_test_filepath_xes)
     self.test_log = get_log(test_record)

     # Simple-index encoding of a single-event prefix with elapsed time.
     self.encoding = create_test_encoding(
         value_encoding=ValueEncodings.SIMPLE_INDEX.value,
         add_elapsed_time=True,
         task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
         prefix_length=1)
 def test_duration(self):
     """Duration label: results must not change when the encoding changes.

     The job is evaluated once as returned by ``get_job`` with a duration
     labelling, then again after its encoding is replaced by one with
     prefix length 22 and padding; both results must be equal.
     """
     job = self.get_job()
     job.labelling = create_test_labelling(
         label_type=LabelTypes.DURATION.value)
     job.save()
     baseline, _ = calculate(job)

     job.encoding = create_test_encoding(prefix_length=22, padding=True)
     job.save()
     padded, _ = calculate(job)

     self.assertEqual(baseline, padded)
Exemple #21
0
    def get_classification_job(predictive_model: str, prediction_method: str, metric: HyperOptLosses = HyperOptLosses.ACC.value):
        """Create a test job for the given model type and prediction method.

        The job uses an encoding with prefix length 8 and padding, and a
        hyperparameter optimizer whose performance metric is ``metric``.

        NOTE(review): ``metric`` is annotated as ``HyperOptLosses`` but its
        default is the enum member's ``.value`` - confirm which is expected.
        """
        job_encoding = create_test_encoding(prefix_length=8, padding=True)
        model = create_test_predictive_model(
            predictive_model=predictive_model,
            prediction_method=prediction_method)
        optimizer = create_test_hyperparameter_optimizer(
            performance_metric=metric)

        return create_test_job(predictive_model=model,
                               encoding=job_encoding,
                               hyperparameter_optimizer=optimizer)
 def setUp(self):
     """Load train/test logs and prepare two complex encodings (plain and padded)."""
     train_record = create_test_log(
         log_name=general_example_train_filename,
         log_path=general_example_train_filepath)
     self.train_log = get_log(train_record)

     test_record = create_test_log(
         log_name=general_example_test_filename,
         log_path=general_example_test_filepath)
     self.test_log = get_log(test_record)

     # Additional columns are derived from the training log.
     self.add_col = get_additional_columns(self.train_log)

     # Complex encoding, prefix length 2, no padding.
     self.encoding = create_test_encoding(
         value_encoding=ValueEncodings.COMPLEX.value,
         add_elapsed_time=True,
         task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
         prefix_length=2)
     # Complex encoding, prefix length 10, with padding.
     self.encodingPadding = create_test_encoding(
         value_encoding=ValueEncodings.COMPLEX.value,
         add_elapsed_time=True,
         task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
         prefix_length=10,
         padding=True)
    def test_prefix10_all_in_one(self):
        """Complex encoding, all-in-one up to prefix 10: 10 rows, 55 columns, no nulls."""
        enc = create_test_encoding(
            value_encoding=ValueEncodings.COMPLEX.value,
            add_elapsed_time=True,
            task_generation_type=TaskGenerationTypes.ALL_IN_ONE.value,
            prefix_length=10)

        encoded = complex(self.log, self.labelling, enc, self.add_col)

        self.assertEqual(encoded.shape, (10, 55))
        has_nulls = encoded.isnull().values.any()
        self.assertFalse(has_nulls)
    def test_prefix5(self):
        """Complex encoding at prefix length 5: 2 rows, 30 columns, no nulls."""
        enc = create_test_encoding(
            value_encoding=ValueEncodings.COMPLEX.value,
            add_elapsed_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=5)

        encoded = complex(self.log, self.labelling, enc, self.add_col)

        self.assertEqual(encoded.shape, (2, 30))
        has_nulls = encoded.isnull().values.any()
        self.assertFalse(has_nulls)
Exemple #25
0
 def setUp(self):
     """Load the general example test log and derive the shared fixtures from it."""
     log_record = create_test_log(log_name=general_example_test_filename,
                                  log_path=general_example_test_filepath)
     self.log = get_log(log_record)
     # Event names and additional columns both come from the loaded log.
     self.event_names = unique_events(self.log)
     self.labelling = create_test_labelling(
         label_type=LabelTypes.REMAINING_TIME.value)
     self.add_col = get_additional_columns(self.log)
     # Last-payload encoding of a single-event prefix with elapsed time.
     self.encoding = create_test_encoding(
         value_encoding=ValueEncodings.LAST_PAYLOAD.value,
         add_elapsed_time=True,
         task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
         prefix_length=1)
    def test_prefix2(self):
        """Check the rows produced by ``simple_index`` at prefix length 2.

        NOTE(review): the encoding passed to ``simple_index`` is built with
        ``ValueEncodings.FREQUENCY`` - confirm this is intentional.
        """
        enc = create_test_encoding(
            value_encoding=ValueEncodings.FREQUENCY.value,
            add_elapsed_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=2)
        df = simple_index(self.log, self.labelling, enc)

        self.assertEqual(df.shape, (2, 5))
        expected_rows = {
            '5': ['5', 'register request', 'examine casually', 90840.0, 1485600.0],
            '4': ['4', 'register request', 'check ticket', 75840.0, 445080.0],
        }
        for trace_id, expected in expected_rows.items():
            row = df[df.trace_id == trace_id].iloc[0]
            self.assertListEqual(expected, row.values.tolist())
Exemple #27
0
    def test_prefix10_padding(self):
        """Boolean encoding, padded to prefix length 10: check shape and the row of trace '4'."""
        enc = create_test_encoding(
            value_encoding=ValueEncodings.BOOLEAN.value,
            add_elapsed_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=10,
            padding=True)

        encoded = boolean(self.log, self.event_names, self.labelling, enc)

        self.assertEqual(encoded.shape, (2, 10))
        expected_trace_4 = ['4', True, False, True, True, False, True, True, 520920.0, 0.0]
        trace_4 = encoded[encoded.trace_id == '4'].iloc[0]
        self.assertListEqual(expected_trace_4, trace_4.values.tolist())
Exemple #28
0
    def test_prefix10_padding_all_in_one(self):
        """Boolean encoding, padded, all-in-one up to prefix 10.

        Verifies the frame shape, the fifth row belonging to trace '4'
        and that the frame contains no null values.
        """
        enc = create_test_encoding(
            value_encoding=ValueEncodings.BOOLEAN.value,
            prefix_length=10,
            add_elapsed_time=True,
            padding=True,
            task_generation_type=ALL_IN_ONE)

        encoded = boolean(self.log, self.event_names, self.labelling, enc)

        self.assertEqual(encoded.shape, (15, 10))
        expected_trace_4 = ['4', True, False, True, True, False, True, True, 520920.0, 0.0]
        # Index 4 selects the fifth row among those of trace '4'.
        trace_4 = encoded[encoded.trace_id == '4'].iloc[4]
        self.assertListEqual(expected_trace_4, trace_4.values.tolist())
        has_nulls = encoded.isnull().values.any()
        self.assertFalse(has_nulls)
 def setUp(self):
     """Build a small string-valued input frame, its expected encoded form and a default encoding."""
     # Every raw value is stringified, so None becomes the string 'None'.
     raw_columns = {
         'literal_feature': ['a', 'b', None],
         'numeric_feature': [.1, 1, -.99],
         'misc_feature': ['a', None, -.99],
     }
     self.df = DataFrame({
         column: list(map(str, values))
         for column, values in raw_columns.items()
     })
     # Integer codes the test class treats as the expected result for self.df.
     self.how_it_should_be = DataFrame({
         'literal_feature': [2, 3, 1],
         'numeric_feature': [2, 3, 0],
         'misc_feature': [3, 2, 0]
     })
     self.encoding = create_test_encoding()
Exemple #30
0
    def test_shape(self):
        """Last-payload encoding at prefix length 2: check shape and exact column names."""
        enc = create_test_encoding(
            value_encoding=ValueEncodings.LAST_PAYLOAD.value,
            add_elapsed_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=2)

        encoded = last_payload(self.log, self.labelling, enc, self.add_col)

        self.assertEqual((2, 9), encoded.shape)
        expected_columns = [
            'trace_id', 'prefix_1', 'prefix_2',
            'Activity_2', 'Costs_2', 'Resource_2', 'org:resource_2',
            'elapsed_time', 'label',
        ]
        self.assertListEqual(expected_columns, encoded.columns.values.tolist())