Ejemplo n.º 1
0
def get_encoded_logs(job: Job,
                     use_cache: bool = True) -> (DataFrame, DataFrame):
    """Return the encoded training and test logs for ``job``.

    Returns the training and test DataFrames encoded using the given job
    configuration, loading from cache if possible. With ``use_cache`` set, an
    existing ``LabelledLog`` entry for the job's split/encoding/labelling is
    returned directly; otherwise the raw logs come from the ``LoadedLog``
    cache or from ``prepare_logs``, are encoded with ``encode_label_logs`` and
    are stored back in both caches.

    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame

    """
    print('\tGetting Dataset')
    if use_cache:
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            # Fully encoded and labelled data is already cached.
            training_df, test_df = get_labelled_logs(job)

        else:
            if job.split.train_log is not None and \
                job.split.test_log is not None and \
                LoadedLog.objects.filter(train_log=job.split.train_log.path,
                                         test_log=job.split.test_log.path).exists():
                # Raw logs are cached, only the encoding has to be redone.
                training_log, test_log, additional_columns = get_loaded_logs(
                    job.split)

            else:
                training_log, test_log, additional_columns = prepare_logs(
                    job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    # Turn the single split into an explicit double split
                    # backed by the two log files created below.
                    job.split = duplicate_orm_row(job.split)
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(
                        int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(EventLog(training_log),
                                                     train_name + '.xes')
                    test_name = str(int(100 -
                                        (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(EventLog(test_log),
                                                    test_name + '.xes')
                    job.split.additional_columns = str(
                        train_name +
                        test_name)  # TODO: find better naming policy
                    # Save the split itself: job.save() alone only writes the
                    # job row and would leave the changes made to the split's
                    # own fields above unsaved.
                    job.split.save()
                    job.save()

                put_loaded_logs(job.split, training_log, test_log,
                                additional_columns)

            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        # Cache disabled: always read and encode from scratch, store nothing.
        training_log, test_log, additional_columns = prepare_logs(job.split)
        training_df, test_df = encode_label_logs(
            training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
Ejemplo n.º 2
0
    def test_label_remaining_time_with_elapsed_time_custom_threshold(self):
        """Simple-index encoding, prefix length 2, custom remaining-time threshold.

        Checks shape, column names and the rows of traces '5' and '4' of the
        first DataFrame returned by ``encode_label_logs``.
        """
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            add_elapsed_time=True,
            add_remaining_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=2)
        labelling = create_test_labelling(
            label_type=LabelTypes.REMAINING_TIME.value,
            threshold_type=ThresholdTypes.THRESHOLD_CUSTOM.value,
            threshold=40000)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding, labelling=labelling, predictive_model=model)

        encoded, _ = encode_label_logs(self.test_log, self.test_log, job)

        def row_of(trace_id):
            # First row of the given trace as a plain list.
            return encoded[encoded.trace_id == trace_id].iloc[0].values.tolist()

        self.assertEqual(encoded.shape, (2, 5))
        expected_columns = [
            'trace_id', 'prefix_1', 'prefix_2', 'elapsed_time', 'label'
        ]
        self.assertListEqual(encoded.columns.values.tolist(), expected_columns)
        self.assertListEqual(row_of('5'), ['5', 1, 2, 0, 0])
        self.assertListEqual(row_of('4'), ['4', 1, 1, 0, 0])
    def test_label_remaining_time_with_elapsed_time_custom_threshold(self):
        """Boolean encoding, prefix length 3, custom remaining-time threshold.

        Checks the shape and the rows of traces '5' and '4' of the second
        DataFrame returned by ``encode_label_logs``.
        """
        labelling = create_test_labelling(
            label_type=LabelTypes.REMAINING_TIME.value,
            threshold_type=ThresholdTypes.THRESHOLD_CUSTOM.value,
            threshold=40000)
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.BOOLEAN.value,
            prefix_length=3,
            add_elapsed_time=True,
            add_remaining_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding, labelling=labelling, predictive_model=model)

        _, encoded = encode_label_logs(self.test_log, self.test_log, job)

        def row_of(trace_id):
            # First row of the given trace as a plain list.
            return encoded[encoded.trace_id == trace_id].iloc[0].values.tolist()

        self.assertEqual(encoded.shape, (2, 10))
        expected_5 = [
            '5', True, True, True, False, False, False, False, 181200.0, False
        ]
        self.assertListEqual(row_of('5'), expected_5)
        expected_4 = [
            '4', True, False, True, False, False, False, True, 171660.0, False
        ]
        self.assertListEqual(row_of('4'), expected_4)
Ejemplo n.º 4
0
    def test_next_activity_zero_padding_elapsed_time(self):
        """Simple-index encoding, prefix length 10 with padding, next-activity label.

        Checks shape, presence of ``elapsed_time`` and the rows of traces
        '5' and '4' of the first DataFrame returned by ``encode_label_logs``.
        """
        labelling = create_test_labelling(
            label_type=LabelTypes.NEXT_ACTIVITY.value)
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            add_elapsed_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=10,
            padding=True)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding, labelling=labelling, predictive_model=model)

        encoded, _ = encode_label_logs(self.test_log, self.test_log, job)

        def row_of(trace_id):
            # First row of the given trace as a plain list.
            return encoded[encoded.trace_id == trace_id].iloc[0].values.tolist()

        self.assertEqual(encoded.shape, (2, 13))
        self.assertTrue('elapsed_time' in encoded.columns.values.tolist())
        # NOTE(review): the shape asserted above has 13 columns, but this
        # expected row holds only 12 values (the row for trace '4' below has
        # 13) -- confirm the intended expectation for trace '5'.
        expected_5 = ['5', 1, 3, 2, 2, 2, 0, 0, 0, 0, 1296240.0, 2]
        self.assertListEqual(row_of('5'), expected_5)
        expected_4 = [
            '4', 52903968, 32171502, 17803069, 1149821, 72523760, 0, 0, 0, 0,
            0, 520920.0, 0
        ]
        self.assertListEqual(row_of('4'), expected_4)
    def test_attribute_number(self):
        """Boolean encoding, prefix length 2, labelled by the 'AMOUNT' attribute.

        Checks the shape and the rows of traces '5' and '4' of the second
        DataFrame returned by ``encode_label_logs``.
        """
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.BOOLEAN.value,
            prefix_length=2,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value)
        labelling = create_test_labelling(
            label_type=LabelTypes.ATTRIBUTE_NUMBER.value,
            attribute_name='AMOUNT')
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding, labelling=labelling, predictive_model=model)

        _, encoded = encode_label_logs(self.test_log, self.test_log, job)

        def row_of(trace_id):
            # First row of the given trace as a plain list.
            return encoded[encoded.trace_id == trace_id].iloc[0].values.tolist()

        self.assertEqual(encoded.shape, (2, 9))
        expected_5 = ['5', True, True, False, False, False, False, False, False]
        self.assertListEqual(row_of('5'), expected_5)
        expected_4 = ['4', True, False, True, False, False, False, False, True]
        self.assertListEqual(row_of('4'), expected_4)
    def test_next_activity_zero_padding_elapsed_time(self):
        """Boolean encoding, prefix length 3, elapsed time, next-activity label.

        Checks shape, presence of ``elapsed_time`` and the rows of traces
        '5' and '4' of the second DataFrame returned by ``encode_label_logs``.
        """
        labelling = create_test_labelling(
            label_type=LabelTypes.NEXT_ACTIVITY.value)
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.BOOLEAN.value,
            add_elapsed_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=3)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding, labelling=labelling, predictive_model=model)

        _, encoded = encode_label_logs(self.test_log, self.test_log, job)

        def row_of(trace_id):
            # First row of the given trace as a plain list.
            return encoded[encoded.trace_id == trace_id].iloc[0].values.tolist()

        self.assertEqual(encoded.shape, (2, 10))
        self.assertTrue('elapsed_time' in encoded.columns.values.tolist())
        expected_5 = [
            '5', True, True, True, False, False, False, False, 181200.0,
            'decide'
        ]
        self.assertListEqual(row_of('5'), expected_5)
        expected_4 = [
            '4', True, False, True, False, False, False, True, 171660.0,
            'decide'
        ]
        self.assertListEqual(row_of('4'), expected_4)
 def test_shape_training(self):
     """Check the shapes of both DataFrames for the fixture encoding/labelling."""
     model = create_test_predictive_model(
         predictive_model=PredictiveModels.CLASSIFICATION.value)
     job = create_test_job(
         encoding=self.encoding,
         labelling=self.labelling,
         predictive_model=model)

     training_df, test_df = encode_label_logs(
         self.training_log, self.test_log, job)

     self.assert_shape(training_df, (4, 4))
     self.assert_shape(test_df, (2, 4))
    def test_no_label(self):
        """Check the test DataFrame shape when no label is requested."""
        labelling = create_test_labelling(label_type=LabelTypes.NO_LABEL.value)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=self.encoding, labelling=labelling, predictive_model=model)

        _, test_df = encode_label_logs(self.train_log, self.test_log, job)

        self.assertEqual(test_df.shape, (2, 9))
    def test_remaining_time(self):
        """Check the test DataFrame shape with the remaining-time label."""
        labelling = create_test_labelling(
            label_type=LabelTypes.REMAINING_TIME.value)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=self.encoding, labelling=labelling, predictive_model=model)

        _, test_df = encode_label_logs(self.train_log, self.test_log, job)

        self.assertEqual(test_df.shape, (2, 11))
    def test_row_test(self):
        """Check prefix, elapsed time and label of trace '4' in the test DataFrame."""
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=self.encoding,
            labelling=self.labelling,
            predictive_model=model)

        training_df, test_df = encode_label_logs(
            self.training_log, self.test_log, job)
        trace_4 = test_df[(test_df.trace_id == '4')].iloc[0]

        self.assertEqual(1, trace_4.prefix_1)
        self.assertEqual(0, trace_4.elapsed_time)
        self.assertEqual(0, trace_4.label)
    def test_next_activity(self):
        """Check the test DataFrame shape with the next-activity label."""
        labelling = create_test_labelling(
            label_type=LabelTypes.NEXT_ACTIVITY.value)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=self.encoding, labelling=labelling, predictive_model=model)
        # Additional columns are derived from the training log.
        extra_columns = get_additional_columns(self.train_log)

        _, test_df = encode_label_logs(
            self.train_log, self.test_log, job, extra_columns)

        self.assertEqual(test_df.shape, (2, 14))
    def test_attribute_number(self):
        """Check the DataFrame shape when labelling by the 'AMOUNT' attribute."""
        labelling = create_test_labelling(
            label_type=LabelTypes.ATTRIBUTE_NUMBER.value,
            attribute_name='AMOUNT')
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=self.encoding, labelling=labelling, predictive_model=model)
        # The test log is used both as training and as test input here.
        extra_columns = get_additional_columns(self.test_log)

        _, test_df = encode_label_logs(
            self.test_log, self.test_log, job, extra_columns)

        self.assertEqual(test_df.shape, (2, 15))
Ejemplo n.º 13
0
def replay_prediction_calculate(job: Job, log) -> dict:
    """Compute the predictions for a log coming from the replayer.

    The log is encoded with the job configuration (with an empty ``EventLog``
    as the second log) and passed to the PREDICT action of the job's
    predictive model.

    :param job: job configuration
    :param log: log model
    :return: runtime results
    """
    extra_columns = get_additional_columns(log)
    # test_size=0 keeps the whole log in the first returned part.
    replay_log, _ = train_test_split(log, test_size=0, shuffle=False)
    encoded_df, _ = encode_label_logs(replay_log, EventLog(), job, extra_columns)

    predict = MODEL[job.predictive_model.predictive_model][ModelActions.PREDICT.value]
    results = predict(job, encoded_df)

    message = "End {} job {}, {} . Results {}".format(
        'runtime', job.predictive_model.predictive_model, get_run(job), results)
    logger.info(message)
    return results
 def setUp(self):
     """Encode the general-example train/test logs with boolean encoding."""
     test_log_model = create_test_log(
         log_name=general_example_test_filename,
         log_path=general_example_test_filepath)
     test_log = get_log(test_log_model)
     training_log_model = create_test_log(
         log_name=general_example_train_filename,
         log_path=general_example_train_filepath)
     training_log = get_log(training_log_model)

     encoding = create_test_encoding(
         value_encoding=ValueEncodings.BOOLEAN.value,
         add_elapsed_time=True)
     model = create_test_predictive_model(
         predictive_model=PredictiveModels.CLASSIFICATION.value)
     job = create_test_job(encoding=encoding, predictive_model=model)

     self.training_df, self.test_df = encode_label_logs(
         training_log, test_log, job)
Ejemplo n.º 15
0
    def setUp(self):
        """Encode the general-example logs with frequency encoding, prefix length 1.

        Stores the encoding, the remaining-time labelling and both encoded
        DataFrames on the test instance.
        """
        test_log_model = create_test_log(
            log_name=general_example_test_filename,
            log_path=general_example_test_filepath_xes)
        test_log = get_log(test_log_model)
        training_log_model = create_test_log(
            log_name=general_example_train_filename,
            log_path=general_example_train_filepath)
        training_log = get_log(training_log_model)

        self.encoding = create_test_encoding(
            value_encoding=ValueEncodings.FREQUENCY.value,
            add_elapsed_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=1)
        self.labelling = create_test_labelling(
            label_type=LabelTypes.REMAINING_TIME.value)

        job = create_test_job(encoding=self.encoding, labelling=self.labelling)
        self.training_df, self.test_df = encode_label_logs(
            training_log, test_log, job)
    def test_add_new_traces(self):
        """Simple-index encoding with ``add_new_traces``: check the extra column."""
        labelling = create_test_labelling(label_type=LabelTypes.REMAINING_TIME.value)
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            add_elapsed_time=True,
            add_new_traces=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=1)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding, labelling=labelling, predictive_model=model)

        encoded, _ = encode_label_logs(self.test_log, self.test_log, job)

        self.assertEqual(encoded.shape, (2, 5))
        column_names = encoded.columns.values.tolist()
        self.assertTrue('new_traces' in column_names)
        self.assertListEqual(encoded['new_traces'].tolist(), [2, 2])
    def test_duration(self):
        """Simple-index encoding, prefix length 2, duration label.

        Checks shape, column names and the rows of traces '5' and '4' of the
        first DataFrame returned by ``encode_label_logs``.
        """
        labelling = create_test_labelling(label_type=LabelTypes.DURATION.value)
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=2)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding, labelling=labelling, predictive_model=model)

        encoded, _ = encode_label_logs(self.test_log, self.test_log, job)

        def row_of(trace_id):
            # First row of the given trace as a plain list.
            return encoded[encoded.trace_id == trace_id].iloc[0].values.tolist()

        self.assertEqual(encoded.shape, (2, 4))
        expected_columns = ['trace_id', 'prefix_1', 'prefix_2', 'label']
        self.assertListEqual(encoded.columns.values.tolist(), expected_columns)
        self.assertListEqual(row_of('5'), ['5', 1, 2, 0])
        self.assertListEqual(row_of('4'), ['4', 1, 1, 0])
Ejemplo n.º 18
0
    def test_no_label(self):
        """Simple-index encoding, prefix length 2, without any label.

        Checks the shape and the rows of traces '5' and '4' of the first
        DataFrame returned by ``encode_label_logs``.
        """
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            add_elapsed_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=2)
        labelling = create_test_labelling(label_type=LabelTypes.NO_LABEL.value)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding, labelling=labelling, predictive_model=model)

        encoded, _ = encode_label_logs(self.test_log, self.test_log, job)

        def row_of(trace_id):
            # First row of the given trace as a plain list.
            return encoded[encoded.trace_id == trace_id].iloc[0].values.tolist()

        self.assertEqual(encoded.shape, (2, 3))
        self.assertListEqual(row_of('5'), ['5', 1, 2])
        self.assertListEqual(row_of('4'), ['4', 1, 1])
    def test_add_resources_used(self):
        """Boolean encoding with ``add_resources_used``: check the extra column."""
        labelling = create_test_labelling(
            label_type=LabelTypes.REMAINING_TIME.value)
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.BOOLEAN.value,
            add_elapsed_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=2,
            add_resources_used=True)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding, labelling=labelling, predictive_model=model)

        _, test_df = encode_label_logs(self.train_log, self.test_log, job)

        self.assertEqual(test_df.shape, (2, 12))
        column_names = test_df.columns.values.tolist()
        self.assertTrue('resources_used' in column_names)
        self.assertListEqual(test_df['resources_used'].tolist(), [1, 1])
    def test_no_label_zero_padding(self):
        """Complex encoding, prefix length 10 with padding, without any label."""
        # add things have no effect
        labelling = create_test_labelling(label_type=LabelTypes.NO_LABEL.value)
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.COMPLEX.value,
            add_elapsed_time=True,
            add_remaining_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=10,
            padding=True)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding, labelling=labelling, predictive_model=model)
        extra_columns = get_additional_columns(self.train_log)

        _, test_df = encode_label_logs(
            self.train_log, self.test_log, job, extra_columns)

        self.assertEqual(test_df.shape, (2, 52))
    def test_add_new_traces(self):
        """Complex encoding with ``add_new_traces``: check the extra column."""
        labelling = create_test_labelling(
            label_type=LabelTypes.REMAINING_TIME.value)
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.COMPLEX.value,
            prefix_length=2,
            add_new_traces=True,
            add_elapsed_time=True)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding, labelling=labelling, predictive_model=model)
        extra_columns = get_additional_columns(self.train_log)

        _, test_df = encode_label_logs(
            self.train_log, self.test_log, job, extra_columns)

        self.assertEqual(test_df.shape, (2, 15))
        column_names = test_df.columns.values.tolist()
        self.assertTrue('new_traces' in column_names)
        self.assertListEqual(test_df['new_traces'].tolist(), [0, 0])
    def test_next_activity_zero_padding_elapsed_time(self):
        """Complex encoding, prefix length 10 with padding, next-activity label."""
        labelling = create_test_labelling(
            label_type=LabelTypes.NEXT_ACTIVITY.value)
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.COMPLEX.value,
            add_elapsed_time=True,
            add_remaining_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=10,
            padding=True)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding, labelling=labelling, predictive_model=model)
        extra_columns = get_additional_columns(self.train_log)

        _, test_df = encode_label_logs(
            self.train_log, self.test_log, job, extra_columns)

        self.assertEqual(test_df.shape, (2, 55))
        column_names = test_df.columns.values.tolist()
        self.assertTrue('elapsed_time' in column_names)
    def test_remaining_time_zero_padding(self):
        """Simple-index encoding, prefix length 10 with padding, remaining-time label.

        Checks the shape and the rows of traces '5' and '4' of the first
        DataFrame returned by ``encode_label_logs``.
        """
        labelling = create_test_labelling(label_type=LabelTypes.REMAINING_TIME.value)
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            add_elapsed_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=10,
            padding=True)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding, labelling=labelling, predictive_model=model)

        encoded, _ = encode_label_logs(self.test_log, self.test_log, job)

        def row_of(trace_id):
            # First row of the given trace as a plain list.
            return encoded[encoded.trace_id == trace_id].iloc[0].values.tolist()

        self.assertEqual(encoded.shape, (2, 13))
        expected_5 = ['5', 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
        self.assertListEqual(row_of('5'), expected_5)
        expected_4 = ['4', 1, 1, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0]
        self.assertListEqual(row_of('4'), expected_4)
    def test_label_remaining_time_with_elapsed_time_custom_threshold(self):
        """Complex encoding, prefix length 10 with padding, custom threshold."""
        labelling = create_test_labelling(
            label_type=LabelTypes.REMAINING_TIME.value,
            threshold_type=ThresholdTypes.THRESHOLD_CUSTOM.value,
            threshold=40000)
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.COMPLEX.value,
            add_elapsed_time=True,
            add_remaining_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=10,
            padding=True)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding, labelling=labelling, predictive_model=model)
        # Additional columns are taken from the test log in this case.
        extra_columns = get_additional_columns(self.test_log)

        _, test_df = encode_label_logs(
            self.train_log, self.test_log, job, extra_columns)

        self.assertEqual(test_df.shape, (2, 55))
    def test_prefix_length_training(self):
        """Simple-index encoding with prefix length 3: check columns, shapes and trace '3'."""
        encoding = create_test_encoding(
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            add_elapsed_time=True,
            task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
            prefix_length=3)
        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value)
        job = create_test_job(
            encoding=encoding,
            labelling=self.labelling,
            predictive_model=model)

        training_df, test_df = encode_label_logs(
            self.training_log, self.test_log, job)

        # One column per prefix position must be present.
        for column in ("prefix_1", "prefix_2", "prefix_3"):
            self.assertIn(column, training_df.columns.values)
        self.assertEqual((4, 6), training_df.shape)
        self.assertEqual((2, 6), test_df.shape)

        trace_3 = training_df[(training_df.trace_id == '3')].iloc[0]
        self.assertEqual(1, trace_3.prefix_1)
        self.assertEqual(2, trace_3.prefix_2)
        self.assertEqual(1, trace_3.prefix_3)
        self.assertEqual(False, trace_3.label)
        self.assertEqual(0, trace_3.elapsed_time)
Ejemplo n.º 26
0
def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """Return the encoded training and test logs for ``job``.

    The cache is consulted only when ``use_cache`` is set and the job has a
    predictive model that is not a time-series prediction. A cache entry whose
    file is missing (``FileNotFoundError``) is deleted and the lookup is
    retried.

    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame

    """
    logger.info('\tGetting Dataset')

    cache_enabled = use_cache and \
        job.predictive_model is not None and \
        job.predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value
    if not cache_enabled:
        # No caching: read and encode from scratch, store nothing.
        training_log, test_log, additional_columns = get_train_test_log(job.split)
        training_df, test_df = encode_label_logs(training_log, test_log, job, additional_columns=additional_columns)
        return training_df, test_df

    labelled_lookup = dict(split=job.split, encoding=job.encoding, labelling=job.labelling)
    if LabelledLog.objects.filter(**labelled_lookup).exists():
        try:
            training_df, test_df = get_labelled_logs(job)
        except FileNotFoundError:  # cache invalidation
            LabelledLog.objects.filter(**labelled_lookup).delete()
            logger.info('\t\tError pre-labeled cache invalidated!')
            return get_encoded_logs(job, use_cache)
        return training_df, test_df

    if job.split.train_log is not None and \
       job.split.test_log is not None and \
       LoadedLog.objects.filter(split=job.split).exists():
        try:
            training_log, test_log, additional_columns = get_loaded_logs(job.split)
        except FileNotFoundError:  # cache invalidation
            LoadedLog.objects.filter(split=job.split).delete()
            logger.info('\t\tError pre-loaded cache invalidated!')
            return get_encoded_logs(job, use_cache)
    else:
        training_log, test_log, additional_columns = get_train_test_log(job.split)
        if job.split.type == SplitTypes.SPLIT_SINGLE.value:
            # Replace the single split by a double split pointing at the two
            # log files created below.
            job.split = duplicate_orm_row(Split.objects.filter(pk=job.split.pk)[0])
            job.split.type = SplitTypes.SPLIT_DOUBLE.value
            boundary = str(int(100 - (job.split.test_size * 100)))
            train_name = '0-' + boundary
            test_name = boundary + '-100'
            job.split.train_log = create_log(EventLog(training_log), train_name + '.xes')
            job.split.test_log = create_log(EventLog(test_log), test_name + '.xes')
            job.split.additional_columns = str(train_name + test_name)  # TODO: find better naming policy
            job.split.save()

        put_loaded_logs(job.split, training_log, test_log, additional_columns)

    training_df, test_df = encode_label_logs(
        training_log, test_log, job, additional_columns=additional_columns)
    put_labelled_logs(job, training_df, test_df)
    return training_df, test_df
Ejemplo n.º 27
0
def progetto_padova():
    """Run the prediction and explanation pipeline end to end.

    Creates (or fetches) a prediction job over the logs at
    ``RELATIVE_TRAIN_PATH`` and ``RELATIVE_VALIDATION_PATH``, loads and
    encodes the logs, builds and tests the model, saves the models when the
    job asks for it, runs PREDICT and PREDICT_PROBA over the concatenation of
    both encoded sets and finally requests a LIME explanation for
    ``EXPLANATION_TARGET``. Nothing is returned.
    """
    split = Split.objects.get_or_create(  # this creates the split of the log
        type=SplitTypes.SPLIT_DOUBLE.value,
        train_log=create_log(  # this imports the log
            import_log(BASE_DIR + RELATIVE_TRAIN_PATH),
            RELATIVE_TRAIN_PATH,
            BASE_DIR,
            import_in_cache=False),
        test_log=create_log(  # this imports the log
            import_log(BASE_DIR + RELATIVE_VALIDATION_PATH),
            RELATIVE_VALIDATION_PATH,
            BASE_DIR,
            import_in_cache=False))[0]
    encoding = Encoding.objects.get_or_create(  # this defines the encoding method
        data_encoding=DataEncodings.LABEL_ENCODER.value,
        value_encoding=ValueEncodings.SIMPLE_INDEX.value,
        add_elapsed_time=False,
        add_remaining_time=False,
        add_executed_events=False,
        add_resources_used=False,
        add_new_traces=False,
        prefix_length=5,
        padding=True,
        task_generation_type=TaskGenerationTypes.ALL_IN_ONE.value,
        features=[])[0]
    labelling = Labelling.objects.get_or_create(  # this defines the label
        type=LabelTypes.ATTRIBUTE_STRING.value,
        attribute_name='label',
        threshold_type=None,
        threshold=None)[0]
    clustering = Clustering.init(ClusteringMethods.NO_CLUSTER.value,
                                 configuration={})
    predictive_model = PredictiveModel.init(  # this defines the predictive model
        get_prediction_method_config(
            PredictiveModels.CLASSIFICATION.value,
            ClassificationMethods.DECISION_TREE.value,
            payload={
                'max_depth': 2,
                'min_samples_split': 2,
                'min_samples_leaf': 2
            }))
    # this defines the hyperparameter optimisation procedure.
    # The metric is a HyperOptLosses member and the algorithm a
    # HyperOptAlgorithms member; the two values used to be swapped.
    hyperparameter_optimizer = HyperparameterOptimization.init({
        'type': HyperparameterOptimizationMethods.HYPEROPT.value,
        'max_evaluations': 10,
        'performance_metric': HyperOptLosses.AUC.value,
        'algorithm_type': HyperOptAlgorithms.TPE.value
    })

    JOB = Job.objects.get_or_create(
        status=JobStatuses.CREATED.value,
        type=JobTypes.PREDICTION.value,
        split=split,
        encoding=encoding,
        labelling=labelling,
        clustering=clustering,
        predictive_model=predictive_model,
        hyperparameter_optimizer=hyperparameter_optimizer,
        create_models=True)[0]

    # load log
    train_log, test_log, additional_columns = get_train_test_log(JOB.split)

    # encode
    train_df, test_df = encode_label_logs(train_log, test_log, JOB)

    # train + evaluate
    model_actions = MODEL[JOB.predictive_model.predictive_model]
    results, model_split = model_actions[ModelActions.BUILD_MODEL_AND_TEST.value](
        train_df, test_df, _init_clusterer(JOB.clustering, train_df), JOB)

    if JOB.create_models:
        save_models(model_split, JOB)

    # predict (each call overwrites ``results``; the values are not used
    # further in this function)
    data_df = pd.concat([train_df, test_df])
    results = MODEL[JOB.predictive_model.predictive_model][
        ModelActions.PREDICT.value](JOB, data_df)
    results = MODEL[JOB.predictive_model.predictive_model][
        ModelActions.PREDICT_PROBA.value](JOB, data_df)

    # lime
    exp = Explanation.objects.get_or_create(
        type=ExplanationTypes.LIME.value,
        # this defines the analysed log, you can use a different one from
        # the training one
        split=JOB.split,
        predictive_model=JOB.predictive_model,
        job=JOB)[0]
    error, result = explanation(exp.id, int(EXPLANATION_TARGET))
Ejemplo n.º 28
0
def replay_core(replay_job: Job, training_initial_job: Job) -> list:
    """Replay the training log as a sequence of growing snapshots.

    The function collects the timestamps of all events, then, for a sample
    of those timestamps, posts the log filtered up to that point to the
    ``replay_prediction`` endpoint, simulating the log in the time passing.
    Finally it encodes the full split to obtain the gold labels and stores
    them in a new job of type ``REPLAY_PREDICT`` whose parent is
    ``replay_job``.

    :param replay_job: job being replayed; its ``case_id`` and
        ``event_number`` are overwritten and saved for every snapshot
    :param training_initial_job: job whose id is sent as ``training_job``
    :return: List with, per request, the string form of the response or of
        the raised exception
    """

    split = replay_job.split
    log = get_log(split.train_log)
    requests_list = list()

    # Copy the log (attributes and traces) so that filtering below works on
    # a separate EventLog object.
    eventlog = EventLog()
    for key in log.attributes.keys():
        eventlog.attributes[key] = log.attributes[key]
    for trace in log:
        new_trace = Trace(trace)
        for key in trace.attributes:
            new_trace.attributes[key] = trace.attributes[key]
        eventlog.append(new_trace)

    times = sorted(
        {event['time:timestamp'] for trace in eventlog for event in trace})

    # Sample the timestamps from the third one on, in about five steps.
    # The step is clamped to 1: with fewer than 7 distinct timestamps the
    # integer division yields 0, and a slice step of 0 raises ValueError.
    step = max(1, (len(times) - 2) // 5)
    for t in times[2::step]:
        filtered_eventlog = timestamp_filter.apply_events(
            eventlog, times[0].replace(tzinfo=None), t.replace(tzinfo=None))
        trace_list = list()
        event_number = dict()
        for trace in filtered_eventlog:
            trace_list.append(trace.attributes['concept:name'])
            event_number[trace.attributes['concept:name']] = len(trace)
        replay_job.case_id = trace_list
        replay_job.event_number = event_number
        replay_job.save()
        try:  #TODO check logger usage
            logger.info("Sending request for replay_prediction task.")
            r = requests.post(
                url="http://server:8000/runtime/replay_prediction/",
                data=export_log_as_string(filtered_eventlog),
                params={
                    'jobId': replay_job.id,
                    'training_job': training_initial_job.id
                },
                headers={
                    'Content-Type': 'text/plain',
                    'charset': 'UTF-8'
                })
            requests_list.append(str(r))
        except Exception as e:
            # Best effort: a failed request is recorded and logged, the
            # replay continues with the next snapshot.
            requests_list.append(str(e))
            logger.warning(str(e))

    training_log, test_log, additional_columns = get_train_test_log(
        replay_job.split)
    training_df, _ = encode_label_logs(training_log,
                                       test_log,
                                       replay_job,
                                       additional_columns=additional_columns)

    # Map every trace id to its label.
    gold_values = dict(zip(training_df['trace_id'], training_df['label']))
    parent_id = replay_job.id
    # final_job = duplicate_orm_row(replay_job)  #todo: replace with simple CREATE
    final_job = Job.objects.create(
        created_date=replay_job.created_date,
        modified_date=replay_job.modified_date,
        error=replay_job.error,
        status=replay_job.status,
        type=replay_job.type,
        create_models=replay_job.create_models,
        case_id=replay_job.case_id,
        event_number=replay_job.event_number,
        gold_value=replay_job.gold_value,
        results=replay_job.results,
        parent_job=replay_job.parent_job,
        split=replay_job.split,
        encoding=replay_job.encoding,
        labelling=replay_job.labelling,
        clustering=replay_job.clustering,
        predictive_model=replay_job.predictive_model,
        evaluation=replay_job.evaluation,
        hyperparameter_optimizer=replay_job.hyperparameter_optimizer,
        incremental_train=replay_job.incremental_train)
    # The copy becomes a child of the replayed job and carries the gold labels.
    final_job.parent_job = Job.objects.filter(pk=parent_id)[0]
    final_job.gold_value = gold_values
    final_job.type = JobTypes.REPLAY_PREDICT.value
    final_job.save()
    return requests_list