Ejemplo n.º 1
0
 def test_split_avoid_duplication(self):
     """Two jobs created on the same split must reuse it, not duplicate it."""
     shared_split = create_test_split(
         split_type=SplitTypes.SPLIT_SINGLE.value,
         split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
         test_size=0.2,
         original_log=self.log)

     observed_split_ids = []
     for _ in range(2):
         job = create_test_job(
             split=shared_split,
             encoding=self.encoding,
             labelling=self.labelling,
             clustering=None,
             create_models=False,
             predictive_model=self.predictive_model,
             job_type=JobTypes.PREDICTION.value,
             hyperparameter_optimizer=None,
             incremental_train=None)
         # Encoding the logs is what may trigger split duplication.
         get_encoded_logs(job)
         observed_split_ids.append(job.split.id)

     self.assertEqual(observed_split_ids[0], observed_split_ids[1])
Ejemplo n.º 2
0
    def test_get_encoded_logs_Loaded_cache(self):
        """Encoded logs must be rebuilt identically after a cached pickle is removed."""
        job = create_test_job()

        baseline = get_encoded_logs(job, True)

        loaded_log = LoadedLog.objects.filter(split=job.split)[0]

        # Delete the train-side pickle first, then the test-side one, and
        # verify the frames are regenerated identically each time.
        for cached_path in (loaded_log.train_log_path,
                            loaded_log.test_log_path):
            os.remove('cache/loaded_log_cache/' + get_digested(cached_path) +
                      '.pickle')

            rebuilt = get_encoded_logs(job, True)

            assert_frame_equal(baseline[0], rebuilt[0])
            assert_frame_equal(baseline[1], rebuilt[1])
Ejemplo n.º 3
0
    def test_explain(self):
        """SHAP explanation and its temporal stability both return dicts."""
        double_split = create_test_split(
            split_type=SplitTypes.SPLIT_DOUBLE.value,
            split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
            test_size=0.2,
            original_log=None,
            train_log=create_test_log(
                log_name='train_explainability.xes',
                log_path='cache/log_cache/test_logs/train_explainability.xes'),
            test_log=create_test_log(
                log_name='test_explainability.xes',
                log_path='cache/log_cache/test_logs/test_explainability.xes'))

        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.DECISION_TREE.value)

        job = create_test_job(
            split=double_split,
            encoding=create_test_encoding(
                prefix_length=4,
                padding=True,
                value_encoding=ValueEncodings.SIMPLE_INDEX.value),
            labelling=create_test_labelling(
                label_type=LabelTypes.ATTRIBUTE_STRING.value,
                attribute_name='label'),
            clustering=None,
            create_models=True,
            predictive_model=model,
            job_type=JobTypes.PREDICTION.value,
            hyperparameter_optimizer=None,
            incremental_train=None)

        # Train the model so there is something to explain.
        prediction_task(job.id, do_publish_result=False)
        job.refresh_from_db()

        shap_exp, _ = Explanation.objects.get_or_create(
            type=ExplanationTypes.SHAP.value,
            split=double_split,
            predictive_model=model,
            job=job,
            results={})

        target_trace = '2_101'
        target_prefix = 'prefix_1'

        train_df, test_df = get_encoded_logs(job)
        shap_result = explain(shap_exp, train_df, test_df, target_trace,
                              target_prefix)

        train_df, test_df = get_encoded_logs(job)
        stability_result = shap_temporal_stability(shap_exp, train_df,
                                                   test_df, target_trace)

        self.assertTrue(type(shap_result) is dict)
        self.assertTrue(type(stability_result) is dict)
Ejemplo n.º 4
0
    def test_get_encoded_logs_cache(self):
        """Cache-backed and freshly-computed encoded logs must match exactly."""
        job = create_test_job()

        cached = get_encoded_logs(job, True)
        fresh = get_encoded_logs(job, False)

        for idx in (0, 1):
            assert_frame_equal(cached[idx], fresh[idx])

        # A second cached read must also reproduce the same frames.
        reloaded = get_encoded_logs(job, True)

        for idx in (0, 1):
            assert_frame_equal(cached[idx], reloaded[idx])
Ejemplo n.º 5
0
    def test_get_labelled_logs(self):
        """get_labelled_logs must return the same frames as get_encoded_logs."""
        job = create_test_job()

        encoded = get_encoded_logs(job)
        labelled = get_labelled_logs(job)

        assert_frame_equal(encoded[0], labelled[0])
        assert_frame_equal(encoded[1], labelled[1])
Ejemplo n.º 6
0
def get_decoded_df(request, pk):
    """Return the first 100 decoded training traces of a job.

    :param request: incoming HTTP request (unused beyond routing)
    :param pk: primary key of the Job whose training log is decoded
    :return: Response wrapping the decoded training DataFrame, status 200
    """
    job = Job.objects.filter(pk=pk)[0]
    training_df, _ = get_encoded_logs(job)
    # Cap the payload at the first 100 rows before decoding.
    training_df = training_df[:100]
    # `columns=` keyword: the positional `axis` argument to drop() was
    # deprecated in pandas 0.25 and removed in pandas 2.0.
    training_df = training_df.drop(columns=['trace_id'])
    encoder = retrieve_proper_encoder(job)
    # decode() mutates the frame in place back to human-readable values.
    encoder.decode(training_df, job.encoding)
    return Response(training_df, status=200)
Ejemplo n.º 7
0
def calculate(job: Job) -> (dict, dict):  # TODO: add filter for 'valid' configurations
    """Main entry point for calculations.

    Encodes the logs according to the job configuration and runs the
    selected task.

    :param job: job configuration
    :return: results and predictive_model split
    """
    logger.info("Start job {} with {}".format(job.type, get_run(job)))
    train_df, test_df = get_encoded_logs(job)
    return run_by_type(train_df, test_df, job)
Ejemplo n.º 8
0
def runtime_calculate(job: Job) -> dict:
    """Calculate the prediction for traces in the uncompleted logs.

    :param job: job configuration
    :return: runtime results
    """
    train_df, test_df = get_encoded_logs(job)
    # Predict over the union of both partitions.
    full_df = pd.concat([train_df, test_df])
    predict = MODEL[job.predictive_model.predictive_model][ModelActions.PREDICT.value]
    results = predict(job, full_df)
    logger.info("End {} job {}, {} . Results {}".format(
        'runtime', job.predictive_model.predictive_model, get_run(job),
        results))
    return results
Ejemplo n.º 9
0
def explanation_temporal_stability(exp_id: int,
                                   explanation_target: str = None):
    """Run the temporal-stability explainer registered for an explanation."""
    exp = Explanation.objects.filter(pk=exp_id)[0]
    # Encode the logs of the job this explanation belongs to.
    train_df, test_df = get_encoded_logs(exp.job)

    stability = EXPLANATION[exp.type][TEMPORAL_STABILITY](exp, train_df,
                                                          test_df,
                                                          explanation_target)

    return 'False', stability
Ejemplo n.º 10
0
def explanation(exp_id: int,
                explanation_target: str = None,
                prefix_target: str = None):
    """Run the explainer registered for the explanation's type."""
    exp = Explanation.objects.filter(pk=exp_id)[0]
    # Encode the logs of the job this explanation belongs to.
    train_df, test_df = get_encoded_logs(exp.job)

    outcome = EXPLANATION[exp.type][EXPLAIN](exp, train_df, test_df,
                                             explanation_target,
                                             prefix_target)

    return 'False', outcome
Ejemplo n.º 11
0
def get_unique_values(request, pk):
    """Return, per feature column, a mapping from decoded (human-readable)
    values to their encoded counterparts for a job's logs.

    :param request: incoming HTTP request (unused beyond routing)
    :param pk: primary key of the Job to inspect
    :return: Response mapping column -> {decoded value: encoded value}, status 200
    """
    job = Job.objects.filter(pk=pk)[0]
    training_df, test_df = get_encoded_logs(job)
    decoded_training_df = training_df.copy()
    decoded_testing_df = test_df.copy()
    # `columns=` keyword: positional `axis` for drop() was removed in pandas 2.0.
    training_df = training_df.drop(columns=['trace_id', 'label'])

    encoder = retrieve_proper_encoder(job)
    encoder.decode(df=decoded_training_df, encoding=job.encoding)
    encoder.decode(df=decoded_testing_df, encoding=job.encoding)

    result_df = {}
    for key in training_df.keys():
        # Unique encoded values come from the still-encoded original frames.
        # (The original locals were named the wrong way round.)
        encoded_values = list(
            set(list(training_df[key]) + list(test_df[key])))
        # Unique decoded values come from the decoded copies.
        decoded_values = list(
            set(
                list(decoded_training_df[key]) +
                list(decoded_testing_df[key])))

        # NOTE(review): the pairing relies on both lists having equal length
        # and matching set-iteration order — TODO confirm every encoder
        # guarantees this invariant.
        result_df[key] = dict(zip(decoded_values, encoded_values))
    return Response(result_df, status=200)
Ejemplo n.º 12
0
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations
    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """

    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()).
        performance_metric)  #Todo: WHY DO I NEED TO GET HYPEROPT?
                )

    # The hyperopt objective communicates with this function through
    # module-level globals, so they must be populated before the search.
    global train_df, validation_df, test_df, global_job
    global_job = job
    train_df, test_df = get_encoded_logs(job)
    train_df, validation_df, test_df = _retrieve_train_validate_test(
        train_df, test_df)

    train_start_time = time.time()

    space = _get_space(job)

    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower(
        )).max_evaluations  #Todo: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()

    # Fixed: the original had an accidental duplicated `algorithm = algorithm =`.
    algorithm = OPTIMISATION_ALGORITHM[
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower(
            )).algorithm_type]
    _run_hyperoptimisation(space, algorithm.suggest, max_evaluations, trials)

    best_candidate = trials.best_trial['result']

    # Promote the best trial's model to the job.
    job.predictive_model = PredictiveModel.objects.filter(
        pk=best_candidate['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    best_candidate['results']['elapsed_time'] = timedelta(
        seconds=time.time() -
        train_start_time)  # todo find better place for this
    job.evaluation.elapsed_time = best_candidate['results']['elapsed_time']
    job.evaluation.save()

    # Re-evaluate the winning candidate on the held-out test set.
    results_df, auc = _test_best_candidate(
        best_candidate, job.labelling.type,
        job.predictive_model.predictive_model)
    if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
        results = classification_prepare_results(results_df, auc)
    else:
        results = regression_prepare_results(results_df, job.labelling)
    results['elapsed_time'] = job.evaluation.elapsed_time
    # The binary-vs-multiclass flag is derived from the validation labels.
    job.evaluation = Evaluation.init(job.predictive_model.predictive_model,
                                     results,
                                     len(set(validation_df['label'])) <= 2)
    job.evaluation.save()
    job.save()

    logger.info(
        "End hyperopt job {}, {}. \n\tResults on validation {}. \n\tResults on test {}."
        .format(job.type, get_run(job), best_candidate['results'], results))
    return results, best_candidate['config'], best_candidate['model_split']
Ejemplo n.º 13
0
    def test_explain(self):
        """ICE explanation must return the expected value/label/count rows."""
        double_split = create_test_split(
            split_type=SplitTypes.SPLIT_DOUBLE.value,
            split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
            test_size=0.2,
            original_log=None,
            train_log=create_test_log(
                log_name='train_explainability.xes',
                log_path='cache/log_cache/test_logs/train_explainability.xes'),
            test_log=create_test_log(
                log_name='test_explainability.xes',
                log_path='cache/log_cache/test_logs/test_explainability.xes'))

        model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.DECISION_TREE.value)

        job = create_test_job(
            split=double_split,
            encoding=create_test_encoding(
                prefix_length=4,
                padding=True,
                value_encoding=ValueEncodings.SIMPLE_INDEX.value),
            labelling=create_test_labelling(
                label_type=LabelTypes.ATTRIBUTE_STRING.value,
                attribute_name='label'),
            clustering=None,
            create_models=True,
            predictive_model=model,
            job_type=JobTypes.PREDICTION.value,
            hyperparameter_optimizer=None,
            incremental_train=None)

        # Train the model so there is something to explain.
        prediction_task(job.id, do_publish_result=False)
        job.refresh_from_db()

        ice_exp, _ = Explanation.objects.get_or_create(
            type=ExplanationTypes.ICE.value,
            split=double_split,
            predictive_model=model,
            job=job,
            results={})
        train_df, test_df = get_encoded_logs(job)

        actual = explain(ice_exp,
                         train_df,
                         test_df,
                         'prefix_2',
                         prefix_target=None)

        expected = [{
            'value': 'Contact Hospital',
            'label': 1.2962962962962963,
            'count': 351
        }, {
            'value': 'Create Questionnaire',
            'label': 1.5526992287917738,
            'count': 1167
        }, {
            'value': 'High Insurance Check',
            'label': 1.2667660208643816,
            'count': 671
        }]

        self.assertEqual(expected, actual)