Code Example #1
    def test_random(self):
        split = split_single(
            split_ordering=SplitOrderingMethods.SPLIT_RANDOM.value)
        training_log1, _, _ = get_train_test_log(split)
        training_log2, _, _ = get_train_test_log(split)
        training_names1 = trace_names(training_log1)
        training_names2 = trace_names(training_log2)

        self.assertNotEqual(training_names1, training_names2)
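These tests lean on two small helpers: split_single, which builds a single-log Split with the given ordering, and trace_names, which extracts the case identifiers from a log. A minimal sketch of the latter (hypothetical; it assumes pm4py-style traces whose attributes carry a 'concept:name' entry, as the replay code in Code Example #10 does):

def trace_names(log):
    # Collect the case identifier ('concept:name') of every trace.
    return [trace.attributes['concept:name'] for trace in log]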
Code Example #2
def retrieve_proper_encoder(job: Job) -> Encoder:
    if job.incremental_train is not None:
        return retrieve_proper_encoder(job.incremental_train)
    else:
        training_log, test_log, additional_columns = get_train_test_log(job.split)
        training_df, _ = encode_label_logs(training_log, test_log, job, additional_columns=additional_columns,
                                           encode=False)
    return Encoder(training_df, job.encoding)
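The recursion above walks the incremental_train chain so that the encoder is always fitted on the root job's training data. An iterative sketch of that walk (assuming, as the recursive version does, that the chain is finite and acyclic):

def retrieve_root_job(job):
    # Follow incremental_train links until the original job is reached.
    while job.incremental_train is not None:
        job = job.incremental_train
    return job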
Code Example #3
    def test_sequential(self):
        split = split_single(
            split_ordering=SplitOrderingMethods.SPLIT_SEQUENTIAL.value)
        training_log, test_log, _ = get_train_test_log(split)
        training_names = trace_names(training_log)
        test_names = trace_names(test_log)

        self.assertListEqual(['3', '2', '1', '6'], training_names)
        self.assertListEqual(['5', '4'], test_names)
Code Example #4
    def test_strict_temporal(self):
        split = split_single(
            split_ordering=SplitOrderingMethods.SPLIT_STRICT_TEMPORAL.value)
        training_log, test_log, _ = get_train_test_log(split)

        training_names = trace_names(training_log)
        test_names = trace_names(test_log)

        # Log modified so that the training set has only one trace here
        self.assertListEqual(['1'], sorted(training_names))
        self.assertListEqual(sorted(['6', '4']), sorted(test_names))
Code Example #5
    def test_temporal(self):
        split = split_single(
            split_ordering=SplitOrderingMethods.SPLIT_TEMPORAL.value)
        training_log, test_log, _ = get_train_test_log(split)

        training_names = trace_names(training_log)
        test_names = trace_names(test_log)

        self.assertListEqual(sorted(['1', '2', '3', '5']),
                             sorted(training_names))
        self.assertListEqual(sorted(['6', '4']), sorted(test_names))
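Read together, these tests suggest how the ordering methods differ: SPLIT_SEQUENTIAL keeps the traces in file order, SPLIT_TEMPORAL sorts them by time before slicing, and SPLIT_STRICT_TEMPORAL additionally discards traces that overlap the split boundary. A sketch of temporal ordering under that reading (hypothetical; it assumes each trace's first event carries a 'time:timestamp' attribute, as pm4py logs do):

def order_temporally(traces):
    # Sort traces by the timestamp of their first event.
    return sorted(traces, key=lambda trace: trace[0]['time:timestamp'])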
Code Example #6
File: common.py Project: stebranchi/predict-python
def get_encoded_logs(job: Job,
                     use_cache: bool = True) -> Tuple[DataFrame, DataFrame]:
    """Return the encoded logs.

    Returns the training and test DataFrames encoded using the given job
    configuration, loading them from the cache when possible.
    :param job: job configuration
    :param use_cache: whether to load saved datasets from the cache
    :return: training and testing DataFrames
    """
    logger.info('\tGetting Dataset')

    if use_cache and \
        (job.predictive_model is not None and
         job.predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value):

        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            try:
                training_df, test_df = get_labelled_logs(job)
            except FileNotFoundError:  # cache invalidation
                LabelledLog.objects.filter(split=job.split,
                                           encoding=job.encoding,
                                           labelling=job.labelling).delete()
                logger.info('\t\tError pre-labeled cache invalidated!')
                return get_encoded_logs(job, use_cache)
        else:
            if job.split.train_log is not None and \
               job.split.test_log is not None and \
               LoadedLog.objects.filter(split=job.split).exists():
                try:
                    training_log, test_log, additional_columns = get_loaded_logs(
                        job.split)
                except FileNotFoundError:  # cache invalidation
                    LoadedLog.objects.filter(split=job.split).delete()
                    logger.info('\t\tError pre-loaded cache invalidated!')
                    return get_encoded_logs(job, use_cache)
            else:
                training_log, test_log, additional_columns = get_train_test_log(
                    job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    search_for_already_existing_split = Split.objects.filter(
                        type=SplitTypes.SPLIT_DOUBLE.value,
                        original_log=job.split.original_log,
                        test_size=job.split.test_size,
                        splitting_method=job.split.splitting_method)
                    if len(search_for_already_existing_split) >= 1:
                        job.split = search_for_already_existing_split[0]
                        job.split.save()
                        job.save()
                        return get_encoded_logs(job, use_cache=use_cache)
                    else:
                        # job.split = duplicate_orm_row(Split.objects.filter(pk=job.split.pk)[0])  # TODO: replace with simple CREATE
                        job.split = Split.objects.create(
                            type=job.split.type,
                            original_log=job.split.original_log,
                            test_size=job.split.test_size,
                            splitting_method=job.split.splitting_method,
                            train_log=job.split.train_log,
                            test_log=job.split.test_log,
                            additional_columns=job.split.additional_columns
                        )  # TODO: future bug if the object changes
                        job.split.type = SplitTypes.SPLIT_DOUBLE.value
                        train_name = 'SPLITTED_' + job.split.original_log.name.split(
                            '.')[0] + '_0-' + str(
                                int(100 - (job.split.test_size * 100)))
                        job.split.train_log = create_log(
                            EventLog(training_log), train_name + '.xes')
                        test_name = 'SPLITTED_' + job.split.original_log.name.split(
                            '.')[0] + '_' + str(
                                int(100 -
                                    (job.split.test_size * 100))) + '-100'
                        job.split.test_log = create_log(
                            EventLog(test_log), test_name + '.xes')
                        job.split.additional_columns = str(
                            train_name +
                            test_name)  # TODO: find better naming policy
                        job.split.save()

                put_loaded_logs(job.split, training_log, test_log,
                                additional_columns)

            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = get_train_test_log(
            job.split)
        training_df, test_df = encode_label_logs(
            training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
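get_encoded_logs falls through three levels: the LabelledLog cache of already encoded DataFrames, the LoadedLog cache of parsed logs, and finally the raw split (promoting a single split to a double split along the way), repopulating each cache and retrying after invalidating a stale entry. A hypothetical call site, assuming job is a fully configured, persisted Job:

training_df, test_df = get_encoded_logs(job, use_cache=True)
print(training_df.shape, test_df.shape)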
Code Example #7
def progetto_padova():
    JOB = Job.objects.get_or_create(
        status=JobStatuses.CREATED.value,
        type=JobTypes.PREDICTION.value,
        split=Split.objects.get_or_create(  # this creates the split of the log
            type=SplitTypes.SPLIT_DOUBLE.value,
            train_log=create_log(  # this imports the log
                import_log(BASE_DIR + RELATIVE_TRAIN_PATH),
                RELATIVE_TRAIN_PATH,
                BASE_DIR,
                import_in_cache=False),
            test_log=create_log(  # this imports the log
                import_log(BASE_DIR + RELATIVE_VALIDATION_PATH),
                RELATIVE_VALIDATION_PATH,
                BASE_DIR,
                import_in_cache=False))[0],
        encoding=Encoding.objects.get_or_create(  # this defines the encoding method
            data_encoding=DataEncodings.LABEL_ENCODER.value,
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            add_elapsed_time=False,
            add_remaining_time=False,
            add_executed_events=False,
            add_resources_used=False,
            add_new_traces=False,
            prefix_length=5,
            padding=True,
            task_generation_type=TaskGenerationTypes.ALL_IN_ONE.value,
            features=[])[0],
        labelling=Labelling.objects.get_or_create(  # this defines the label
            type=LabelTypes.ATTRIBUTE_STRING.value,
            attribute_name='label',
            threshold_type=None,
            threshold=None)[0],
        clustering=Clustering.init(ClusteringMethods.NO_CLUSTER.value,
                                   configuration={}),
        predictive_model=PredictiveModel.init(  # this defines the predictive model
            get_prediction_method_config(
                PredictiveModels.CLASSIFICATION.value,
                ClassificationMethods.DECISION_TREE.value,
                payload={
                    'max_depth': 2,
                    'min_samples_split': 2,
                    'min_samples_leaf': 2
                })),
        hyperparameter_optimizer=HyperparameterOptimization.init(
            {  # this defines the hyperparameter optimisation procedure
                'type': HyperparameterOptimizationMethods.HYPEROPT.value,
                'max_evaluations': 10,
                'performance_metric': HyperOptLosses.AUC.value,
                'algorithm_type': HyperOptAlgorithms.TPE.value
            }),
        create_models=True)[0]

    # load log
    train_log, test_log, additional_columns = get_train_test_log(JOB.split)

    # encode
    train_df, test_df = encode_label_logs(train_log, test_log, JOB)

    # train + evaluate
    results, model_split = MODEL[JOB.predictive_model.predictive_model][
        ModelActions.BUILD_MODEL_AND_TEST.value](train_df, test_df,
                                                 _init_clusterer(
                                                     JOB.clustering, train_df),
                                                 JOB)

    if JOB.create_models:
        save_models(model_split, JOB)

    # predict
    data_df = pd.concat([train_df, test_df])
    predictions = MODEL[JOB.predictive_model.predictive_model][
        ModelActions.PREDICT.value](JOB, data_df)
    probabilities = MODEL[JOB.predictive_model.predictive_model][
        ModelActions.PREDICT_PROBA.value](JOB, data_df)

    # lime
    exp = Explanation.objects.get_or_create(
        type=ExplanationTypes.LIME.value,
        split=JOB.split,  # this defines the analysed log; you can use a different one from the training log
        predictive_model=JOB.predictive_model,
        job=JOB)[0]
    error, result = explanation(exp.id, int(EXPLANATION_TARGET))
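The MODEL lookup used above is a dict-of-dicts dispatch table: the outer key selects the predictive-model family, the inner key the action. A minimal sketch of the pattern (hypothetical; the real registry is keyed by the PredictiveModels and ModelActions enum values and maps to the project's training and prediction routines):

def _build_model_and_test(train_df, test_df, clusterer, job):
    raise NotImplementedError  # stand-in for the real training routine

MODEL = {
    'classification': {
        'build_model_and_test': _build_model_and_test,
    },
}

# Dispatch: MODEL[model_name][action](...) picks the routine at runtime.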
Code Example #8
    def test_size(self):
        split = split_single(test_size=0.5)
        training_log, test_log, _ = get_train_test_log(split)
        self.assertEqual(3, len(training_log))
        self.assertEqual(3, len(test_log))
Code Example #9
    def test_split_double(self):
        training_log, test_log, _ = get_train_test_log(split_double())
        self.assertEqual(4, len(training_log))
        self.assertEqual(2, len(test_log))
Code Example #10
def replay_core(replay_job: Job, training_initial_job: Job) -> list:
    """The function create a set with timestamps of events, then create a list of requests
        simulating the log in the time passing

        :param replay_job: job dictionary
        :param training_initial_job: job dictionary
        :return: List of requests
    """

    split = replay_job.split
    log = get_log(split.train_log)
    requests_list = list()

    eventlog = EventLog()
    for key in log.attributes.keys():
        eventlog.attributes[key] = log.attributes[key]
    for trace in log:
        new_trace = Trace(trace)
        for key in trace.attributes:
            new_trace.attributes[key] = trace.attributes[key]
        eventlog.append(new_trace)

    times = sorted(
        set([event['time:timestamp'] for trace in eventlog
             for event in trace]))

    # Replay at roughly five evenly spaced checkpoints along the timeline
    for t in times[2::int((len(times) - 2) / 5)]:
        filtered_eventlog = timestamp_filter.apply_events(
            eventlog, times[0].replace(tzinfo=None), t.replace(tzinfo=None))
        trace_list = list()
        event_number = dict()
        for trace in filtered_eventlog:
            trace_list.append(trace.attributes['concept:name'])
            event_number[trace.attributes['concept:name']] = len(trace)
        replay_job.case_id = trace_list
        replay_job.event_number = event_number
        replay_job.save()
        try:  # TODO: check logger usage
            logger.info("Sending request for replay_prediction task.")
            r = requests.post(
                url="http://server:8000/runtime/replay_prediction/",
                data=export_log_as_string(filtered_eventlog),
                params={
                    'jobId': replay_job.id,
                    'training_job': training_initial_job.id
                },
                headers={
                    'Content-Type': 'text/plain',
                    'charset': 'UTF-8'
                })
            requests_list.append(str(r))
        except Exception as e:
            requests_list.append(str(e))
            logger.warning(str(e))

    training_log, test_log, additional_columns = get_train_test_log(
        replay_job.split)
    training_df, _ = encode_label_logs(training_log,
                                       test_log,
                                       replay_job,
                                       additional_columns=additional_columns)

    gold_values = dict(zip(training_df['trace_id'], training_df['label']))
    parent_id = replay_job.id
    # final_job = duplicate_orm_row(replay_job)  # TODO: replace with simple CREATE
    final_job = Job.objects.create(
        created_date=replay_job.created_date,
        modified_date=replay_job.modified_date,
        error=replay_job.error,
        status=replay_job.status,
        type=replay_job.type,
        create_models=replay_job.create_models,
        case_id=replay_job.case_id,
        event_number=replay_job.event_number,
        gold_value=replay_job.gold_value,
        results=replay_job.results,
        parent_job=replay_job.parent_job,
        split=replay_job.split,
        encoding=replay_job.encoding,
        labelling=replay_job.labelling,
        clustering=replay_job.clustering,
        predictive_model=replay_job.predictive_model,
        evaluation=replay_job.evaluation,
        hyperparameter_optimizer=replay_job.hyperparameter_optimizer,
        incremental_train=replay_job.incremental_train)
    final_job.parent_job = Job.objects.filter(pk=parent_id)[0]
    final_job.gold_value = gold_values
    final_job.type = JobTypes.REPLAY_PREDICT.value
    final_job.save()
    return requests_list
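The slice times[2::int((len(times) - 2) / 5)] in replay_core skips the first two timestamps and then picks roughly five evenly spaced checkpoints; note the step becomes zero, and the slice raises ValueError ("slice step cannot be zero"), when the log has six or fewer distinct timestamps. A self-contained illustration with integer stand-ins for the timestamps:

times = list(range(1, 20))          # stand-in for sorted event timestamps
step = int((len(times) - 2) / 5)    # stride yielding ~5 checkpoints
checkpoints = times[2::step]
print(checkpoints)                  # [3, 6, 9, 12, 15, 18]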