Example #1
from typing import Tuple

from pandas import DataFrame

# Project-internal names used below (Job, LabelledLog, LoadedLog, SplitTypes,
# EventLog, and the helper functions) are assumed to be imported elsewhere
# in the module.


def get_encoded_logs(job: Job,
                     use_cache: bool = True) -> Tuple[DataFrame, DataFrame]:
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job configuration, loading from cache if possible
    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame

    """
    print('\tGetting Dataset')
    if use_cache:
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            training_df, test_df = get_labelled_logs(job)

        else:
            if job.split.train_log is not None and \
                job.split.test_log is not None and \
                LoadedLog.objects.filter(train_log=job.split.train_log.path,
                                         test_log=job.split.test_log.path).exists():
                training_log, test_log, additional_columns = get_loaded_logs(
                    job.split)

            else:
                training_log, test_log, additional_columns = prepare_logs(
                    job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    job.split = duplicate_orm_row(job.split)
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_percent = str(int(100 - (job.split.test_size * 100)))
                    train_name = '0-' + train_percent
                    job.split.train_log = create_log(EventLog(training_log),
                                                     train_name + '.xes')
                    test_name = train_percent + '-100'
                    job.split.test_log = create_log(EventLog(test_log),
                                                    test_name + '.xes')
                    # TODO: find a better naming policy
                    job.split.additional_columns = train_name + test_name
                    job.save()

                put_loaded_logs(job.split, training_log, test_log,
                                additional_columns)

            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = prepare_logs(job.split)
        training_df, test_df = encode_label_logs(
            training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
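
A minimal, hypothetical usage sketch (the Job lookup and primary key are
assumptions; Job and get_encoded_logs are the names from the example above):

# Hypothetical usage: fetch a persisted Job and encode its logs via the cache.
job = Job.objects.get(pk=1)  # assumes a Job with this pk exists
training_df, test_df = get_encoded_logs(job, use_cache=True)
print(training_df.shape, test_df.shape)  # two pandas DataFrames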
Example #2
from typing import Tuple

from pandas import DataFrame

# Project-internal names used below (Job, Split, LabelledLog, LoadedLog,
# PredictiveModels, SplitTypes, EventLog, logger, and the helper functions)
# are assumed to be imported elsewhere in the module.


def get_encoded_logs(job: Job,
                     use_cache: bool = True) -> Tuple[DataFrame, DataFrame]:
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job configuration, loading from cache if possible
    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame

    """
    logger.info('\tGetting Dataset')

    if use_cache and \
        (job.predictive_model is not None and
         job.predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value):

        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            try:
                training_df, test_df = get_labelled_logs(job)
            except FileNotFoundError:  # cache invalidation
                LabelledLog.objects.filter(split=job.split,
                                           encoding=job.encoding,
                                           labelling=job.labelling).delete()
                logger.info('\t\tStale pre-labeled cache invalidated')
                return get_encoded_logs(job, use_cache)
        else:
            if job.split.train_log is not None and \
               job.split.test_log is not None and \
               LoadedLog.objects.filter(split=job.split).exists():
                try:
                    training_log, test_log, additional_columns = get_loaded_logs(
                        job.split)
                except FileNotFoundError:  # cache invalidation
                    LoadedLog.objects.filter(split=job.split).delete()
                    logger.info('\t\tStale pre-loaded cache invalidated')
                    return get_encoded_logs(job, use_cache)
            else:
                training_log, test_log, additional_columns = get_train_test_log(
                    job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    search_for_already_existing_split = Split.objects.filter(
                        type=SplitTypes.SPLIT_DOUBLE.value,
                        original_log=job.split.original_log,
                        test_size=job.split.test_size,
                        splitting_method=job.split.splitting_method)
                    if search_for_already_existing_split.exists():
                        job.split = search_for_already_existing_split.first()
                        job.split.save()
                        job.save()
                        return get_encoded_logs(job, use_cache=use_cache)
                    else:
                        # previously: job.split = duplicate_orm_row(Split.objects.filter(pk=job.split.pk)[0])
                        job.split = Split.objects.create(
                            type=job.split.type,
                            original_log=job.split.original_log,
                            test_size=job.split.test_size,
                            splitting_method=job.split.splitting_method,
                            train_log=job.split.train_log,
                            test_log=job.split.test_log,
                            additional_columns=job.split.additional_columns
                        )  # TODO: fragile if the Split model gains new fields
                        job.split.type = SplitTypes.SPLIT_DOUBLE.value
                        base_name = 'SPLITTED_' + \
                            job.split.original_log.name.split('.')[0]
                        train_percent = str(
                            int(100 - (job.split.test_size * 100)))
                        train_name = base_name + '_0-' + train_percent
                        job.split.train_log = create_log(
                            EventLog(training_log), train_name + '.xes')
                        test_name = base_name + '_' + train_percent + '-100'
                        job.split.test_log = create_log(
                            EventLog(test_log), test_name + '.xes')
                        # TODO: find a better naming policy
                        job.split.additional_columns = train_name + test_name
                        job.split.save()

                put_loaded_logs(job.split, training_log, test_log,
                                additional_columns)

            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = get_train_test_log(
            job.split)
        training_df, test_df = encode_label_logs(
            training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
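
Example #2 wraps its cache reads in try/except FileNotFoundError so that a
database row whose backing file has disappeared is deleted and the call is
retried. A minimal sketch of that pattern, with hypothetical cache_lookup,
invalidate, and recompute helpers (only the control flow mirrors the example):

def load_with_invalidation(key, cache_lookup, invalidate, recompute):
    try:
        # Fast path: the cached artifact is still on disk.
        return cache_lookup(key)
    except FileNotFoundError:
        # The cache row outlived the file: drop it and rebuild from source.
        invalidate(key)
        return recompute(key)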