Example #1
0
def get_encoded_logs(job: Job,
                     use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job configuration, loading from cache if possible
    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame

    """
    print('\tGetting Dataset')
    if use_cache:
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            training_df, test_df = get_labelled_logs(job)

        else:
            if job.split.train_log is not None and \
                job.split.test_log is not None and \
                LoadedLog.objects.filter(train_log=job.split.train_log.path,
                                         test_log=job.split.test_log.path).exists():
                training_log, test_log, additional_columns = get_loaded_logs(
                    job.split)

            else:
                training_log, test_log, additional_columns = prepare_logs(
                    job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    job.split = duplicate_orm_row(job.split)
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(
                        int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(EventLog(training_log),
                                                     train_name + '.xes')
                    test_name = str(int(100 -
                                        (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(EventLog(test_log),
                                                    test_name + '.xes')
                    job.split.additional_columns = str(
                        train_name +
                        test_name)  # TODO: find better naming policy
                    job.save()

                put_loaded_logs(job.split, training_log, test_log,
                                additional_columns)

            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = prepare_logs(job.split)
        training_df, test_df = encode_label_logs(
            training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
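A minimal usage sketch for the helper above, assuming a Job already configured and saved through the ORM; the primary-key lookup is hypothetical:

# Hypothetical caller: fetch an existing job and encode its logs,
# hitting the cached LabelledLog/LoadedLog rows when they are warm.
job = Job.objects.get(pk=1)  # assumes such a job exists
train_df, test_df = get_encoded_logs(job, use_cache=True)
print(train_df.shape, test_df.shape)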
Example #2
0
def upload_multiple(request):
    print('Double upload request received.')
    test_log = create_log(request.FILES['testSet'], request.FILES['testSet'].name)
    train_log = create_log(request.FILES['trainingSet'], request.FILES['trainingSet'].name)

    item = Split.objects.create(
        type='double',
        train_log=train_log,
        test_log=test_log)
    serializer = SplitSerializer(item)
    return Response(serializer.data, status=status.HTTP_201_CREATED)
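For context, a sketch of exercising this view from Django REST framework's test client; the route is an assumption, not taken from the source:

# Hypothetical test-client call: POST both event logs as multipart files.
from rest_framework.test import APIClient

client = APIClient()
with open('train.xes', 'rb') as train, open('test.xes', 'rb') as test:
    response = client.post('/splits/multiple/',
                           {'trainingSet': train, 'testSet': test},
                           format='multipart')
assert response.status_code == 201  # HTTP_201_CREATED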
Example #3
0
def get_train_test_log(split: Split):
    """Returns training_log and test_log"""
    if split.type == SplitTypes.SPLIT_SINGLE.value and Split.objects.filter(
        type=SplitTypes.SPLIT_DOUBLE.value,
        original_log=split.original_log,
        test_size=split.test_size,
        splitting_method=split.splitting_method
    ).exists() and split.splitting_method != SplitOrderingMethods.SPLIT_RANDOM.value:
        return get_train_test_log(Split.objects.filter(
            type=SplitTypes.SPLIT_DOUBLE.value,
            original_log=split.original_log,
            test_size=split.test_size,
            splitting_method=split.splitting_method
        )[0])
    elif split.original_log is not None and (not Split.objects.filter(
        type=SplitTypes.SPLIT_DOUBLE.value,
        original_log=split.original_log,
        test_size=split.test_size,
        splitting_method=split.splitting_method
    ).exists() or split.splitting_method == SplitOrderingMethods.SPLIT_RANDOM.value):
        training_log, test_log = _split_single_log(split)
        additional_columns = get_additional_columns(get_log(split.original_log))

        if split.splitting_method != SplitOrderingMethods.SPLIT_RANDOM.value:
            _ = Split.objects.get_or_create(
                type=SplitTypes.SPLIT_DOUBLE.value,
                original_log=split.original_log,
                test_size=split.test_size,
                splitting_method=split.splitting_method,
                train_log=create_log(EventLog(training_log), '0-' + str(100 - int(split.test_size * 100)) + '.xes'),
                test_log=create_log(EventLog(test_log), str(100 - int(split.test_size * 100)) + '-100.xes'),
                additional_columns=split.additional_columns
            )[0]

        logger.info("\t\tLoaded single log from {}".format(split.original_log.path))
    else:
        # Have to use sklearn to convert some internal data types
        training_log = get_log(split.train_log)
        additional_columns = get_additional_columns(training_log)
        if split.additional_columns is None:
            split.additional_columns = split.train_log.name + split.test_log.name + '_ac.xes'
            split.save()
        training_log, train_log_to_append = train_test_split(training_log, test_size=0, shuffle=False)
        test_log, test_log_to_append = train_test_split(get_log(split.test_log), test_size=0, shuffle=False)
        logger.info("\t\tLoaded double logs from {} and {}.".format(split.train_log.path, split.test_log.path))
    if len(training_log) == 0:
        raise TypeError("Training log is empty. Create a new Split with better parameters")
    return training_log, test_log, additional_columns
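A hedged sketch of calling the splitter above on a single-log Split; the lookup and test_size are illustrative only:

# Hypothetical caller: resolve a SPLIT_SINGLE row into train/test logs.
split = Split.objects.get(pk=1)  # assumes a single split with test_size=0.2
training_log, test_log, additional_columns = get_train_test_log(split)
print(len(training_log), len(test_log), additional_columns)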
Example #4
0
def progetto_padova():
    JOB = Job.objects.get_or_create(
        status=JobStatuses.CREATED.value,
        type=JobTypes.PREDICTION.value,
        split=Split.objects.get_or_create(  # this creates the split of the log
            type=SplitTypes.SPLIT_DOUBLE.value,
            train_log=create_log(  # this imports the log
                import_log(BASE_DIR + RELATIVE_TRAIN_PATH),
                RELATIVE_TRAIN_PATH,
                BASE_DIR,
                import_in_cache=False),
            test_log=create_log(  # this imports the log
                import_log(BASE_DIR + RELATIVE_VALIDATION_PATH),
                RELATIVE_VALIDATION_PATH,
                BASE_DIR,
                import_in_cache=False))[0],
        encoding=Encoding.objects.get_or_create(  # this defines the encoding method
            data_encoding=DataEncodings.LABEL_ENCODER.value,
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            add_elapsed_time=False,
            add_remaining_time=False,
            add_executed_events=False,
            add_resources_used=False,
            add_new_traces=False,
            prefix_length=5,
            padding=True,
            task_generation_type=TaskGenerationTypes.ALL_IN_ONE.value,
            features=[])[0],
        labelling=Labelling.objects.get_or_create(  # this defines the label
            type=LabelTypes.ATTRIBUTE_STRING.value,
            attribute_name='label',
            threshold_type=None,
            threshold=None)[0],
        clustering=Clustering.init(ClusteringMethods.NO_CLUSTER.value,
                                   configuration={}),
        predictive_model=PredictiveModel.init(  # this defines the predictive model
            get_prediction_method_config(
                PredictiveModels.CLASSIFICATION.value,
                ClassificationMethods.DECISION_TREE.value,
                payload={
                    'max_depth': 2,
                    'min_samples_split': 2,
                    'min_samples_leaf': 2
                })),
        hyperparameter_optimizer=HyperparameterOptimization.init(
            {  # this defines the hyperparameter optimisation procedure
                'type': HyperparameterOptimizationMethods.HYPEROPT.value,
                'max_evaluations': 10,
                'performance_metric': HyperOptLosses.AUC.value,
                'algorithm_type': HyperOptAlgorithms.TPE.value
            }),
        create_models=True)[0]

    # load log
    train_log, test_log, additional_columns = get_train_test_log(JOB.split)

    # encode
    train_df, test_df = encode_label_logs(train_log, test_log, JOB)

    # train + evaluate
    results, model_split = MODEL[JOB.predictive_model.predictive_model][
        ModelActions.BUILD_MODEL_AND_TEST.value](train_df, test_df,
                                                 _init_clusterer(
                                                     JOB.clustering, train_df),
                                                 JOB)

    if JOB.create_models:
        save_models(model_split, JOB)

    # predict
    data_df = pd.concat([train_df, test_df])
    predictions = MODEL[JOB.predictive_model.predictive_model][
        ModelActions.PREDICT.value](JOB, data_df)
    prediction_probabilities = MODEL[JOB.predictive_model.predictive_model][
        ModelActions.PREDICT_PROBA.value](JOB, data_df)

    # lime
    exp = Explanation.objects.get_or_create(
        type=ExplanationTypes.LIME.value,
        split=JOB.split,  # the analysed log; it can differ from the training one
        predictive_model=JOB.predictive_model,
        job=JOB)[0]
    error, result = explanation(exp.id, int(EXPLANATION_TARGET))
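The MODEL[...][ModelActions...] lookups above imply a nested dispatch table mapping each predictive-model type to its action callables. An illustrative sketch of that shape; the concrete callables are assumptions:

# Assumed shape of the MODEL dispatch table used in progetto_padova;
# classification_build_and_test etc. are hypothetical stand-ins.
MODEL = {
    PredictiveModels.CLASSIFICATION.value: {
        ModelActions.BUILD_MODEL_AND_TEST.value: classification_build_and_test,
        ModelActions.PREDICT.value: classification_predict,
        ModelActions.PREDICT_PROBA.value: classification_predict_proba,
    },
    # ... one inner mapping per predictive-model type
}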
Example #5
0
def post(self, request):
    log = create_log(self.request.FILES['single'], self.request.FILES['single'].name)
    serializer = LogSerializer(log)
    return Response(serializer.data, status=status.HTTP_201_CREATED)
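A sketch of uploading a single log against such a view from outside the app; host and route are assumptions:

# Hypothetical client-side upload; the endpoint path is not from the source.
import requests

with open('log.xes', 'rb') as f:
    resp = requests.post('http://localhost:8000/logs/', files={'single': f})
print(resp.status_code)  # expect 201 on success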
Example #6
0
def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job configuration, loading from cache if possible
    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame

    """
    logger.info('\tGetting Dataset')
    if use_cache and \
        (job.predictive_model is not None and
         job.predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value):
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            try:
                training_df, test_df = get_labelled_logs(job)
            except FileNotFoundError:  # cache invalidation
                LabelledLog.objects.filter(split=job.split,
                                           encoding=job.encoding,
                                           labelling=job.labelling).delete()
                logger.info('\t\tError: pre-labeled cache invalidated!')
                return get_encoded_logs(job, use_cache)
        else:
            if job.split.train_log is not None and \
               job.split.test_log is not None and \
               LoadedLog.objects.filter(split=job.split).exists():
                try:
                    training_log, test_log, additional_columns = get_loaded_logs(job.split)
                except FileNotFoundError:  # cache invalidation
                    LoadedLog.objects.filter(split=job.split).delete()
                    logger.info('\t\tError: pre-loaded cache invalidated!')
                    return get_encoded_logs(job, use_cache)
            else:
                training_log, test_log, additional_columns = get_train_test_log(job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    job.split = duplicate_orm_row(Split.objects.filter(pk=job.split.pk)[0])
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(
                        EventLog(training_log),
                        train_name + '.xes'
                    )
                    test_name = str(int(100 - (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(
                        EventLog(test_log),
                        test_name + '.xes'
                    )
                    job.split.additional_columns = str(train_name + test_name)  # TODO: find better naming policy
                    job.split.save()

                put_loaded_logs(job.split, training_log, test_log, additional_columns)

            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = get_train_test_log(job.split)
        training_df, test_df = encode_label_logs(training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
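The try/except blocks above implement read-through caching with invalidation when the backing file has disappeared. A stripped-down sketch of the same pattern; load_from_cache, invalidate, rebuild and store are hypothetical stand-ins for the get_/put_ helpers:

# Generic read-through cache that heals itself on a missing backing file.
def get_with_cache(key):
    try:
        return load_from_cache(key)          # fast path: cache hit
    except FileNotFoundError:                # row exists but file is gone
        invalidate(key)                      # drop the stale cache entry
        value = rebuild(key)                 # recompute from source data
        store(key, value)                    # re-populate the cache
        return value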