Example #1
def classification(training_df: DataFrame, test_df: DataFrame,
                   clusterer: Clustering, job: Job) -> (dict, dict):
    """main classification entry point

    trains and tests the classifier using the provided data

    :param clusterer: clustering model used during training
    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: predictive_model scores and split

    """
    train_data = _drop_columns(training_df)
    test_data = _drop_columns(test_df)

    job.encoding = duplicate_orm_row(
        job.encoding
    )  #TODO: maybe here would be better an intelligent get_or_create...
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_classifier(job), clusterer)
    results_df, auc = _test(model_split,
                            test_data,
                            evaluation=True,
                            is_binary_classifier=_check_is_binary_classifier(
                                job.labelling.type))

    results = _prepare_results(results_df, auc)

    return results, model_split
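The helper _drop_columns is not shown in these examples. A minimal sketch of what it plausibly does, assuming it only strips identifier columns such as trace_id (Example #18 drops 'trace_id' before testing); this is an assumption, not the project's actual helper:

from pandas import DataFrame

def _drop_columns(df: DataFrame) -> DataFrame:
    # Hypothetical: remove identifier columns that should not reach the model;
    # the real helper may drop other columns as well.
    return df.drop(columns=['trace_id'], errors='ignore')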
Example #2
def update_and_test(training_df: DataFrame, test_df: DataFrame, job: Job):
    train_data = _drop_columns(training_df)
    test_data = _drop_columns(test_df)

    job.encoding = job.incremental_train.encoding
    job.encoding.save()
    job.save()

    if list(train_data.columns.values
            ) != job.incremental_train.encoding.features:
        # TODO: how do I align the two feature vectors?
        train_data, _ = train_data.align(
            pd.DataFrame(columns=job.incremental_train.encoding.features),
            axis=1,
            join='right')
        train_data = train_data.fillna(0)
        test_data, _ = test_data.align(
            pd.DataFrame(columns=job.incremental_train.encoding.features),
            axis=1,
            join='right')
        test_data = test_data.fillna(0)

    # TODO: UPDATE if incremental, otherwise just test
    model_split = _update(job, train_data)

    results_df, auc = _test(model_split,
                            test_data,
                            evaluation=True,
                            is_binary_classifier=_check_is_binary_classifier(
                                job.labelling.type))

    results = _prepare_results(results_df, auc)

    return results, model_split
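The column-alignment step in update_and_test is easier to see on a toy, standalone example (not project code):

import pandas as pd

# The new training data is missing feature 'b' that the stored encoding expects.
train = pd.DataFrame({'a': [1, 2], 'c': [5, 6]})
reference = pd.DataFrame(columns=['a', 'b', 'c'])

# join='right' keeps exactly the reference columns; the missing 'b' appears as
# NaN and is then zero-filled, as in update_and_test above.
aligned, _ = train.align(reference, axis=1, join='right')
aligned = aligned.fillna(0)
print(list(aligned.columns))  # ['a', 'b', 'c']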
Example #3
def regression(training_df: DataFrame, test_df: DataFrame,
               clusterer: Clustering, job: Job) -> (dict, dict):
    """main regression entry point

    trains and tests the regressor using the provided data

    :param clusterer: clustering model used during training
    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: predictive_model scores and split

    """
    train_data, test_data = _prep_data(training_df, test_df)

    job.encoding = duplicate_orm_row(
        Encoding.objects.filter(pk=job.encoding.pk)[0]
    )  # TODO: maybe here would be better an intelligent get_or_create...
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_regressor(job), clusterer)
    results_df = _test(model_split, test_data)

    results = calculate_results_regression(results_df, job.labelling)

    return results, model_split
Example #4
def set_model_name(job: Job) -> None:
    if job.create_models:
        if job.predictive_model.model_path != '':
            # job.predictive_model = duplicate_orm_row(PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0])  #todo: replace with simple CREATE
            job.predictive_model = PredictiveModel.init(
                job.predictive_model.get_full_dict(
                )  #todo: doublecheck me, are you sure get_full_dict is returning everything needed?
            )  #todo: futurebug if object changes
            job.predictive_model.save()
            job.save()

        if job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value:
            job.clustering.model_path = 'cache/model_cache/job_{}-split_{}-clusterer-{}-v0.sav'.format(
                job.id, job.split.id, job.type)
            job.clustering.save()

        if job.type == JobTypes.UPDATE.value:
            job.type = JobTypes.PREDICTION.value  #TODO: Y am I doing this?
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v{}.sav'.format(
                job.id, job.split.id, job.type, str(time.time()))
        else:
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v0.sav'.format(
                job.id, job.split.id, job.type)
        job.predictive_model.model_path = predictive_model_filename
        job.predictive_model.save()
        job.save()
Example #5
def set_model_name(job: Job) -> None:
    if job.create_models:
        if job.predictive_model.model_path != '':
            job.predictive_model = duplicate_orm_row(PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0])
            job.predictive_model.save()
            job.save()

        if job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value:
            job.clustering.model_path = 'cache/model_cache/job_{}-split_{}-clusterer-{}-v0.sav'.format(
                job.id,
                job.split.id,
                job.type)
            job.clustering.save()

        if job.type == JobTypes.UPDATE.value:
            job.type = JobTypes.PREDICTION.value #TODO: Y am I doing this?
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v{}.sav'.format(
                job.id,
                job.split.id,
                job.type,
                str(time.time()))
        else:
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v0.sav'.format(
                job.id,
                job.split.id,
                job.type)
        job.predictive_model.model_path = predictive_model_filename
        job.predictive_model.save()
        job.save()
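For illustration only, the cache-path naming scheme used above produces paths like the following (ids and job type are hypothetical values):

import time

job_id, split_id, job_type = 42, 7, 'prediction'
clusterer_path = 'cache/model_cache/job_{}-split_{}-clusterer-{}-v0.sav'.format(
    job_id, split_id, job_type)
model_path = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v{}.sav'.format(
    job_id, split_id, job_type, str(time.time()))
print(clusterer_path)  # cache/model_cache/job_42-split_7-clusterer-prediction-v0.sav
print(model_path)      # cache/model_cache/job_42-split_7-predictive_model-prediction-v<timestamp>.sav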
Example #6
def replay_task(replay_job: Job, training_initial_job: Job) -> list:
    """ The function create a replay task to ask the server to demo the arriving of events

        :param replay_job: job dictionary
        :param training_initial_job: job dictionary
        :return: List of requests
    """
    logger.error("Start replay task ID {}".format(replay_job.id))
    requests = list()
    try:
        replay_job.status = JobStatuses.RUNNING.value
        replay_job.error = ''
        replay_job.save()
        requests = replay_core(replay_job, training_initial_job)
        replay_job.status = JobStatuses.COMPLETED.value
        for r in requests:
            if r.status_code != status.HTTP_201_CREATED:
                replay_job.error += str(r)
    except Exception as e:
        logger.error(e)
        replay_job.status = JobStatuses.ERROR.value
        replay_job.error += str(e.__repr__())
        raise e
    finally:
        replay_job.save()
        publish(replay_job)
        return requests
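Note that the `return requests` inside the `finally` block above suppresses the exception re-raised in the `except` branch. A standalone demonstration of that Python behaviour (not project code):

def swallow():
    try:
        raise RuntimeError("boom")
    finally:
        return "requests"  # the RuntimeError in flight is discarded here

print(swallow())  # prints "requests"; no exception reaches the caller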
Example #7
def get_encoded_logs(job: Job,
                     use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job configuration, loading from cache if possible
    :param job: job configuration
    :param use_cache: whether to load saved datasets from the cache
    :return: training and testing DataFrame

    """
    print('\tGetting Dataset')
    if use_cache:
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            training_df, test_df = get_labelled_logs(job)

        else:
            if job.split.train_log is not None and \
                job.split.test_log is not None and \
                LoadedLog.objects.filter(train_log=job.split.train_log.path,
                                         test_log=job.split.test_log.path).exists():
                training_log, test_log, additional_columns = get_loaded_logs(
                    job.split)

            else:
                training_log, test_log, additional_columns = prepare_logs(
                    job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    job.split = duplicate_orm_row(job.split)
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(
                        int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(EventLog(training_log),
                                                     train_name + '.xes')
                    test_name = str(int(100 -
                                        (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(EventLog(test_log),
                                                    test_name + '.xes')
                    job.split.additional_columns = str(
                        train_name +
                        test_name)  # TODO: find better naming policy
                    job.save()

                put_loaded_logs(job.split, training_log, test_log,
                                additional_columns)

            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = prepare_logs(job.split)
        training_df, test_df = encode_label_logs(
            training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
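A worked example of the split-name arithmetic above, assuming test_size = 0.2 (hypothetical value):

test_size = 0.2
train_name = '0-' + str(int(100 - (test_size * 100)))    # '0-80'
test_name = str(int(100 - (test_size * 100))) + '-100'   # '80-100'
print(train_name, test_name)  # 0-80 80-100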
Example #8
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations
    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """

    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()).
        performance_metric)  #Todo: WHY DO I NEED TO GET HYPEROPT?
                )

    global training_df, test_df, global_job
    global_job = job
    training_df, test_df = get_encoded_logs(job)

    space = _get_space(job)

    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower(
        )).max_evaluations  #Todo: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()

    algorithm = _choose_algorithm(job)

    try:
        fmin(_calculate_and_evaluate,
             space,
             algo=algorithm.suggest,
             max_evals=max_evaluations,
             trials=trials)
    except ValueError:
        raise ValueError("All jobs failed, cannot find best configuration")
    current_best = {
        'loss': 100,
        'results': {},
        'predictive_model_id': {},
        'model_split': {},
        'config': {}
    }
    for trial in trials:
        a = trial['result']
        if current_best['loss'] > a['loss']:
            current_best = a

    job.predictive_model = PredictiveModel.objects.filter(
        pk=current_best['predictive_model_id'])[0]
    job.save()

    logger.info("End hyperopt job {}, {} . Results {}".format(
        job.type, get_run(job), current_best['results']))
    return current_best['results'], current_best['config'], current_best[
        'model_split']
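The fmin/Trials pattern above, and the trial['result'] structure it relies on, can be reproduced in a minimal self-contained hyperopt example (independent of the project code):

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

def objective(params):
    loss = (params['x'] - 3) ** 2
    return {'loss': loss, 'status': STATUS_OK}

trials = Trials()
best = fmin(objective, {'x': hp.uniform('x', -10, 10)},
            algo=tpe.suggest, max_evals=20, trials=trials)

# Each entry in `trials` carries the dict returned by the objective,
# which is what the loop over trial['result'] above inspects.
print(min(t['result']['loss'] for t in trials))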
Example #9
def run_by_type(training_df: DataFrame, test_df: DataFrame,
                job: Job) -> (dict, dict):
    """runs the specified training/evaluation run

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: results and predictive_model split

    """
    model_split = None

    # TODO fixme this needs to be fixed in the interface
    # if job['incremental_train']['base_model'] is not None:
    #     job['type'] = JobTypes.UPDATE.value

    start_time = time.time()
    if job.type == JobTypes.PREDICTION.value:
        clusterer = _init_clusterer(job.clustering, training_df)
        if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            results, model_split = classification(training_df, test_df,
                                                  clusterer, job)
        elif job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            results, model_split = regression(training_df, test_df, clusterer,
                                              job)
        elif job.predictive_model.predictive_model == PredictiveModels.TIME_SERIES_PREDICTION.value:
            results, model_split = time_series_prediction(
                training_df, test_df, clusterer, job)
    elif job.type == JobTypes.LABELLING.value:
        results = _label_task(training_df)
    elif job.type == JobTypes.UPDATE.value:
        results, model_split = update_and_test(training_df, test_df, job)
    else:
        raise ValueError("Type {} not supported".format(job.type))

    # TODO: integrateme
    if job.type != JobTypes.LABELLING.value:
        if job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model, results)
        elif job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model, results,
                len(model_split[job.predictive_model.predictive_model]
                    [0].classes_) <= 2)
        job.save()

    if job.type == PredictiveModels.CLASSIFICATION.value:
        save_result(results, job, start_time)

    print("End job {}, {} .".format(job.type, get_run(job)))
    print("\tResults {} .".format(results))
    return results, model_split
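A standalone illustration (not project code) of the `classes_` check used above to decide whether the classifier is binary:

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier().fit([[0], [1], [2], [3]], ['a', 'b', 'a', 'b'])
print(len(clf.classes_) <= 2)  # True -> treated as a binary classifier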
Example #10
def cross_validated_regression(training_df: DataFrame,
                               test_df: DataFrame,
                               clusterer: Clustering,
                               job: Job,
                               cv=2) -> (dict, dict):
    """main regression entry point

    train and tests the regressor using the provided data

    :param clusterer:
    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :param cv: cross validation amount
    :return: predictive_model scores and split

    """
    train_data, test_data = _prep_data(training_df, test_df)

    # job.encoding = duplicate_orm_row(Encoding.objects.filter(pk=job.encoding.pk)[0])  # TODO: maybe here would be better an intelligent get_or_create...
    job.encoding = Encoding.objects.create(
        data_encoding=job.encoding.data_encoding,
        value_encoding=job.encoding.value_encoding,
        add_elapsed_time=job.encoding.add_elapsed_time,
        add_remaining_time=job.encoding.add_remaining_time,
        add_executed_events=job.encoding.add_executed_events,
        add_resources_used=job.encoding.add_resources_used,
        add_new_traces=job.encoding.add_new_traces,
        features=job.encoding.features,
        prefix_length=job.encoding.prefix_length,
        padding=job.encoding.padding,
        task_generation_type=job.encoding.task_generation_type)
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data,
                         _choose_regressor(job),
                         clusterer,
                         do_cv=True)
    results_df = _test(model_split, test_data)

    results = calculate_results_regression(results_df, job.labelling)

    return results, model_split
Example #11
def replay_prediction(replay_job: Job, training_initial_job: Job,
                      trace_id) -> list:
    """The function create a set with timestamps of events, then create a list of requests
        simulating the log in the time passing
        :param trace_id:
        :param replay_job: job dictionary
        :param training_initial_job: job dictionary
        :return: List of requests
    """

    split = replay_job.split
    log = get_log(split.train_log)
    requests_list = list()
    eventlog = EventLog()
    trace = log[int(trace_id)]
    for key in log.attributes.keys():
        eventlog.attributes[key] = log.attributes[key]
    for index in range(len(trace)):
        new_trace = Trace(trace[0:index])
        for key in trace.attributes:
            new_trace.attributes[key] = trace.attributes[key]
        eventlog.append(new_trace)
    replay_job.case_id = trace_id
    replay_job.event_number = len(trace)
    replay_job.save()
    try:
        logger.error("Sending request for replay_prediction task.")
        r = requests.post(
            url="http://127.0.0.1:8000/runtime/replay_prediction/",
            data=export_log_as_string(eventlog),
            params={
                'jobId': replay_job.id,
                'training_job': training_initial_job.id
            },
            headers={
                'Content-Type': 'text/plain',
                'charset': 'UTF-8'
            })
        requests_list.append(str(r))
    except Exception as e:
        requests_list.append(str(e))
        logger.warning(str(e))

    return requests_list
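The prefix construction above can be illustrated with a plain list in place of a pm4py Trace; note that, as in the loop above, the first prefix is empty and the full trace is never included:

trace = ['register', 'review', 'approve', 'notify']
prefixes = [trace[0:index] for index in range(len(trace))]
print(prefixes)
# [[], ['register'], ['register', 'review'], ['register', 'review', 'approve']]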
Example #12
def classification(training_df: DataFrame, test_df: DataFrame,
                   clusterer: Clustering, job: Job) -> (dict, dict):
    """main classification entry point

    trains and tests the classifier using the provided data

    :param clusterer: clustering model used during training
    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: predictive_model scores and split

    """
    train_data = _drop_columns(training_df)
    test_data = _drop_columns(test_df)

    # job.encoding = duplicate_orm_row(Encoding.objects.filter(pk=job.encoding.pk)[0])  # TODO: maybe here would be better an intelligent get_or_create...
    job.encoding = Encoding.objects.create(
        data_encoding=job.encoding.data_encoding,
        value_encoding=job.encoding.value_encoding,
        add_elapsed_time=job.encoding.add_elapsed_time,
        add_remaining_time=job.encoding.add_remaining_time,
        add_executed_events=job.encoding.add_executed_events,
        add_resources_used=job.encoding.add_resources_used,
        add_new_traces=job.encoding.add_new_traces,
        features=job.encoding.features,
        prefix_length=job.encoding.prefix_length,
        padding=job.encoding.padding,
        task_generation_type=job.encoding.task_generation_type)
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_classifier(job), clusterer)
    results_df, auc = _test(model_split,
                            test_data,
                            evaluation=True,
                            is_binary_classifier=_check_is_binary_classifier(
                                job.labelling.type))

    results = _prepare_results(results_df, auc)

    return results, model_split
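The TODO above asks for an "intelligent get_or_create". A hedged sketch of what that could look like with Django's get_or_create, reusing the Encoding fields from the create call above; this is an assumption, not the project's code, and it changes behaviour by reusing an existing identical Encoding row instead of always creating a new one:

# Hypothetical replacement for the Encoding.objects.create block above.
encoding, _ = Encoding.objects.get_or_create(
    data_encoding=job.encoding.data_encoding,
    value_encoding=job.encoding.value_encoding,
    add_elapsed_time=job.encoding.add_elapsed_time,
    add_remaining_time=job.encoding.add_remaining_time,
    add_executed_events=job.encoding.add_executed_events,
    add_resources_used=job.encoding.add_resources_used,
    add_new_traces=job.encoding.add_new_traces,
    prefix_length=job.encoding.prefix_length,
    padding=job.encoding.padding,
    task_generation_type=job.encoding.task_generation_type,
    defaults={'features': list(train_data.columns.values)},
)
job.encoding = encoding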
Example #13
def replay_prediction_task(replay_prediction_job: Job, training_initial_job: Job, log: Log):
    """ The function create a replat prediction task to ask a single prediction to the server for a portion of a trace

        :param replay_prediction_job: job dictionary
        :param training_initial_job: job dictionary
        :param log: job dictionary
    """
    logger.info("Start replay_prediction task ID {}".format(replay_prediction_job.id))
    try:
        replay_prediction_job.status = JobStatuses.RUNNING.value
        replay_prediction_job.save()
        max_len = max(len(trace) for trace in log)
        if replay_prediction_job.encoding.prefix_length != max_len:
            prediction_job = create_prediction_job(training_initial_job, max_len)
            prediction_task(prediction_job.id)
            prediction_job.refresh_from_db()
            new_replay_prediction_job = duplicate_orm_row(prediction_job)
            new_replay_prediction_job.split = Split.objects.filter(pk=replay_prediction_job.split.id)[0]
            new_replay_prediction_job.type = JobTypes.REPLAY_PREDICT.value
            new_replay_prediction_job.parent_job = replay_prediction_job.parent_job
            new_replay_prediction_job.status = JobStatuses.CREATED.value
            replay_prediction_task(new_replay_prediction_job, prediction_job, log)
            return
        result_dict, events_for_trace = replay_prediction_calculate(replay_prediction_job, log)
        replay_prediction_job.results = dict(result_dict)
        replay_prediction_job.event_number = dict(events_for_trace)
        replay_prediction_job.status = JobStatuses.COMPLETED.value
        replay_prediction_job.error = ''
    except Exception as e:
        logger.error(e)
        replay_prediction_job.status = JobStatuses.ERROR.value
        replay_prediction_job.error = str(e.__repr__())
        raise e
    finally:
        replay_prediction_job.save()
        publish(replay_prediction_job)
Example #14
def get_encoded_logs(job: Job,
                     use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job configuration, loading from cache if possible
    :param job: job configuration
    :param use_cache: whether to load saved datasets from the cache
    :return: training and testing DataFrame

    """
    logger.info('\tGetting Dataset')

    if use_cache and \
        (job.predictive_model is not None and
         job.predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value):

        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            try:
                training_df, test_df = get_labelled_logs(job)
            except FileNotFoundError:  #cache invalidation
                LabelledLog.objects.filter(split=job.split,
                                           encoding=job.encoding,
                                           labelling=job.labelling).delete()
                logger.info('\t\tError pre-labeled cache invalidated!')
                return get_encoded_logs(job, use_cache)
        else:
            if job.split.train_log is not None and \
               job.split.test_log is not None and \
               LoadedLog.objects.filter(split=job.split).exists():
                try:
                    training_log, test_log, additional_columns = get_loaded_logs(
                        job.split)
                except FileNotFoundError:  # cache invalidation
                    LoadedLog.objects.filter(split=job.split).delete()
                    logger.info('\t\tError pre-loaded cache invalidated!')
                    return get_encoded_logs(job, use_cache)
            else:
                training_log, test_log, additional_columns = get_train_test_log(
                    job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    search_for_already_existing_split = Split.objects.filter(
                        type=SplitTypes.SPLIT_DOUBLE.value,
                        original_log=job.split.original_log,
                        test_size=job.split.test_size,
                        splitting_method=job.split.splitting_method)
                    if len(search_for_already_existing_split) >= 1:
                        job.split = search_for_already_existing_split[0]
                        job.split.save()
                        job.save()
                        return get_encoded_logs(job, use_cache=use_cache)
                    else:
                        # job.split = duplicate_orm_row(Split.objects.filter(pk=job.split.pk)[0])  #todo: replace with simple CREATE
                        job.split = Split.objects.create(
                            type=job.split.type,
                            original_log=job.split.original_log,
                            test_size=job.split.test_size,
                            splitting_method=job.split.splitting_method,
                            train_log=job.split.train_log,
                            test_log=job.split.test_log,
                            additional_columns=job.split.additional_columns
                        )  #todo: futurebug if object changes
                        job.split.type = SplitTypes.SPLIT_DOUBLE.value
                        train_name = 'SPLITTED_' + job.split.original_log.name.split(
                            '.')[0] + '_0-' + str(
                                int(100 - (job.split.test_size * 100)))
                        job.split.train_log = create_log(
                            EventLog(training_log), train_name + '.xes')
                        test_name = 'SPLITTED_' + job.split.original_log.name.split(
                            '.')[0] + '_' + str(
                                int(100 -
                                    (job.split.test_size * 100))) + '-100'
                        job.split.test_log = create_log(
                            EventLog(test_log), test_name + '.xes')
                        job.split.additional_columns = str(
                            train_name +
                            test_name)  # TODO: find better naming policy
                        job.split.save()

                put_loaded_logs(job.split, training_log, test_log,
                                additional_columns)

            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = get_train_test_log(
            job.split)
        training_df, test_df = encode_label_logs(
            training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
Example #15
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations
    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """

    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()).
        performance_metric)  #Todo: WHY DO I NEED TO GET HYPEROPT?
                )

    global train_df, validation_df, test_df, global_job
    global_job = job
    train_df, test_df = get_encoded_logs(job)
    train_df, validation_df, test_df = _retrieve_train_validate_test(
        train_df, test_df)

    train_start_time = time.time()

    space = _get_space(job)

    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower(
        )).max_evaluations  #Todo: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()

    algorithm = OPTIMISATION_ALGORITHM[
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower(
            )).algorithm_type]
    _run_hyperoptimisation(space, algorithm.suggest, max_evaluations, trials)

    best_candidate = trials.best_trial['result']

    job.predictive_model = PredictiveModel.objects.filter(
        pk=best_candidate['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    best_candidate['results']['elapsed_time'] = timedelta(
        seconds=time.time() -
        train_start_time)  # todo find better place for this
    job.evaluation.elapsed_time = best_candidate['results']['elapsed_time']
    job.evaluation.save()

    results_df, auc = _test_best_candidate(
        best_candidate, job.labelling.type,
        job.predictive_model.predictive_model)
    if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
        results = classification_prepare_results(results_df, auc)
    else:
        results = regression_prepare_results(results_df, job.labelling)
    results['elapsed_time'] = job.evaluation.elapsed_time
    job.evaluation = Evaluation.init(job.predictive_model.predictive_model,
                                     results,
                                     len(set(validation_df['label'])) <= 2)
    job.evaluation.save()
    job.save()

    logger.info(
        "End hyperopt job {}, {}. \n\tResults on validation {}. \n\tResults on test {}."
        .format(job.type, get_run(job), best_candidate['results'], results))  #
    return results, best_candidate['config'], best_candidate['model_split']
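The helper _retrieve_train_validate_test is not shown here. A plausible sketch that mirrors the inline holdout logic of the calculate_hyperopt variant in Example #18 below; offered as an assumption, not the project's actual implementation:

from pandas import DataFrame

def _retrieve_train_validate_test(train_df: DataFrame, test_df: DataFrame):
    # Keep the original test set as the validation set and hold out the last
    # 20% of the training data as the new internal test set.
    validation_df = test_df
    holdout_df = train_df.tail(int(len(train_df) * 20 / 100))
    return train_df.drop(holdout_df.index), validation_df, holdout_df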
Example #16
def run_by_type(training_df: DataFrame, test_df: DataFrame, job: Job) -> (dict, dict):
    """runs the specified training/evaluation run

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: results and predictive_model split

    """
    model_split = None

    start_time = time.time()
    if job.type == JobTypes.PREDICTION.value:
        clusterer = init_clusterer(job.clustering, training_df)
        results, model_split = MODEL[job.predictive_model.predictive_model][ModelActions.BUILD_MODEL_AND_TEST.value](training_df, test_df, clusterer, job)
    elif job.type == JobTypes.LABELLING.value:
        results = _label_task(training_df)
    elif job.type == JobTypes.UPDATE.value:
        results, model_split = MODEL[job.predictive_model.predictive_model][ModelActions.UPDATE_AND_TEST.value](training_df, test_df, job)
    else:
        raise ValueError("Type {} not supported".format(job.type))

    # TODO: integrateme
    if job.type != JobTypes.LABELLING.value:
        results['elapsed_time'] = timedelta(seconds=time.time() - start_time) #todo find better place for this
        if job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model,
                results
            )
        elif job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model,
                results,
                len(set(test_df['label'])) <= 2
            )
        elif job.predictive_model.predictive_model == PredictiveModels.TIME_SERIES_PREDICTION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model,
                results
            )
        job.evaluation.save()
        job.save()
    elif job.type == JobTypes.LABELLING.value:
        # job.labelling = duplicate_orm_row(job.labelling) #todo: replace with simple CREATE
        job.labelling = Labelling.objects.create(
            type=job.labelling.type,
            attribute_name=job.labelling.attribute_name,
            threshold_type=job.labelling.threshold_type,
            threshold=job.labelling.threshold
        ) #todo: futurebug if object changes
        job.labelling.results = results
        job.labelling.save()
        job.save()

    # if job.type == PredictiveModels.CLASSIFICATION.value: #todo this is an old workaround I should remove this
    #     save_result(results, job, start_time)

    logger.info("End job {}, {} .".format(job.type, get_run(job)))
    logger.info("\tResults {} .".format(results))
    return results, model_split
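MODEL and ModelActions are defined elsewhere in the project; the dispatch-dictionary pattern they implement can be shown with a toy, standalone table (hypothetical keys and callables, not the real MODEL):

# Toy dispatch table mimicking MODEL[predictive_model][action](...) above.
MODEL = {
    'classification': {
        'build_model_and_test': lambda train, test, clusterer, job: ({'acc': 1.0}, 'split'),
        'update_and_test': lambda train, test, job: ({'acc': 0.9}, 'split'),
    },
}

results, model_split = MODEL['classification']['build_model_and_test'](
    None, None, None, None)
print(results)  # {'acc': 1.0}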
Example #17
def replay_prediction_task(replay_prediction_job: Job,
                           training_initial_job: Job, log: Log):
    """ The function create a replat prediction task to ask a single prediction to the server for a portion of a trace

        :param replay_prediction_job: job dictionary
        :param training_initial_job: job dictionary
        :param log: job dictionary
    """
    logger.info("Start replay_prediction task ID {}".format(
        replay_prediction_job.id))
    try:
        replay_prediction_job.status = JobStatuses.RUNNING.value
        replay_prediction_job.save()
        max_len = max(len(trace) for trace in log)
        if replay_prediction_job.encoding.prefix_length != max_len:
            prediction_job = create_prediction_job(training_initial_job,
                                                   max_len)
            prediction_task(prediction_job.id)
            prediction_job.refresh_from_db()
            # new_replay_prediction_job = duplicate_orm_row(prediction_job)  #todo: replace with simple CREATE
            new_replay_prediction_job = Job.objects.create(
                created_date=prediction_job.created_date,
                modified_date=prediction_job.modified_date,
                error=prediction_job.error,
                status=prediction_job.status,
                type=prediction_job.type,
                create_models=prediction_job.create_models,
                case_id=prediction_job.case_id,
                event_number=prediction_job.event_number,
                gold_value=prediction_job.gold_value,
                results=prediction_job.results,
                parent_job=prediction_job.parent_job,
                split=prediction_job.split,
                encoding=prediction_job.encoding,
                labelling=prediction_job.labelling,
                clustering=prediction_job.clustering,
                predictive_model=prediction_job.predictive_model,
                evaluation=prediction_job.evaluation,
                hyperparameter_optimizer=prediction_job.
                hyperparameter_optimizer,
                incremental_train=prediction_job.incremental_train)
            new_replay_prediction_job.split = Split.objects.filter(
                pk=replay_prediction_job.split.id)[0]
            new_replay_prediction_job.type = JobTypes.REPLAY_PREDICT.value
            new_replay_prediction_job.parent_job = replay_prediction_job.parent_job
            new_replay_prediction_job.status = JobStatuses.CREATED.value
            replay_prediction_task(new_replay_prediction_job, prediction_job,
                                   log)
            return
        result_dict, events_for_trace = replay_prediction_calculate(
            replay_prediction_job, log)
        replay_prediction_job.results = dict(result_dict)
        replay_prediction_job.event_number = dict(events_for_trace)
        replay_prediction_job.status = JobStatuses.COMPLETED.value
        replay_prediction_job.error = ''
    except Exception as e:
        logger.error(e)
        replay_prediction_job.status = JobStatuses.ERROR.value
        replay_prediction_job.error = str(e.__repr__())
        raise e
    finally:
        replay_prediction_job.save()
        publish(replay_prediction_job)
Example #18
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations
    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """

    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).performance_metric) #Todo: WHY DO I NEED TO GET HYPEROPT?
    )

    global training_df, test_df, global_job
    global_job = job
    training_df, test_df = get_encoded_logs(job)
    #TODO evaluate on validation set
    if holdout:
        validation_df = test_df
        # test_df = training_df.sample(frac=.2)
        test_df = training_df.tail(int(len(training_df) * 20 / 100))
        training_df = training_df.drop(test_df.index)

    train_start_time = time.time()

    space = _get_space(job)

    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).max_evaluations #Todo: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()

    algorithm = _choose_algorithm(job)

    try:
        fmin(_calculate_and_evaluate, space, algo=algorithm.suggest, max_evals=max_evaluations, trials=trials)
    except ValueError:
        raise ValueError("All jobs failed, cannot find best configuration")
    current_best = {'loss': 100, 'results': {}, 'predictive_model_id': {}, 'model_split': {}, 'config': {}}
    for trial in trials:
        a = trial['result']
        if current_best['loss'] > a['loss']:
            current_best = a

    job.predictive_model = PredictiveModel.objects.filter(pk=current_best['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    current_best['results']['elapsed_time'] = timedelta(seconds=time.time() - train_start_time)  # todo find better place for this
    job.evaluation.elapsed_time = current_best['results']['elapsed_time']
    job.evaluation.save()

    #TODO evaluate on validation set
    if holdout:
        results_df, auc = _test(
            current_best['model_split'],
            validation_df.drop(['trace_id'], axis=1),
            evaluation=True,
            is_binary_classifier=_check_is_binary_classifier(job.labelling.type)
        )
        results = _prepare_results(results_df, auc)
        results['elapsed_time'] = job.evaluation.elapsed_time
        job.evaluation = Evaluation.init(
            job.predictive_model.predictive_model,
            results,
            len(set(test_df['label'])) <= 2
        )
        job.evaluation.save()
        job.save()

    if holdout:
        logger.info("End hyperopt job {}, {}. \n\tResults on test {}. \n\tResults on validation {}.".format(job.type, get_run(job), current_best['results'], results))
        return results, current_best['config'], current_best['model_split']
    else:
        logger.info("End hyperopt job {}, {}. \n\tResults on test {}.".format(job.type, get_run(job), current_best['results']))
        return current_best['results'], current_best['config'], current_best['model_split']
Example #19
def replay_core(replay_job: Job, training_initial_job: Job) -> list:
    """The function create a set with timestamps of events, then create a list of requests
        simulating the log in the time passing

        :param replay_job: job dictionary
        :param training_initial_job: job dictionary
        :return: List of requests
    """

    split = replay_job.split
    log = get_log(split.train_log)
    requests_list = list()

    eventlog = EventLog()
    for key in log.attributes.keys():
        eventlog.attributes[key] = log.attributes[key]
    for trace in log:
        new_trace = Trace(trace)
        for key in trace.attributes:
            new_trace.attributes[key] = trace.attributes[key]
        eventlog.append(new_trace)

    times = sorted(
        set([event['time:timestamp'] for trace in eventlog
             for event in trace]))

    for t in times[2::int((len(times) - 2) / 5)]:
        filtered_eventlog = timestamp_filter.apply_events(
            eventlog, times[0].replace(tzinfo=None), t.replace(tzinfo=None))
        trace_list = list()
        event_number = dict()
        for trace in filtered_eventlog:
            trace_list.append(trace.attributes['concept:name'])
            event_number[trace.attributes['concept:name']] = len(trace)
        replay_job.case_id = trace_list
        replay_job.event_number = event_number
        replay_job.save()
        try:  #TODO check logger usage
            logger.info("Sending request for replay_prediction task.")
            r = requests.post(
                url="http://server:8000/runtime/replay_prediction/",
                data=export_log_as_string(filtered_eventlog),
                params={
                    'jobId': replay_job.id,
                    'training_job': training_initial_job.id
                },
                headers={
                    'Content-Type': 'text/plain',
                    'charset': 'UTF-8'
                })
            requests_list.append(str(r))
        except Exception as e:
            requests_list.append(str(e))
            logger.warning(str(e))

    training_log, test_log, additional_columns = get_train_test_log(
        replay_job.split)
    training_df, _ = encode_label_logs(training_log,
                                       test_log,
                                       replay_job,
                                       additional_columns=additional_columns)

    gold_values = dict(zip(training_df['trace_id'], training_df['label']))
    parent_id = replay_job.id
    # final_job = duplicate_orm_row(replay_job)  #todo: replace with simple CREATE
    final_job = Job.objects.create(
        created_date=replay_job.created_date,
        modified_date=replay_job.modified_date,
        error=replay_job.error,
        status=replay_job.status,
        type=replay_job.type,
        create_models=replay_job.create_models,
        case_id=replay_job.case_id,
        event_number=replay_job.event_number,
        gold_value=replay_job.gold_value,
        results=replay_job.results,
        parent_job=replay_job.parent_job,
        split=replay_job.split,
        encoding=replay_job.encoding,
        labelling=replay_job.labelling,
        clustering=replay_job.clustering,
        predictive_model=replay_job.predictive_model,
        evaluation=replay_job.evaluation,
        hyperparameter_optimizer=replay_job.hyperparameter_optimizer,
        incremental_train=replay_job.incremental_train)
    final_job.parent_job = Job.objects.filter(pk=parent_id)[0]
    final_job.gold_value = gold_values
    final_job.type = JobTypes.REPLAY_PREDICT.value
    final_job.save()
    return requests_list
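A worked example (hypothetical data) of the snapshot sampling `times[2::int((len(times) - 2) / 5)]` used above:

times = list(range(100))           # pretend: 100 sorted event timestamps
step = int((len(times) - 2) / 5)   # 19
print(times[2::step])              # [2, 21, 40, 59, 78, 97] -> 6 snapshots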
Example #20
def check_predictive_model_not_overwrite(job: Job) -> None:
    if job.hyperparameter_optimizer.optimization_method != HyperparameterOptimizationMethods.NONE.value:
        job.predictive_model = duplicate_orm_row(PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0])
        job.predictive_model.save()
        job.save()