Example #1
def update_and_test(training_df: DataFrame, test_df: DataFrame, job: Job):
    """incremental update entry point

    updates the existing predictive model with the training data, then tests it

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: predictive_model scores and split

    """
    train_data = _drop_columns(training_df)
    test_data = _drop_columns(test_df)

    job.encoding = job.incremental_train.encoding
    job.encoding.save()
    job.save()

    if list(train_data.columns.values) != job.incremental_train.encoding.features:
        # TODO: how do I align the two feature vectors?
        # align train and test data to the feature set stored in the encoding,
        # zero-filling any features missing from the current frames
        train_data, _ = train_data.align(
            pd.DataFrame(columns=job.incremental_train.encoding.features),
            axis=1,
            join='right')
        train_data = train_data.fillna(0)
        test_data, _ = test_data.align(
            pd.DataFrame(columns=job.incremental_train.encoding.features),
            axis=1,
            join='right')
        test_data = test_data.fillna(0)

    # TODO: UPDATE if incremental, otherwise just test
    model_split = _update(job, train_data)

    results_df, auc = _test(model_split,
                            test_data,
                            evaluation=True,
                            is_binary_classifier=_check_is_binary_classifier(
                                job.labelling.type))

    results = _prepare_results(results_df, auc)

    return results, model_split
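
For context, the alignment step above relies on pandas' DataFrame.align with join='right', which reindexes the data to the reference feature set, while fillna(0) zero-fills any feature the current frame lacks. A minimal standalone sketch of the same pattern, with illustrative column names that are not from the project:

import pandas as pd

# feature set the stored encoding expects (illustrative names)
expected_features = ['prefix_1', 'prefix_2', 'elapsed_time']

# current data lacks 'elapsed_time' and carries an extra column
data = pd.DataFrame({'prefix_1': [1, 2], 'prefix_2': [3, 4], 'extra': [5, 6]})

# keep only the expected columns, in the expected order
aligned, _ = data.align(pd.DataFrame(columns=expected_features),
                        axis=1,
                        join='right')
aligned = aligned.fillna(0)  # the missing 'elapsed_time' becomes a column of zeros
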
Example #2
def regression(training_df: DataFrame, test_df: DataFrame,
               clusterer: Clustering, job: Job) -> (dict, dict):
    """main regression entry point

    trains and tests the regressor using the provided data

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param clusterer: clustering model used during training
    :param job: job configuration
    :return: predictive_model scores and split

    """
    train_data, test_data = _prep_data(training_df, test_df)

    job.encoding = duplicate_orm_row(
        Encoding.objects.filter(pk=job.encoding.pk)[0]
    )  # TODO: maybe an intelligent get_or_create would be better here...
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_regressor(job), clusterer)
    results_df = _test(model_split, test_data)

    results = calculate_results_regression(results_df, job.labelling)

    return results, model_split
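
duplicate_orm_row is a project helper that is not shown in these snippets. A common Django idiom for copying a model row, which such a helper plausibly wraps (an assumption, not the project's actual implementation), is to clear the primary key and save, so Django inserts a new row with the same field values:

def duplicate_orm_row(instance):
    # assumption: copy a Django model row by clearing its primary key;
    # on save() Django performs an INSERT, producing a fresh pk
    instance.pk = None
    instance.save()
    return instance

Note that this only copies the row's own column values; related objects such as many-to-many links are not duplicated.
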
Example #3
def classification(training_df: DataFrame, test_df: DataFrame,
                   clusterer: Clustering, job: Job) -> (dict, dict):
    """main classification entry point

    trains and tests the classifier using the provided data

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param clusterer: clustering model used during training
    :param job: job configuration
    :return: predictive_model scores and split

    """
    train_data = _drop_columns(training_df)
    test_data = _drop_columns(test_df)

    job.encoding = duplicate_orm_row(
        job.encoding
    )  # TODO: maybe an intelligent get_or_create would be better here...
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_classifier(job), clusterer)
    results_df, auc = _test(model_split,
                            test_data,
                            evaluation=True,
                            is_binary_classifier=_check_is_binary_classifier(
                                job.labelling.type))

    results = _prepare_results(results_df, auc)

    return results, model_split
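
The is_binary_classifier flag drives the evaluation performed by _test, which returns an AUC score. For context only, computing ROC AUC for a binary classifier with scikit-learn typically looks like the sketch below; this is a generic illustration, not the project's _test implementation:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# toy binary-labelled data, purely illustrative
X = np.array([[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]])
y = np.array([0, 0, 0, 1, 1, 1])

model = LogisticRegression().fit(X, y)

# AUC is computed from the predicted probability of the positive class
auc = roc_auc_score(y, model.predict_proba(X)[:, 1])
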
Example #4
def cross_validated_regression(training_df: DataFrame,
                               test_df: DataFrame,
                               clusterer: Clustering,
                               job: Job,
                               cv=2) -> (dict, dict):
    """main regression entry point

    trains and tests the regressor using the provided data

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param clusterer: clustering model used during training
    :param job: job configuration
    :param cv: number of cross-validation folds
    :return: predictive_model scores and split

    """
    train_data, test_data = _prep_data(training_df, test_df)

    # job.encoding = duplicate_orm_row(Encoding.objects.filter(pk=job.encoding.pk)[0])  # TODO: maybe here would be better an intelligent get_or_create...
    job.encoding = Encoding.objects.create(
        data_encoding=job.encoding.data_encoding,
        value_encoding=job.encoding.value_encoding,
        add_elapsed_time=job.encoding.add_elapsed_time,
        add_remaining_time=job.encoding.add_remaining_time,
        add_executed_events=job.encoding.add_executed_events,
        add_resources_used=job.encoding.add_resources_used,
        add_new_traces=job.encoding.add_new_traces,
        features=job.encoding.features,
        prefix_length=job.encoding.prefix_length,
        padding=job.encoding.padding,
        task_generation_type=job.encoding.task_generation_type)
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data,
                         _choose_regressor(job),
                         clusterer,
                         do_cv=True)
    results_df = _test(model_split, test_data)

    results = calculate_results_regression(results_df, job.labelling)

    return results, model_split
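
The TODO above (and in the previous examples) hints at replacing the field-by-field copy with an "intelligent get_or_create". A hedged sketch of what that could look like with Django's built-in QuerySet.get_or_create, assuming the listed fields are enough to identify an equivalent Encoding (an assumption about the model, not something the project states):

job.encoding, _created = Encoding.objects.get_or_create(
    data_encoding=job.encoding.data_encoding,
    value_encoding=job.encoding.value_encoding,
    add_elapsed_time=job.encoding.add_elapsed_time,
    add_remaining_time=job.encoding.add_remaining_time,
    add_executed_events=job.encoding.add_executed_events,
    add_resources_used=job.encoding.add_resources_used,
    add_new_traces=job.encoding.add_new_traces,
    prefix_length=job.encoding.prefix_length,
    padding=job.encoding.padding,
    task_generation_type=job.encoding.task_generation_type,
    # the computed feature list is only applied when a new row is created
    defaults={'features': list(train_data.columns.values)},
)
job.save()
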
Example #5
def classification(training_df: DataFrame, test_df: DataFrame,
                   clusterer: Clustering, job: Job) -> (dict, dict):
    """main classification entry point

    trains and tests the classifier using the provided data

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param clusterer: clustering model used during training
    :param job: job configuration
    :return: predictive_model scores and split

    """
    train_data = _drop_columns(training_df)
    test_data = _drop_columns(test_df)

    # job.encoding = duplicate_orm_row(Encoding.objects.filter(pk=job.encoding.pk)[0])  # TODO: maybe here would be better an intelligent get_or_create...
    job.encoding = Encoding.objects.create(
        data_encoding=job.encoding.data_encoding,
        value_encoding=job.encoding.value_encoding,
        add_elapsed_time=job.encoding.add_elapsed_time,
        add_remaining_time=job.encoding.add_remaining_time,
        add_executed_events=job.encoding.add_executed_events,
        add_resources_used=job.encoding.add_resources_used,
        add_new_traces=job.encoding.add_new_traces,
        features=job.encoding.features,
        prefix_length=job.encoding.prefix_length,
        padding=job.encoding.padding,
        task_generation_type=job.encoding.task_generation_type)
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_classifier(job), clusterer)
    results_df, auc = _test(model_split,
                            test_data,
                            evaluation=True,
                            is_binary_classifier=_check_is_binary_classifier(
                                job.labelling.type))

    results = _prepare_results(results_df, auc)

    return results, model_split
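
_drop_columns is another project helper that is not shown here; presumably it strips non-feature columns (for example the label and the trace identifier) before training. A purely hypothetical sketch under that assumption, with column names that are guesses rather than the project's actual ones:

from pandas import DataFrame


def _drop_columns(df: DataFrame) -> DataFrame:
    # assumption: 'trace_id' and 'label' are the non-feature columns;
    # errors='ignore' keeps this safe if a column is already absent
    return df.drop(columns=['trace_id', 'label'], errors='ignore')
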