Example #1
def run_by_type(training_df: DataFrame, test_df: DataFrame,
                job: Job) -> (dict, dict):
    """runs the specified training/evaluation run

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: results and predictive_model split

    """
    model_split = None

    start_time = time.time()
    if job.type == JobTypes.PREDICTION.value:
        clusterer = _init_clusterer(job.clustering, training_df)
        if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            results, model_split = classification(training_df, test_df,
                                                  clusterer, job)
        elif job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            results, model_split = regression(training_df, test_df, clusterer,
                                              job)
        elif job.predictive_model.predictive_model == PredictiveModels.TIME_SERIES_PREDICTION.value:
            results, model_split = time_series_prediction(
                training_df, test_df, clusterer, job)
    elif job.type == JobTypes.LABELLING.value:
        results = _label_task(training_df)
    elif job.type == JobTypes.UPDATE.value:
        results, model_split = update_and_test(training_df, test_df, job)
    else:
        raise ValueError("Type {} not supported".format(job.type))

    # TODO: integrateme
    if job.type != JobTypes.LABELLING.value:
        results['elapsed_time'] = timedelta(
            seconds=time.time() - start_time)  #todo find better place for this
        if job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model, results)
        elif job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model, results,
                len(model_split[ModelType.CLASSIFIER.value][0].classes_) <= 2)
        job.evaluation.save()
    elif job.type == JobTypes.LABELLING.value:
        job.labelling.results = results
        job.labelling.save()

    if job.type == PredictiveModels.CLASSIFICATION.value:  #todo this is an old workaround I should remove this
        save_result(results, job, start_time)

    print("End job {}, {} .".format(job.type, get_run(job)))
    print("\tResults {} .".format(results))
    return results, model_split
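
The example measures wall-clock time with time.time() and stores the difference in the results dict as a timedelta. Below is a minimal, self-contained sketch of that pattern; run_training is a hypothetical stand-in for the classification/regression/time_series_prediction calls above.

import time
from datetime import timedelta

def run_training() -> dict:
    # hypothetical stand-in for classification()/regression()/time_series_prediction()
    time.sleep(0.1)
    return {'accuracy': 0.9}

start_time = time.time()
results = run_training()
# store elapsed wall-clock time as a timedelta, as run_by_type does
results['elapsed_time'] = timedelta(seconds=time.time() - start_time)
print(results['elapsed_time'])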
Example #2
def run_by_type(training_df: DataFrame, test_df: DataFrame,
                job: Job) -> (dict, dict):
    """runs the specified training/evaluation run

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: results and predictive_model split

    """
    model_split = None

    # TODO fixme this needs to be fixed in the interface
    # if job['incremental_train']['base_model'] is not None:
    #     job['type'] = JobTypes.UPDATE.value

    start_time = time.time()
    if job.type == JobTypes.PREDICTION.value:
        clusterer = _init_clusterer(job.clustering, training_df)
        if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            results, model_split = classification(training_df, test_df,
                                                  clusterer, job)
        elif job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            results, model_split = regression(training_df, test_df, clusterer,
                                              job)
        elif job.predictive_model.predictive_model == PredictiveModels.TIME_SERIES_PREDICTION.value:
            results, model_split = time_series_prediction(
                training_df, test_df, clusterer, job)
    elif job.type == JobTypes.LABELLING.value:
        results = _label_task(training_df)
    elif job.type == JobTypes.UPDATE.value:
        results, model_split = update_and_test(training_df, test_df, job)
    else:
        raise ValueError("Type {} not supported".format(job.type))

    # TODO: integrateme
    if job.type != JobTypes.LABELLING.value:
        if job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model, results)
        elif job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model, results,
                len(model_split[job.predictive_model.predictive_model]
                    [0].classes_) <= 2)
        job.save()

    if job.type == PredictiveModels.CLASSIFICATION.value:
        save_result(results, job, start_time)

    print("End job {}, {} .".format(job.type, get_run(job)))
    print("\tResults {} .".format(results))
    return results, model_split
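
Both variants decide whether the classification problem is binary by counting the fitted model's classes_ attribute (len(...) <= 2). A small sketch of that check, assuming (as the classes_ attribute suggests) that the first element of the model split is a fitted scikit-learn estimator:

from sklearn.linear_model import LogisticRegression

X = [[0.0], [0.1], [0.9], [1.0]]
y = [0, 0, 1, 1]  # two distinct labels -> binary problem

clf = LogisticRegression().fit(X, y)
# classes_ holds the unique labels seen during fit;
# run_by_type uses this count to flag binary classification for Evaluation.init
is_binary = len(clf.classes_) <= 2
print(is_binary)  # True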
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations
    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """

    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()).
        performance_metric)  #Todo: WHY DO I NEED TO GET HYPEROPT?
                )

    global train_df, validation_df, test_df, global_job
    global_job = job
    train_df, test_df = get_encoded_logs(job)
    train_df, validation_df, test_df = _retrieve_train_validate_test(
        train_df, test_df)

    train_start_time = time.time()

    space = _get_space(job)

    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower(
        )).max_evaluations  #Todo: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()

    algorithm = OPTIMISATION_ALGORITHM[
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower(
            )).algorithm_type]
    _run_hyperoptimisation(space, algorithm.suggest, max_evaluations, trials)

    best_candidate = trials.best_trial['result']

    job.predictive_model = PredictiveModel.objects.filter(
        pk=best_candidate['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    best_candidate['results']['elapsed_time'] = timedelta(
        seconds=time.time() -
        train_start_time)  # todo find better place for this
    job.evaluation.elapsed_time = best_candidate['results']['elapsed_time']
    job.evaluation.save()

    results_df, auc = _test_best_candidate(
        best_candidate, job.labelling.type,
        job.predictive_model.predictive_model)
    if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
        results = classification_prepare_results(results_df, auc)
    else:
        results = regression_prepare_results(results_df, job.labelling)
    results['elapsed_time'] = job.evaluation.elapsed_time
    job.evaluation = Evaluation.init(job.predictive_model.predictive_model,
                                     results,
                                     len(set(validation_df['label'])) <= 2)
    job.evaluation.save()
    job.save()

    logger.info(
        "End hyperopt job {}, {}. \n\tResults on validation {}. \n\tResults on test {}."
        .format(job.type, get_run(job), best_candidate['results'], results))  #
    return results, best_candidate['config'], best_candidate['model_split']
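
The hyperopt flow above boils down to: build a search space, run fmin with a suggestion algorithm and a Trials object, then read trials.best_trial['result'] for the lowest-loss trial. A minimal sketch with a toy objective (the quadratic loss is purely illustrative; in calculate_hyperopt the objective trains and evaluates a model):

from hyperopt import fmin, hp, tpe, Trials, STATUS_OK

def objective(params):
    # toy loss; calculate_hyperopt trains a model and returns its evaluation here
    loss = (params['x'] - 3) ** 2
    return {'loss': loss, 'status': STATUS_OK, 'config': params}

space = {'x': hp.uniform('x', -10, 10)}
trials = Trials()
fmin(objective, space, algo=tpe.suggest, max_evals=50, trials=trials)

best_candidate = trials.best_trial['result']  # lowest-loss trial, as used above
print(best_candidate['config'])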
Example #4
def run_by_type(training_df: DataFrame, test_df: DataFrame, job: Job) -> (dict, dict):
    """runs the specified training/evaluation run

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: results and predictive_model split

    """
    model_split = None

    start_time = time.time()
    if job.type == JobTypes.PREDICTION.value:
        clusterer = init_clusterer(job.clustering, training_df)
        results, model_split = MODEL[job.predictive_model.predictive_model][ModelActions.BUILD_MODEL_AND_TEST.value](training_df, test_df, clusterer, job)
    elif job.type == JobTypes.LABELLING.value:
        results = _label_task(training_df)
    elif job.type == JobTypes.UPDATE.value:
        results, model_split = MODEL[job.predictive_model.predictive_model][ModelActions.UPDATE_AND_TEST.value](training_df, test_df, job)
    else:
        raise ValueError("Type {} not supported".format(job.type))

    # TODO: integrateme
    if job.type != JobTypes.LABELLING.value:
        results['elapsed_time'] = timedelta(seconds=time.time() - start_time) #todo find better place for this
        if job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model,
                results
            )
        elif job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model,
                results,
                len(set(test_df['label'])) <= 2
            )
        elif job.predictive_model.predictive_model == PredictiveModels.TIME_SERIES_PREDICTION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model,
                results
            )
        job.evaluation.save()
        job.save()
    elif job.type == JobTypes.LABELLING.value:
        # job.labelling = duplicate_orm_row(job.labelling) #todo: replace with simple CREATE
        job.labelling = Labelling.objects.create(
            type=job.labelling.type,
            attribute_name=job.labelling.attribute_name,
            threshold_type=job.labelling.threshold_type,
            threshold=job.labelling.threshold
        ) #todo: futurebug if object changes
        job.labelling.results = results
        job.labelling.save()
        job.save()

    # if job.type == PredictiveModels.CLASSIFICATION.value: #todo this is an old workaround I should remove this
    #     save_result(results, job, start_time)

    logger.info("End job {}, {} .".format(job.type, get_run(job)))
    logger.info("\tResults {} .".format(results))
    return results, model_split
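
Example #4 replaces the if/elif chains of the earlier variants with a nested dispatch table: MODEL[predictive_model][action] resolves to the function to call. A minimal sketch of that pattern, using hypothetical stub functions and plain string keys in place of the project's enums:

# hypothetical stubs standing in for classification(), regression(), update_and_test(), ...
def build_and_test_classifier(train, test):
    return {'kind': 'classification'}, ('clf', None)

def build_and_test_regressor(train, test):
    return {'kind': 'regression'}, ('reg', None)

# nested dispatch table: model type -> action -> callable, mirroring
# MODEL[job.predictive_model.predictive_model][ModelActions.BUILD_MODEL_AND_TEST.value]
MODEL = {
    'classification': {'build_model_and_test': build_and_test_classifier},
    'regression': {'build_model_and_test': build_and_test_regressor},
}

results, model_split = MODEL['classification']['build_model_and_test']([], [])
print(results)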
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations
    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """

    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).performance_metric) #Todo: WHY DO I NEED TO GET HYPEROPT?
    )

    global training_df, test_df, global_job
    global_job = job
    training_df, test_df = get_encoded_logs(job)
    #TODO evaluate on validation set
    if holdout:
        validation_df = test_df
        # test_df = training_df.sample(frac=.2)
        test_df = training_df.tail(int(len(training_df) * 20 / 100))
        training_df = training_df.drop(test_df.index)

    train_start_time = time.time()

    space = _get_space(job)

    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).max_evaluations #Todo: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()

    algorithm = _choose_algorithm(job)

    try:
        fmin(_calculate_and_evaluate, space, algo=algorithm.suggest, max_evals=max_evaluations, trials=trials)
    except ValueError:
        raise ValueError("All jobs failed, cannot find best configuration")
    current_best = {'loss': 100, 'results': {}, 'predictive_model_id': {}, 'model_split': {}, 'config': {}}
    for trial in trials:
        a = trial['result']
        if current_best['loss'] > a['loss']:
            current_best = a

    job.predictive_model = PredictiveModel.objects.filter(pk=current_best['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    current_best['results']['elapsed_time'] = timedelta(seconds=time.time() - train_start_time)  # todo find better place for this
    job.evaluation.elapsed_time = current_best['results']['elapsed_time']
    job.evaluation.save()

    #TODO evaluate on validation set
    if holdout:
        results_df, auc = _test(
            current_best['model_split'],
            validation_df.drop(['trace_id'], 1),
            evaluation=True,
            is_binary_classifier=_check_is_binary_classifier(job.labelling.type)
        )
        results = _prepare_results(results_df, auc)
        results['elapsed_time'] = job.evaluation.elapsed_time
        job.evaluation = Evaluation.init(
            job.predictive_model.predictive_model,
            results,
            len(set(test_df['label'])) <= 2
        )
        job.evaluation.save()
        job.save()

    if holdout:
        logger.info("End hyperopt job {}, {}. \n\tResults on test {}. \n\tResults on validation {}.".format(job.type, get_run(job), current_best['results'], results))
        return results, current_best['config'], current_best['model_split']
    else:
        logger.info("End hyperopt job {}, {}. \n\tResults on test {}.".format(job.type, get_run(job), current_best['results']))
        return current_best['results'], current_best['config'], current_best['model_split']
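
The holdout branch above promotes the original test set to a validation set, then re-splits the training data: the last 20% of its rows become the new test set and are dropped from the training frame by index. A short pandas sketch of that tail-based split:

import pandas as pd

training_df = pd.DataFrame({'trace_id': range(10), 'label': [0, 1] * 5})

# same tail-based 80/20 split as above: last 20% of rows become the test set
test_df = training_df.tail(int(len(training_df) * 20 / 100))
training_df = training_df.drop(test_df.index)

print(len(training_df), len(test_df))  # 8 2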