Example #1
def set_model_name(job: Job) -> None:
    if job.create_models:
        if job.predictive_model.model_path != '':
            job.predictive_model = duplicate_orm_row(PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0])
            job.predictive_model.save()
            job.save()

        if job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value:
            job.clustering.model_path = 'cache/model_cache/job_{}-split_{}-clusterer-{}-v0.sav'.format(
                job.id,
                job.split.id,
                job.type)
            job.clustering.save()

        if job.type == JobTypes.UPDATE.value:
            job.type = JobTypes.PREDICTION.value  # TODO: why am I doing this?
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v{}.sav'.format(
                job.id,
                job.split.id,
                job.type,
                str(time.time()))
        else:
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v0.sav'.format(
                job.id,
                job.split.id,
                job.type)
        job.predictive_model.model_path = predictive_model_filename
        job.predictive_model.save()
        job.save()
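
The two branches above differ only in the version suffix of the cached model path: UPDATE jobs get a timestamp, everything else gets "-v0". A minimal, self-contained sketch of that naming convention; build_model_path is a hypothetical helper, not part of the project:

import time

def build_model_path(job_id: int, split_id: int, job_type: str, versioned: bool = False) -> str:
    # Mirrors the 'cache/model_cache/...' convention above: a timestamp suffix
    # for re-trained (UPDATE) models, '-v0' for everything else.
    version = str(time.time()) if versioned else '0'
    return 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v{}.sav'.format(
        job_id, split_id, job_type, version)

print(build_model_path(3, 7, 'prediction'))
# cache/model_cache/job_3-split_7-predictive_model-prediction-v0.sav
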
Example #2
def set_model_name(job: Job) -> None:
    if job.create_models:
        if job.predictive_model.model_path != '':
            # job.predictive_model = duplicate_orm_row(PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0])  # TODO: replace with a simple CREATE
            job.predictive_model = PredictiveModel.init(
                job.predictive_model.get_full_dict()  # TODO: double-check that get_full_dict returns every field needed
            )  # TODO: may break if the object's fields change
            job.predictive_model.save()
            job.save()

        if job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value:
            job.clustering.model_path = 'cache/model_cache/job_{}-split_{}-clusterer-{}-v0.sav'.format(
                job.id, job.split.id, job.type)
            job.clustering.save()

        if job.type == JobTypes.UPDATE.value:
            job.type = JobTypes.PREDICTION.value  # TODO: why am I doing this?
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v{}.sav'.format(
                job.id, job.split.id, job.type, str(time.time()))
        else:
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v0.sav'.format(
                job.id, job.split.id, job.type)
        job.predictive_model.model_path = predictive_model_filename
        job.predictive_model.save()
        job.save()
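
Here the commented-out duplicate_orm_row call is replaced by re-initialising the model from its full field dict, so the original row is not overwritten. A common Django idiom with the same intent is to clear the primary key before saving, which makes save() insert a new row; a minimal sketch, assuming a plain Django model instance (clone_row is illustrative, not part of the project):

from django.db import models

def clone_row(instance: models.Model) -> models.Model:
    # Clearing the primary key (and marking the instance as "adding") makes
    # save() perform an INSERT, so the original row is left untouched.
    instance.pk = None
    instance._state.adding = True
    instance.save()
    return instance
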
Example #3
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations
    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """

    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()).
        performance_metric)  # TODO: why do I need to get the hyperopt config here?
                )

    global training_df, test_df, global_job
    global_job = job
    training_df, test_df = get_encoded_logs(job)

    space = _get_space(job)

    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower(
        )).max_evaluations  # TODO: why do I need to get the hyperopt config here?
    trials = Trials()

    algorithm = _choose_algorithm(job)

    try:
        fmin(_calculate_and_evaluate,
             space,
             algo=algorithm.suggest,
             max_evals=max_evaluations,
             trials=trials)
    except ValueError:
        raise ValueError("All jobs failed, cannot find best configuration")
    current_best = {
        'loss': 100,
        'results': {},
        'predictive_model_id': {},
        'model_split': {},
        'config': {}
    }
    for trial in trials:
        result = trial['result']
        if current_best['loss'] > result['loss']:
            current_best = result

    job.predictive_model = PredictiveModel.objects.filter(
        pk=current_best['predictive_model_id'])[0]
    job.save()

    logger.info("End hyperopt job {}, {} . Results {}".format(
        job.type, get_run(job), current_best['results']))
    return current_best['results'], current_best['config'], current_best[
        'model_split']
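
The fmin call plus the manual loop over trials can be reproduced with a toy objective; hyperopt's Trials object already tracks the lowest-loss trial, which is what the loop recomputes. A small self-contained sketch, with an illustrative search space and objective:

from hyperopt import Trials, fmin, hp, tpe

space = {'x': hp.uniform('x', -5, 5)}

def objective(params):
    # Each trial returns a dict carrying at least 'loss' and 'status'.
    return {'loss': (params['x'] - 2) ** 2, 'status': 'ok'}

trials = Trials()
best_params = fmin(objective, space, algo=tpe.suggest, max_evals=50, trials=trials)
best_result = trials.best_trial['result']  # equivalent to the manual "current_best" loop
print(best_params, best_result['loss'])
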
Example #4
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations
    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """

    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()).
        performance_metric)  # TODO: why do I need to get the hyperopt config here?
                )

    global train_df, validation_df, test_df, global_job
    global_job = job
    train_df, test_df = get_encoded_logs(job)
    train_df, validation_df, test_df = _retrieve_train_validate_test(
        train_df, test_df)

    train_start_time = time.time()

    space = _get_space(job)

    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower(
        )).max_evaluations  # TODO: why do I need to get the hyperopt config here?
    trials = Trials()

    algorithm = OPTIMISATION_ALGORITHM[
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower(
            )).algorithm_type]
    _run_hyperoptimisation(space, algorithm.suggest, max_evaluations, trials)

    best_candidate = trials.best_trial['result']

    job.predictive_model = PredictiveModel.objects.filter(
        pk=best_candidate['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    best_candidate['results']['elapsed_time'] = timedelta(
        seconds=time.time() -
        train_start_time)  # todo find better place for this
    job.evaluation.elapsed_time = best_candidate['results']['elapsed_time']
    job.evaluation.save()

    results_df, auc = _test_best_candidate(
        best_candidate, job.labelling.type,
        job.predictive_model.predictive_model)
    if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
        results = classification_prepare_results(results_df, auc)
    else:
        results = regression_prepare_results(results_df, job.labelling)
    results['elapsed_time'] = job.evaluation.elapsed_time
    job.evaluation = Evaluation.init(job.predictive_model.predictive_model,
                                     results,
                                     len(set(validation_df['label'])) <= 2)
    job.evaluation.save()
    job.save()

    logger.info(
        "End hyperopt job {}, {}. \n\tResults on validation {}. \n\tResults on test {}."
        .format(job.type, get_run(job), best_candidate['results'], results))
    return results, best_candidate['config'], best_candidate['model_split']
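
The repeated __getattribute__ calls flagged by the TODOs resolve the sub-configuration named by optimization_method; the built-in getattr does the same lookup more idiomatically. A small sketch with a stand-in config object (the class below is hypothetical, not the project's model):

class FakeOptimizerConfig:
    # Stand-in for job.hyperparameter_optimizer: optimization_method names the
    # nested configuration that holds the hyperopt settings.
    optimization_method = 'HYPEROPT'

    class hyperopt:
        performance_metric = 'f1_score'
        max_evaluations = 100

optimizer = FakeOptimizerConfig()
method_config = getattr(optimizer, optimizer.optimization_method.lower())
print(method_config.performance_metric, method_config.max_evaluations)  # f1_score 100
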
Example #5
def check_predictive_model_not_overwrite(job: Job) -> None:
    if job.hyperparameter_optimizer.optimization_method != HyperparameterOptimizationMethods.NONE.value:
        job.predictive_model = duplicate_orm_row(PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0])
        job.predictive_model.save()
        job.save()
Example #6
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations
    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """

    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).performance_metric)  # TODO: why do I need to get the hyperopt config here?
    )

    global training_df, test_df, global_job
    global_job = job
    training_df, test_df = get_encoded_logs(job)
    #TODO evaluate on validation set
    if holdout:
        validation_df = test_df
        # test_df = training_df.sample(frac=.2)
        test_df = training_df.tail(int(len(training_df) * 20 / 100))
        training_df = training_df.drop(test_df.index)

    train_start_time = time.time()

    space = _get_space(job)

    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).max_evaluations  # TODO: why do I need to get the hyperopt config here?
    trials = Trials()

    algorithm = _choose_algorithm(job)

    try:
        fmin(_calculate_and_evaluate, space, algo=algorithm.suggest, max_evals=max_evaluations, trials=trials)
    except ValueError:
        raise ValueError("All jobs failed, cannot find best configuration")
    current_best = {'loss': 100, 'results': {}, 'predictive_model_id': {}, 'model_split': {}, 'config': {}}
    for trial in trials:
        result = trial['result']
        if current_best['loss'] > result['loss']:
            current_best = result

    job.predictive_model = PredictiveModel.objects.filter(pk=current_best['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    current_best['results']['elapsed_time'] = timedelta(seconds=time.time() - train_start_time)  # todo find better place for this
    job.evaluation.elapsed_time = current_best['results']['elapsed_time']
    job.evaluation.save()

    #TODO evaluate on validation set
    if holdout:
        results_df, auc = _test(
            current_best['model_split'],
            validation_df.drop(['trace_id'], axis=1),
            evaluation=True,
            is_binary_classifier=_check_is_binary_classifier(job.labelling.type)
        )
        results = _prepare_results(results_df, auc)
        results['elapsed_time'] = job.evaluation.elapsed_time
        job.evaluation = Evaluation.init(
            job.predictive_model.predictive_model,
            results,
            len(set(test_df['label'])) <= 2
        )
        job.evaluation.save()
        job.save()

    if holdout:
        logger.info("End hyperopt job {}, {}. \n\tResults on test {}. \n\tResults on validation {}.".format(job.type, get_run(job), current_best['results'], results))
        return results, current_best['config'], current_best['model_split']
    else:
        logger.info("End hyperopt job {}, {}. \n\tResults on test {}.".format(job.type, get_run(job), current_best['results']))
        return current_best['results'], current_best['config'], current_best['model_split']
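
The holdout branch promotes the original test set to a validation set and carves the last 20% of the training frame off as the new test set. A self-contained pandas sketch of the same slicing, with illustrative data:

import pandas as pd

training_df = pd.DataFrame({'trace_id': range(10), 'label': [0, 1] * 5})
test_df = pd.DataFrame({'trace_id': range(10, 14), 'label': [0, 1, 0, 1]})

validation_df = test_df                                       # old test set becomes the validation set
test_df = training_df.tail(int(len(training_df) * 20 / 100))  # last 20% of training becomes the new test set
training_df = training_df.drop(test_df.index)                 # remove those rows from training

print(len(training_df), len(validation_df), len(test_df))  # 8 4 2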