def run_by_type(training_df: DataFrame, test_df: DataFrame, job: Job) -> (dict, dict):
    """runs the specified training/evaluation run

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: results and predictive_model split
    """
    model_split = None

    start_time = time.time()
    if job.type == JobTypes.PREDICTION.value:
        clusterer = _init_clusterer(job.clustering, training_df)
        if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            results, model_split = classification(training_df, test_df, clusterer, job)
        elif job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            results, model_split = regression(training_df, test_df, clusterer, job)
        elif job.predictive_model.predictive_model == PredictiveModels.TIME_SERIES_PREDICTION.value:
            results, model_split = time_series_prediction(training_df, test_df, clusterer, job)
    elif job.type == JobTypes.LABELLING.value:
        results = _label_task(training_df)
    elif job.type == JobTypes.UPDATE.value:
        results, model_split = update_and_test(training_df, test_df, job)
    else:
        raise ValueError("Type {} not supported".format(job.type))

    # TODO: integrateme
    if job.type != JobTypes.LABELLING.value:
        results['elapsed_time'] = timedelta(seconds=time.time() - start_time)  # todo find better place for this
        if job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model, results)
        elif job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model, results,
                len(model_split[ModelType.CLASSIFIER.value][0].classes_) <= 2)
        job.evaluation.save()
    elif job.type == JobTypes.LABELLING.value:
        job.labelling.results = results
        job.labelling.save()

    if job.type == PredictiveModels.CLASSIFICATION.value:  # todo this is an old workaround I should remove this
        save_result(results, job, start_time)

    print("End job {}, {} .".format(job.type, get_run(job)))
    print("\tResults {} .".format(results))

    return results, model_split
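# A minimal, illustrative sketch of the model_split structure that the binary-vs-multiclass
# check above assumes: a dict keyed by ModelType values whose classifier entry maps cluster
# indices to fitted scikit-learn estimators. The toy data, estimator choice, and single
# cluster key 0 are hypothetical stand-ins, not the project's split-building code.
def _sketch_model_split() -> dict:
    from sklearn.tree import DecisionTreeClassifier
    clf = DecisionTreeClassifier().fit([[0.0], [1.0], [2.0], [3.0]], ['true', 'true', 'false', 'false'])
    return {ModelType.CLASSIFIER.value: {0: clf}}

# with such a split, the check in run_by_type reduces to:
#   len(model_split[ModelType.CLASSIFIER.value][0].classes_) <= 2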
def run_by_type(training_df: DataFrame, test_df: DataFrame, job: Job) -> (dict, dict):
    """runs the specified training/evaluation run

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: results and predictive_model split
    """
    model_split = None

    # TODO fixme this needs to be fixed in the interface
    # if job['incremental_train']['base_model'] is not None:
    #     job['type'] = JobTypes.UPDATE.value

    start_time = time.time()
    if job.type == JobTypes.PREDICTION.value:
        clusterer = _init_clusterer(job.clustering, training_df)
        if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            results, model_split = classification(training_df, test_df, clusterer, job)
        elif job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            results, model_split = regression(training_df, test_df, clusterer, job)
        elif job.predictive_model.predictive_model == PredictiveModels.TIME_SERIES_PREDICTION.value:
            results, model_split = time_series_prediction(training_df, test_df, clusterer, job)
    elif job.type == JobTypes.LABELLING.value:
        results = _label_task(training_df)
    elif job.type == JobTypes.UPDATE.value:
        results, model_split = update_and_test(training_df, test_df, job)
    else:
        raise ValueError("Type {} not supported".format(job.type))

    # TODO: integrateme
    if job.type != JobTypes.LABELLING.value:
        if job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model, results)
        elif job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model, results,
                len(model_split[job.predictive_model.predictive_model][0].classes_) <= 2)
        job.save()

    if job.type == PredictiveModels.CLASSIFICATION.value:
        save_result(results, job, start_time)

    print("End job {}, {} .".format(job.type, get_run(job)))
    print("\tResults {} .".format(results))

    return results, model_split
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations
    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """
    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).performance_metric)  # TODO: WHY DO I NEED TO GET HYPEROPT?
    )

    global train_df, validation_df, test_df, global_job
    global_job = job
    train_df, test_df = get_encoded_logs(job)
    train_df, validation_df, test_df = _retrieve_train_validate_test(train_df, test_df)

    train_start_time = time.time()

    space = _get_space(job)
    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower()
    ).max_evaluations  # TODO: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()

    algorithm = OPTIMISATION_ALGORITHM[
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).algorithm_type]

    _run_hyperoptimisation(space, algorithm.suggest, max_evaluations, trials)

    best_candidate = trials.best_trial['result']

    job.predictive_model = PredictiveModel.objects.filter(pk=best_candidate['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    best_candidate['results']['elapsed_time'] = timedelta(
        seconds=time.time() - train_start_time)  # todo find better place for this
    job.evaluation.elapsed_time = best_candidate['results']['elapsed_time']
    job.evaluation.save()

    results_df, auc = _test_best_candidate(best_candidate, job.labelling.type,
                                           job.predictive_model.predictive_model)
    if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
        results = classification_prepare_results(results_df, auc)
    else:
        results = regression_prepare_results(results_df, job.labelling)
    results['elapsed_time'] = job.evaluation.elapsed_time

    job.evaluation = Evaluation.init(
        job.predictive_model.predictive_model,
        results,
        len(set(validation_df['label'])) <= 2)
    job.evaluation.save()
    job.save()

    logger.info("End hyperopt job {}, {}. \n\tResults on validation {}. \n\tResults on test {}.".format(
        job.type, get_run(job), best_candidate['results'], results))

    return results, best_candidate['config'], best_candidate['model_split']
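# _run_hyperoptimisation is not shown in this snippet. A minimal sketch of what it is assumed
# to do, mirroring the inline hyperopt call in the other variant of calculate_hyperopt below
# (the objective _calculate_and_evaluate and the error message come from that variant):
def _run_hyperoptimisation(space, suggest, max_evaluations, trials):
    try:
        # hyperopt mutates `trials` in place, so the caller can read trials.best_trial afterwards
        fmin(_calculate_and_evaluate, space, algo=suggest, max_evals=max_evaluations, trials=trials)
    except ValueError:
        raise ValueError("All jobs failed, cannot find best configuration")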
def run_by_type(training_df: DataFrame, test_df: DataFrame, job: Job) -> (dict, dict):
    """runs the specified training/evaluation run

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: results and predictive_model split
    """
    model_split = None

    start_time = time.time()
    if job.type == JobTypes.PREDICTION.value:
        clusterer = init_clusterer(job.clustering, training_df)
        results, model_split = MODEL[job.predictive_model.predictive_model][
            ModelActions.BUILD_MODEL_AND_TEST.value](training_df, test_df, clusterer, job)
    elif job.type == JobTypes.LABELLING.value:
        results = _label_task(training_df)
    elif job.type == JobTypes.UPDATE.value:
        results, model_split = MODEL[job.predictive_model.predictive_model][
            ModelActions.UPDATE_AND_TEST.value](training_df, test_df, job)
    else:
        raise ValueError("Type {} not supported".format(job.type))

    # TODO: integrateme
    if job.type != JobTypes.LABELLING.value:
        results['elapsed_time'] = timedelta(seconds=time.time() - start_time)  # todo find better place for this
        if job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model,
                results
            )
        elif job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model,
                results,
                len(set(test_df['label'])) <= 2
            )
        elif job.predictive_model.predictive_model == PredictiveModels.TIME_SERIES_PREDICTION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model,
                results
            )
        job.evaluation.save()
        job.save()
    elif job.type == JobTypes.LABELLING.value:
        # job.labelling = duplicate_orm_row(job.labelling)  # todo: replace with simple CREATE
        job.labelling = Labelling.objects.create(
            type=job.labelling.type,
            attribute_name=job.labelling.attribute_name,
            threshold_type=job.labelling.threshold_type,
            threshold=job.labelling.threshold
        )  # todo: futurebug if object changes
        job.labelling.results = results
        job.labelling.save()
        job.save()

    # if job.type == PredictiveModels.CLASSIFICATION.value:  # todo this is an old workaround I should remove this
    #     save_result(results, job, start_time)

    logger.info("End job {}, {} .".format(job.type, get_run(job)))
    logger.info("\tResults {} .".format(results))

    return results, model_split
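# The MODEL registry used above is defined elsewhere in the project. A minimal sketch of the
# assumed shape, wiring the per-model functions of the earlier variant into a dispatch table
# keyed by predictive model and action; the UPDATE_AND_TEST entries reuse the generic
# update_and_test from that variant and are illustrative assumptions, not project code:
MODEL = {
    PredictiveModels.CLASSIFICATION.value: {
        ModelActions.BUILD_MODEL_AND_TEST.value: classification,
        ModelActions.UPDATE_AND_TEST.value: update_and_test,
    },
    PredictiveModels.REGRESSION.value: {
        ModelActions.BUILD_MODEL_AND_TEST.value: regression,
        ModelActions.UPDATE_AND_TEST.value: update_and_test,
    },
    PredictiveModels.TIME_SERIES_PREDICTION.value: {
        ModelActions.BUILD_MODEL_AND_TEST.value: time_series_prediction,
    },
}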
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations
    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """
    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).performance_metric)  # TODO: WHY DO I NEED TO GET HYPEROPT?
    )

    global training_df, test_df, global_job
    global_job = job
    training_df, test_df = get_encoded_logs(job)

    # TODO evaluate on validation set
    if holdout:
        validation_df = test_df
        # test_df = training_df.sample(frac=.2)
        test_df = training_df.tail(int(len(training_df) * 20 / 100))
        training_df = training_df.drop(test_df.index)

    train_start_time = time.time()

    space = _get_space(job)
    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower()
    ).max_evaluations  # TODO: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()

    algorithm = _choose_algorithm(job)

    try:
        fmin(_calculate_and_evaluate, space, algo=algorithm.suggest, max_evals=max_evaluations, trials=trials)
    except ValueError:
        raise ValueError("All jobs failed, cannot find best configuration")

    current_best = {'loss': 100, 'results': {}, 'predictive_model_id': {}, 'model_split': {}, 'config': {}}
    for trial in trials:
        a = trial['result']
        if current_best['loss'] > a['loss']:
            current_best = a

    job.predictive_model = PredictiveModel.objects.filter(pk=current_best['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    current_best['results']['elapsed_time'] = timedelta(
        seconds=time.time() - train_start_time)  # todo find better place for this
    job.evaluation.elapsed_time = current_best['results']['elapsed_time']
    job.evaluation.save()

    # TODO evaluate on validation set
    if holdout:
        results_df, auc = _test(
            current_best['model_split'], validation_df.drop(['trace_id'], 1),
            evaluation=True,
            is_binary_classifier=_check_is_binary_classifier(job.labelling.type)
        )
        results = _prepare_results(results_df, auc)
        results['elapsed_time'] = job.evaluation.elapsed_time
        job.evaluation = Evaluation.init(
            job.predictive_model.predictive_model,
            results,
            len(set(test_df['label'])) <= 2
        )
        job.evaluation.save()
        job.save()

    if holdout:
        logger.info("End hyperopt job {}, {}. \n\tResults on test {}. \n\tResults on validation {}.".format(
            job.type, get_run(job), current_best['results'], results))
        return results, current_best['config'], current_best['model_split']
    else:
        logger.info("End hyperopt job {}, {}. \n\tResults on test {}.".format(
            job.type, get_run(job), current_best['results']))
        return current_best['results'], current_best['config'], current_best['model_split']
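# The refactored variant of calculate_hyperopt above calls _retrieve_train_validate_test, which
# is not shown here. A minimal sketch, assuming it extracts the inline holdout logic of this
# variant: the original test set becomes the validation set and the last 20% of the training
# traces become the new test set (the 80/20 tail split comes from the code above; the helper
# body itself is an assumption):
def _retrieve_train_validate_test(train_df: DataFrame, test_df: DataFrame) -> (DataFrame, DataFrame, DataFrame):
    validation_df = test_df
    test_df = train_df.tail(int(len(train_df) * 20 / 100))
    train_df = train_df.drop(test_df.index)
    return train_df, validation_df, test_df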