import time

# NOTE: Job, PredictiveModel, Encoding, Labelling, Clustering and the enums
# (JobTypes, ClusteringMethods, ...) are assumed to be imported from the
# surrounding project modules.


def set_model_name(job: Job) -> None:
    if job.create_models:
        if job.predictive_model.model_path != '':
            # job.predictive_model = duplicate_orm_row(PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0])  # TODO: replace with a simple CREATE
            job.predictive_model = PredictiveModel.init(
                job.predictive_model.get_full_dict()  # TODO: double-check that get_full_dict returns everything needed
            )  # TODO: future bug if the object changes
            job.predictive_model.save()
            job.save()
        if job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value:
            job.clustering.model_path = 'cache/model_cache/job_{}-split_{}-clusterer-{}-v0.sav'.format(
                job.id, job.split.id, job.type)
            job.clustering.save()
        if job.type == JobTypes.UPDATE.value:
            job.type = JobTypes.PREDICTION.value  # TODO: why is this needed?
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v{}.sav'.format(
                job.id, job.split.id, job.type, time.time())
        else:
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v0.sav'.format(
                job.id, job.split.id, job.type)
        job.predictive_model.model_path = predictive_model_filename
        job.predictive_model.save()
        job.save()
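# Usage sketch (hypothetical primary key; assumes a Job was created with one of
# the generators below). set_model_name stamps a versioned .sav path onto the
# predictive model before training:
#
#   job = Job.objects.get(pk=42)
#   job.create_models = True
#   set_model_name(job)
#   # job.predictive_model.model_path is now e.g.
#   # 'cache/model_cache/job_42-split_1-predictive_model-prediction-v0.sav'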
def create_test_predictive_model(
        predictive_model: str = PredictiveModels.CLASSIFICATION.value,
        prediction_method: str = ClassificationMethods.RANDOM_FOREST.value,
        configuration: dict = None) -> PredictiveModel:
    # Avoid a mutable default argument: no dict is shared between calls.
    if configuration is None:
        configuration = {}
    return PredictiveModel.init(
        get_prediction_method_config(predictive_model, prediction_method, configuration))
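# Minimal usage sketch for tests; the defaults already select a random-forest
# classifier, and the configuration dict shown here is hypothetical:
#
#   model = create_test_predictive_model()
#   tree_model = create_test_predictive_model(
#       prediction_method=ClassificationMethods.DECISION_TREE.value,
#       configuration={'max_depth': 2})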
def _calculate_and_evaluate(args) -> dict:
    global trial_number
    if trial_number % 20 == 0:
        logger.info("Trial {}".format(trial_number))
    trial_number += 1

    local_job = global_job
    predictive_model = local_job.predictive_model.predictive_model
    prediction_method = local_job.predictive_model.prediction_method
    model_config = {
        'predictive_model': predictive_model,
        'prediction_method': prediction_method,
        **args
    }
    new_predictive_model = PredictiveModel.init(model_config)
    local_job.predictive_model = duplicate_orm_row(new_predictive_model)
    local_job.predictive_model.save()
    local_job.save()
    # local_job = duplicate_orm_row(local_job)  # TODO: not sure it is ok to have this here

    performance_metric = getattr(
        local_job.hyperparameter_optimizer,
        local_job.hyperparameter_optimizer.optimization_method.lower()
    ).performance_metric
    multiplier = _get_metric_multiplier(performance_metric)
    try:
        results, model_split = run_by_type(training_df.copy(), test_df.copy(), local_job)
        return {
            'loss': -results[performance_metric] * multiplier,
            'status': STATUS_OK,
            'results': results,
            'predictive_model_id': local_job.predictive_model.pk,
            'model_split': model_split,
            'config': model_config
        }
    except Exception:  # a bare except would also swallow KeyboardInterrupt/SystemExit
        return {
            'loss': 100,
            'status': STATUS_FAIL,
            'results': {},
            'predictive_model_id': {},
            'model_split': {},
            'config': {}
        }
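# Sketch of how hyperopt would drive _calculate_and_evaluate. fmin/tpe/Trials
# are the standard hyperopt API, but the search space below is illustrative,
# not the project's real one, and `global_job`, `training_df` and `test_df`
# must already be set at module level:
def _example_hyperopt_search():
    from hyperopt import Trials, fmin, hp, tpe

    space = {'max_depth': hp.choice('max_depth', list(range(2, 10)))}  # hypothetical space
    trials = Trials()
    best = fmin(_calculate_and_evaluate, space, algo=tpe.suggest,
                max_evals=10, trials=trials)
    return best, trials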
def update(split, payload, generation_type=PredictiveModels.CLASSIFICATION.value):
    # TODO: adapt to allow selecting which predictive_model to update
    jobs = []
    config = payload['config']
    labelling_config = config.get('labelling', {})
    for method in config['methods']:
        for clustering in config['clusterings']:
            for incremental_base_model in config['incremental_train']:
                for encMethod in config['encodings']:
                    encoding = config['encoding']
                    if encoding['generation_type'] == UP_TO:
                        # One job per prefix length from 1 up to the configured maximum.
                        for i in range(1, encoding['prefix_length'] + 1):
                            job, _ = Job.objects.get_or_create(
                                status=JobStatuses.CREATED.value,
                                type=JobTypes.UPDATE.value,
                                split=split,
                                encoding=Encoding.objects.get_or_create(  # TODO: fixme
                                    data_encoding=DataEncodings.LABEL_ENCODER.value,
                                    value_encoding=encMethod,
                                    add_elapsed_time=labelling_config.get('add_elapsed_time', False),
                                    add_remaining_time=labelling_config.get('add_remaining_time', False),
                                    add_executed_events=labelling_config.get('add_executed_events', False),
                                    add_resources_used=labelling_config.get('add_resources_used', False),
                                    add_new_traces=labelling_config.get('add_new_traces', False),
                                    prefix_length=i,  # TODO: static check?
                                    padding=config['encoding']['padding'] == 'zero_padding',
                                    task_generation_type=config['encoding'].get('generation_type', 'only_this'),
                                    features=config['encoding'].get('features', [])
                                )[0],
                                labelling=Labelling.objects.get_or_create(
                                    type=labelling_config.get('type', None),  # TODO: static check?
                                    attribute_name=labelling_config.get('attribute_name', None),
                                    threshold_type=labelling_config.get('threshold_type', None),
                                    threshold=labelling_config.get('threshold', None)
                                )[0] if labelling_config != {} else None,
                                clustering=Clustering.init(clustering, configuration=config.get(clustering, {})),
                                predictive_model=PredictiveModel.init(
                                    get_prediction_method_config(generation_type, method, payload)
                                ),
                                hyperparameter_optimizer=HyperparameterOptimization.init(
                                    config.get('hyperparameter_optimizer', None)),
                                create_models=config.get('create_models', False),
                                incremental_train=Job.objects.get(pk=incremental_base_model)
                            )
                            check_predictive_model_not_overwrite(job)
                            set_model_name(job)
                            jobs.append(job)
                    else:
                        job, _ = Job.objects.get_or_create(
                            status=JobStatuses.CREATED.value,
                            type=JobTypes.UPDATE.value,
                            split=split,
                            encoding=Encoding.objects.get_or_create(  # TODO: fixme
                                data_encoding=DataEncodings.LABEL_ENCODER.value,
                                value_encoding=encMethod,
                                add_elapsed_time=labelling_config.get('add_elapsed_time', False),
                                add_remaining_time=labelling_config.get('add_remaining_time', False),
                                add_executed_events=labelling_config.get('add_executed_events', False),
                                add_resources_used=labelling_config.get('add_resources_used', False),
                                add_new_traces=labelling_config.get('add_new_traces', False),
                                prefix_length=config['encoding']['prefix_length'],  # TODO: static check?
                                padding=config['encoding']['padding'] == 'zero_padding',
                                task_generation_type=config['encoding'].get('generation_type', 'only_this'),
                                features=config['encoding'].get('features', [])
                            )[0],
                            labelling=Labelling.objects.get_or_create(
                                type=labelling_config.get('type', None),  # TODO: static check?
                                attribute_name=labelling_config.get('attribute_name', None),
                                threshold_type=labelling_config.get('threshold_type', None),
                                threshold=labelling_config.get('threshold', None)
                            )[0] if labelling_config != {} else None,
                            clustering=Clustering.init(clustering, configuration=config.get(clustering, {})),
                            predictive_model=PredictiveModel.init(
                                get_prediction_method_config(generation_type, method, payload)
                            ),
                            hyperparameter_optimizer=HyperparameterOptimization.init(
                                config.get('hyperparameter_optimizer', None)),
                            create_models=config.get('create_models', False),
                            incremental_train=Job.objects.get(pk=incremental_base_model)
                        )
                        check_predictive_model_not_overwrite(job)
                        set_model_name(job)
                        jobs.append(job)
    return jobs
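# Illustrative payload for update(); the keys mirror exactly what the function
# reads above, but every value here is hypothetical:
def _example_update_payload():
    return {
        'config': {
            'methods': [ClassificationMethods.RANDOM_FOREST.value],
            'clusterings': [ClusteringMethods.NO_CLUSTER.value],
            'encodings': [ValueEncodings.SIMPLE_INDEX.value],
            'incremental_train': [1],  # pk of the base Job to train from
            'encoding': {
                'generation_type': UP_TO,
                'prefix_length': 3,
                'padding': 'zero_padding',
                'features': []
            },
            'labelling': {'type': LabelTypes.ATTRIBUTE_STRING.value,
                          'attribute_name': 'label'},
            'hyperparameter_optimizer': None,
            'create_models': True
        }
    }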
def generate(split, payload):
    jobs = []
    config = payload['config']
    labelling_config = config.get('labelling', {})
    job_type = JobTypes.PREDICTION.value
    prediction_type = payload['type']
    for method in config['methods']:
        for clustering in config['clusterings']:
            for encMethod in config['encodings']:
                encoding = config['encoding']
                if encoding['generation_type'] == UP_TO:
                    # One job per prefix length from 1 up to the configured maximum.
                    for i in range(1, encoding['prefix_length'] + 1):
                        encoding_row = Encoding.objects.get_or_create(  # persisted Encoding row for this prefix length
                            data_encoding=DataEncodings.LABEL_ENCODER.value,
                            value_encoding=encMethod,
                            add_elapsed_time=labelling_config.get('add_elapsed_time', False),
                            add_remaining_time=labelling_config.get('add_remaining_time', False),
                            add_executed_events=labelling_config.get('add_executed_events', False),
                            add_resources_used=labelling_config.get('add_resources_used', False),
                            add_new_traces=labelling_config.get('add_new_traces', False),
                            prefix_length=i,  # TODO: static check?
                            padding=config['encoding']['padding'] == 'zero_padding',
                            task_generation_type=config['encoding'].get('generation_type', 'only_this'),
                            features=config['encoding'].get('features', [])
                        )[0]
                        predictive_model = PredictiveModel.init(
                            get_prediction_method_config(prediction_type, method, config))
                        job = Job.objects.get_or_create(
                            status=JobStatuses.CREATED.value,
                            type=job_type,
                            split=split,
                            encoding=encoding_row,
                            labelling=Labelling.objects.get_or_create(
                                type=labelling_config.get('type', None),  # TODO: static check?
                                attribute_name=labelling_config.get('attribute_name', None),
                                threshold_type=labelling_config.get('threshold_type', None),
                                threshold=labelling_config.get('threshold', None)
                            )[0] if labelling_config != {} else None,
                            clustering=Clustering.init(clustering, configuration=config.get(clustering, {}))
                            if predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value
                            else Clustering.init(ClusteringMethods.NO_CLUSTER.value, configuration={}),  # TODO: temporary workaround
                            hyperparameter_optimizer=HyperparameterOptimization.init(
                                config.get('hyperparameter_optimizer', {'type': None})
                                if predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value
                                else {'type': None}),  # TODO: temporary workaround
                            predictive_model=predictive_model,
                            create_models=config.get('create_models', False)
                        )[0]
                        check_predictive_model_not_overwrite(job)
                        set_model_name(job)
                        jobs.append(job)
                else:
                    predictive_model = PredictiveModel.init(
                        get_prediction_method_config(prediction_type, method, config))
                    job = Job.objects.get_or_create(
                        status=JobStatuses.CREATED.value,
                        type=job_type,
                        split=split,
                        encoding=Encoding.objects.get_or_create(
                            data_encoding=DataEncodings.LABEL_ENCODER.value,
                            value_encoding=encMethod,
                            add_elapsed_time=labelling_config.get('add_elapsed_time', False),
                            add_remaining_time=labelling_config.get('add_remaining_time', False),
                            add_executed_events=labelling_config.get('add_executed_events', False),
                            add_resources_used=labelling_config.get('add_resources_used', False),
                            add_new_traces=labelling_config.get('add_new_traces', False),
                            prefix_length=config['encoding']['prefix_length'],  # TODO: static check?
                            padding=config['encoding']['padding'] == 'zero_padding',
                            task_generation_type=config['encoding'].get('generation_type', 'only_this'),
                            features=config['encoding'].get('features', [])
                        )[0],
                        labelling=Labelling.objects.get_or_create(
                            type=labelling_config.get('type', None),  # TODO: static check?
                            attribute_name=labelling_config.get('attribute_name', None),
                            threshold_type=labelling_config.get('threshold_type', None),
                            threshold=labelling_config.get('threshold', None)
                        )[0] if labelling_config != {} else None,
                        clustering=Clustering.init(clustering, configuration=config.get(clustering, {}))
                        if predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value
                        else Clustering.init(ClusteringMethods.NO_CLUSTER.value, configuration={}),  # TODO: temporary workaround
                        hyperparameter_optimizer=HyperparameterOptimization.init(
                            config.get('hyperparameter_optimizer', {'type': 'none'})
                            if predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value
                            else {'type': 'none'}),  # TODO: temporary workaround
                        predictive_model=predictive_model,
                        create_models=config.get('create_models', False)
                    )[0]
                    check_predictive_model_not_overwrite(job)
                    set_model_name(job)
                    jobs.append(job)
    return jobs
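# Combinatorics note: with UP_TO generation, generate() enqueues one job per
# (method, clustering, encoding, prefix length) combination, e.g.
# 2 methods x 1 clustering x 3 encodings x prefix_length 5 -> 30 jobs.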
# Leaner variant of generate(): no hyperparameter optimiser and no
# model-name bookkeeping.
def generate(split, payload):
    jobs = []
    config = payload['config']
    labelling_config = config.get('labelling', {})
    job_type = JobTypes.PREDICTION.value
    prediction_type = payload['type']
    for method in config['methods']:
        for clustering in config['clusterings']:
            for encMethod in config['encodings']:
                encoding = config['encoding']
                if encoding['generation_type'] == UP_TO:
                    for i in range(1, encoding['prefix_length'] + 1):
                        encoding_row = Encoding.objects.get_or_create(  # persisted Encoding row for this prefix length
                            data_encoding='label_encoder',
                            value_encoding=encMethod,
                            add_elapsed_time=labelling_config.get('add_elapsed_time', False),
                            add_remaining_time=labelling_config.get('add_remaining_time', False),
                            add_executed_events=labelling_config.get('add_executed_events', False),
                            add_resources_used=labelling_config.get('add_resources_used', False),
                            add_new_traces=labelling_config.get('add_new_traces', False),
                            prefix_length=i,  # TODO: static check?
                            padding=config['encoding']['padding'] == 'zero_padding',
                            task_generation_type=config['encoding'].get('generation_type', 'only_this')
                        )[0]
                        predictive_model = PredictiveModel.init(
                            get_prediction_method_config(prediction_type, method, config))
                        job = Job.objects.get_or_create(
                            status=JobStatuses.CREATED.value,
                            type=job_type,
                            split=split,
                            encoding=encoding_row,
                            labelling=Labelling.objects.get_or_create(
                                type=labelling_config.get('type', None),  # TODO: static check?
                                attribute_name=labelling_config.get('attribute_name', None),
                                threshold_type=labelling_config.get('threshold_type', None),
                                threshold=labelling_config.get('threshold', None)
                            )[0] if labelling_config != {} else None,
                            clustering=Clustering.init(clustering, configuration=config.get(clustering, {})),
                            predictive_model=predictive_model
                        )[0]
                        jobs.append(job)
                else:
                    predictive_model = PredictiveModel.init(
                        get_prediction_method_config(prediction_type, method, config))
                    job = Job.objects.get_or_create(
                        status=JobStatuses.CREATED.value,
                        type=job_type,
                        split=split,
                        encoding=Encoding.objects.get_or_create(
                            data_encoding='label_encoder',
                            value_encoding=encMethod,
                            add_elapsed_time=labelling_config.get('add_elapsed_time', False),
                            add_remaining_time=labelling_config.get('add_remaining_time', False),
                            add_executed_events=labelling_config.get('add_executed_events', False),
                            add_resources_used=labelling_config.get('add_resources_used', False),
                            add_new_traces=labelling_config.get('add_new_traces', False),
                            prefix_length=config['encoding']['prefix_length'],  # TODO: static check?
                            padding=config['encoding']['padding'] == 'zero_padding',
                            task_generation_type=config['encoding'].get('generation_type', 'only_this')
                        )[0],
                        labelling=Labelling.objects.get_or_create(
                            type=labelling_config.get('type', None),  # TODO: static check?
                            attribute_name=labelling_config.get('attribute_name', None),
                            threshold_type=labelling_config.get('threshold_type', None),
                            threshold=labelling_config.get('threshold', None)
                        )[0] if labelling_config != {} else None,
                        clustering=Clustering.init(clustering, configuration=config.get(clustering, {})),
                        predictive_model=predictive_model
                    )[0]
                    jobs.append(job)
    return jobs
import pandas as pd


def progetto_padova():
    JOB = Job.objects.get_or_create(
        status=JobStatuses.CREATED.value,
        type=JobTypes.PREDICTION.value,
        # split=Split.objects.get_or_create(  # single-log alternative: derive the split from one log
        #     type=SplitTypes.SPLIT_SINGLE.value,
        #     original_log=create_log(  # this imports the log
        #         import_log(BASE_DIR + RELATIVE_TRAIN_PATH),
        #         RELATIVE_TRAIN_PATH,
        #         BASE_DIR,
        #         import_in_cache=False
        #     ),
        #     splitting_method=SplitOrderingMethods.SPLIT_TEMPORAL.value,
        #     test_size=0.2
        # )[0],
        split=Split.objects.get_or_create(  # this creates the split of the log
            type=SplitTypes.SPLIT_DOUBLE.value,
            train_log=create_log(  # this imports the training log
                import_log(BASE_DIR + RELATIVE_TRAIN_PATH),
                RELATIVE_TRAIN_PATH,
                BASE_DIR,
                import_in_cache=False
            ),
            test_log=create_log(  # this imports the validation log
                import_log(BASE_DIR + RELATIVE_VALIDATION_PATH),
                RELATIVE_VALIDATION_PATH,
                BASE_DIR,
                import_in_cache=False
            )
        )[0],
        encoding=Encoding.objects.get_or_create(  # this defines the encoding method
            data_encoding=DataEncodings.LABEL_ENCODER.value,
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            add_elapsed_time=False,
            add_remaining_time=False,
            add_executed_events=False,
            add_resources_used=False,
            add_new_traces=False,
            prefix_length=5,
            padding=True,
            task_generation_type=TaskGenerationTypes.ALL_IN_ONE.value,
            features=[]
        )[0],
        labelling=Labelling.objects.get_or_create(  # this defines the label
            type=LabelTypes.ATTRIBUTE_STRING.value,
            attribute_name='label',
            threshold_type=None,
            threshold=None
        )[0],
        clustering=Clustering.init(ClusteringMethods.NO_CLUSTER.value, configuration={}),
        predictive_model=PredictiveModel.init(  # this defines the predictive model
            get_prediction_method_config(
                PredictiveModels.CLASSIFICATION.value,
                ClassificationMethods.DECISION_TREE.value,
                payload={
                    'max_depth': 2,
                    'min_samples_split': 2,
                    'min_samples_leaf': 2
                }
            )
        ),
        hyperparameter_optimizer=HyperparameterOptimization.init({  # this defines the hyperparameter optimisation procedure
            'type': HyperparameterOptimizationMethods.HYPEROPT.value,
            'max_evaluations': 10,
            'algorithm_type': HyperOptAlgorithms.TPE.value,  # the search algorithm
            'performance_metric': HyperOptLosses.AUC.value  # the metric to optimise
        }),
        create_models=True
    )[0]

    # load log
    train_log, test_log, additional_columns = get_train_test_log(JOB.split)

    # encode
    train_df, test_df = encode_label_logs(train_log, test_log, JOB)

    # train + evaluate
    results, model_split = MODEL[JOB.predictive_model.predictive_model][ModelActions.BUILD_MODEL_AND_TEST.value](
        train_df, test_df, _init_clusterer(JOB.clustering, train_df), JOB
    )

    if JOB.create_models:
        check_predictive_model_not_overwrite(JOB)
        set_model_name(JOB)
        save_models(model_split, JOB)

    # predict
    data_df = pd.concat([train_df, test_df])
    predictions = MODEL[JOB.predictive_model.predictive_model][ModelActions.PREDICT.value](JOB, data_df)
    predictions_with_probs = MODEL[JOB.predictive_model.predictive_model][ModelActions.PREDICT_PROBA.value](JOB, data_df)

    # lime
    exp = Explanation.objects.get_or_create(
        type=ExplanationTypes.LIME.value,
        split=JOB.split,  # the analysed log; it may differ from the training one
        predictive_model=JOB.predictive_model,
        job=JOB
    )[0]
    error, result = explanation(exp.id, int(EXPLANATION_TARGET))
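# The script above relies on module-level constants; hypothetical values:
#
#   BASE_DIR = 'cache/log_cache/'
#   RELATIVE_TRAIN_PATH = 'train_set.xes'
#   RELATIVE_VALIDATION_PATH = 'validation_set.xes'
#   EXPLANATION_TARGET = 1
#
#   progetto_padova()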