def create_prediction_job(job: Job, max_len: int) -> Job:
    """Derive a new prediction job from ``job`` with its own encoding row.

    :param job: job passed to ``duplicate_orm_row`` and whose encoding is looked up
    :param max_len: value written to ``prefix_length`` on the duplicated encoding
    :return: the saved job, typed PREDICTION, status CREATED, ``create_models`` enabled
    """
    prediction_job = duplicate_orm_row(job)
    prediction_job.type = JobTypes.PREDICTION.value
    prediction_job.status = JobStatuses.CREATED.value

    # The encoding is re-read by primary key before being duplicated.
    stored_encoding = Encoding.objects.filter(pk=job.encoding.id)[0]
    encoding_copy = duplicate_orm_row(stored_encoding)
    encoding_copy.prefix_length = max_len
    encoding_copy.save()

    prediction_job.encoding = encoding_copy
    prediction_job.create_models = True
    prediction_job.save()
    return prediction_job
def replay_prediction_task(replay_prediction_job, training_initial_job, log):
    """Run a replay-prediction job against ``log`` and record the outcome on the job.

    When the job's encoding prefix length differs from the longest trace in ``log``,
    a prediction job with that length is created from ``training_initial_job`` and
    run, and this task recurses with a new replay-prediction job built from it.

    :param replay_prediction_job: job to execute; status, results and error are updated
    :param training_initial_job: source job for ``create_prediction_job``
    :param log: iterable of traces, each supporting ``len``
    :raises Exception: any failure is logged, stored on the job, then re-raised
    """
    logger.info("Start replay_prediction task ID {}".format(
        replay_prediction_job.id))
    try:
        replay_prediction_job.status = JobStatuses.RUNNING.value
        replay_prediction_job.save()

        longest_trace = max(map(len, log))
        if replay_prediction_job.encoding.prefix_length == longest_trace:
            result = replay_prediction_calculate(replay_prediction_job, log)
            replay_prediction_job.results = {'result': str(result)}
            replay_prediction_job.status = JobStatuses.COMPLETED.value
            replay_prediction_job.error = ''
        else:
            # Prefix length mismatch: build and run a prediction job for the
            # required length, then replay against it recursively.
            prediction_job = create_prediction_job(training_initial_job, longest_trace)
            prediction_task(prediction_job.id)
            prediction_job.refresh_from_db()

            follow_up_job = duplicate_orm_row(prediction_job)
            follow_up_job.split = Split.objects.filter(
                pk=replay_prediction_job.split.id)[0]
            follow_up_job.type = JobTypes.REPLAY_PREDICT.value
            follow_up_job.status = JobStatuses.CREATED.value
            replay_prediction_task(follow_up_job, prediction_job, log)
    except Exception as e:
        logger.error(e)
        replay_prediction_job.status = JobStatuses.ERROR.value
        replay_prediction_job.error = repr(e)
        raise
    finally:
        # Runs on success, on the recursive path and on error alike.
        replay_prediction_job.save()
        publish(replay_prediction_job)
def regression(training_df: DataFrame, test_df: DataFrame, clusterer: Clustering, job: Job) -> (dict, dict):
    """Train a regressor on the training data and evaluate it on the test data.

    The job's encoding is replaced by a duplicate whose ``features`` are the
    training columns; both the encoding and the job are saved.

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param clusterer: clustering object handed to ``_train``
    :param job: job configuration
    :return: regression results and the trained model split
    """
    train_data, test_data = _prep_data(training_df, test_df)

    # TODO: maybe here would be better an intelligent get_or_create...
    stored_encoding = Encoding.objects.filter(pk=job.encoding.pk)[0]
    job.encoding = duplicate_orm_row(stored_encoding)
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    regressor = _choose_regressor(job)
    model_split = _train(train_data, regressor, clusterer)
    results_df = _test(model_split, test_data)
    return calculate_results_regression(results_df, job.labelling), model_split
def classification(training_df: DataFrame, test_df: DataFrame, clusterer: Clustering, job: Job) -> (dict, dict):
    """Train a classifier on the training data and evaluate it on the test data.

    The job's encoding is replaced by a duplicate whose ``features`` are the
    training columns; both the encoding and the job are saved.

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param clusterer: clustering object handed to ``_train``
    :param job: job configuration
    :return: prepared results and the trained model split
    """
    train_data = _drop_columns(training_df)
    test_data = _drop_columns(test_df)

    # TODO: maybe here would be better an intelligent get_or_create...
    encoding_copy = duplicate_orm_row(job.encoding)
    job.encoding = encoding_copy
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    classifier = _choose_classifier(job)
    model_split = _train(train_data, classifier, clusterer)

    binary = _check_is_binary_classifier(job.labelling.type)
    results_df, auc = _test(
        model_split,
        test_data,
        evaluation=True,
        is_binary_classifier=binary,
    )
    return _prepare_results(results_df, auc), model_split
def set_model_name(job: Job) -> None:
    """Assign cache file paths to the job's clustering and predictive model.

    Nothing happens unless ``job.create_models`` is set. A predictive model
    that already has a ``model_path`` is duplicated first. UPDATE jobs are
    retyped to PREDICTION and get a timestamped file version instead of ``v0``.

    :param job: job whose predictive model / clustering paths are written and saved
    """
    if not job.create_models:
        return

    if job.predictive_model.model_path != '':
        existing_model = PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0]
        job.predictive_model = duplicate_orm_row(existing_model)
        job.predictive_model.save()
        job.save()

    uses_clustering = job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value
    if uses_clustering:
        job.clustering.model_path = 'cache/model_cache/job_{}-split_{}-clusterer-{}-v0.sav'.format(
            job.id,
            job.split.id,
            job.type)
        job.clustering.save()

    if job.type != JobTypes.UPDATE.value:
        model_file = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v0.sav'.format(
            job.id,
            job.split.id,
            job.type)
    else:
        job.type = JobTypes.PREDICTION.value  # TODO: Y am I doing this?
        model_file = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v{}.sav'.format(
            job.id,
            job.split.id,
            job.type,
            str(time.time()))

    job.predictive_model.model_path = model_file
    job.predictive_model.save()
    job.save()
def post_replay(request):
    """ Post request to start a demo of a log arriving to server

    Reads ``splitId`` and ``jobId`` from the request body, duplicates the
    referenced job as a REPLAY job bound to the referenced split and enqueues
    ``replay_task`` for it.

    :param request: json
    :return: Response (201 on success, 404 when the split or the job is unknown)
    """
    # NOTE: ``jobs`` is never populated, so the 201 body is always an empty list.
    jobs = []
    data = request.data
    split_id = int(data['splitId'])
    job_id = int(data['jobId'])

    # An unknown split is reported as 404, like an unknown job, instead of
    # escaping as an unhandled Split.DoesNotExist.
    try:
        split = Split.objects.get(pk=split_id)
    except Split.DoesNotExist:
        return Response({'error': 'Split ' + str(split_id) + ' not in database'},
                        status=status.HTTP_404_NOT_FOUND)

    try:
        training_initial_job = Job.objects.get(pk=job_id)
        new_job = duplicate_orm_row(training_initial_job)
        new_job.type = JobTypes.REPLAY.value
        new_job.status = JobStatuses.CREATED.value
        new_job.split = split
        new_job.save()
    except Job.DoesNotExist:
        return Response({'error': 'Job ' + str(job_id) + ' not in database'},
                        status=status.HTTP_404_NOT_FOUND)

    django_rq.enqueue(replay_task, new_job, training_initial_job)

    serializer = JobSerializer(jobs, many=True)
    return Response(serializer.data, status=status.HTTP_201_CREATED)
def post_replay_prediction(request):
    """ Post request to have a single prediction during the replay of a log

    Reads ``jobId`` and ``training_job`` from the query string and the log from
    the request body, creates a REPLAY_PREDICT job whose parent is the replay
    job and enqueues ``replay_prediction_task`` for it.

    :param request: json
    :return: Response (201 on success, 404 when either job is unknown)
    """
    # NOTE: ``jobs`` is never populated, so the 201 body is always an empty list.
    jobs = []
    job_id = int(request.query_params['jobId'])
    training_initial_job_id = int(request.query_params['training_job'])
    logger.info("Creating replay_prediction task")

    # Looked up separately so the 404 names the id that is actually missing.
    try:
        training_initial_job = Job.objects.get(pk=training_initial_job_id)
    except Job.DoesNotExist:
        return Response({'error': 'Job ' + str(training_initial_job_id) + ' not in database'},
                        status=status.HTTP_404_NOT_FOUND)

    try:
        # ``get`` raises Job.DoesNotExist for a missing row; the previous
        # ``filter(...)[0]`` raised IndexError, which was not handled here.
        replay_job = Job.objects.get(pk=job_id)
        replay_prediction_job = duplicate_orm_row(replay_job)
        # Parent is fetched again rather than reusing ``replay_job``
        # (presumably because duplicate_orm_row may alter the instance it is given - confirm).
        replay_prediction_job.parent_job = Job.objects.get(pk=job_id)
        replay_prediction_job.type = JobTypes.REPLAY_PREDICT.value
        replay_prediction_job.status = JobStatuses.CREATED.value
        replay_prediction_job.save()
    except Job.DoesNotExist:
        return Response({'error': 'Job ' + str(job_id) + ' not in database'},
                        status=status.HTTP_404_NOT_FOUND)

    logger.info("Enqueuing replay_prediction task ID {}".format(
        replay_prediction_job.id))
    log = import_log_from_string(request.data.decode('utf-8'))
    django_rq.enqueue(replay_prediction_task, replay_prediction_job,
                      training_initial_job, log)

    serializer = JobSerializer(jobs, many=True)
    return Response(serializer.data, status=status.HTTP_201_CREATED)
def post_prediction(request):
    """ Post request to have a single static prediction

    Reads ``jobId`` and ``splitId`` from the request body, duplicates the
    referenced job as a RUNTIME job bound to the referenced split and enqueues
    ``runtime_task`` for it.

    :param request: json
    :return: Response (201 on success, 404 when the split or the job is unknown)
    """
    # NOTE: ``jobs`` is never populated, so the 201 body is always an empty list.
    jobs = []
    data = request.data
    job_id = int(data['jobId'])
    split_id = int(data['splitId'])

    # An unknown split is reported as 404, like an unknown job, instead of
    # escaping as an unhandled Split.DoesNotExist.
    try:
        split = Split.objects.get(pk=split_id)
    except Split.DoesNotExist:
        return Response({'error': 'Split ' + str(split_id) + ' not in database'},
                        status=status.HTTP_404_NOT_FOUND)

    try:
        job = Job.objects.get(pk=job_id)
        new_job = duplicate_orm_row(job)
        new_job.type = JobTypes.RUNTIME.value
        new_job.status = JobStatuses.CREATED.value
        new_job.split = split
        new_job.save()
    except Job.DoesNotExist:
        return Response({'error': 'Job ' + str(job_id) + ' not in database'},
                        status=status.HTTP_404_NOT_FOUND)

    django_rq.enqueue(runtime_task, new_job)

    serializer = JobSerializer(jobs, many=True)
    return Response(serializer.data, status=status.HTTP_201_CREATED)
def create_prediction_job(job: Job, max_len: int) -> Job:
    """Create and save a PREDICTION job based on ``job``.

    The new job receives a duplicated encoding whose ``prefix_length`` is
    ``max_len`` and has ``create_models`` switched on.

    :param job: job to duplicate
    :param max_len: prefix length stored on the duplicated encoding
    :return: Job
    """
    derived_job = duplicate_orm_row(job)
    derived_job.type = JobTypes.PREDICTION.value
    derived_job.status = JobStatuses.CREATED.value

    derived_encoding = duplicate_orm_row(
        Encoding.objects.filter(pk=job.encoding.id)[0]
    )
    derived_encoding.prefix_length = max_len
    derived_encoding.save()

    derived_job.encoding = derived_encoding
    derived_job.create_models = True
    derived_job.save()

    return derived_job
def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job configuration, loading from cache if possible.
    With ``use_cache`` the lookup order is: labelled-log cache, then loaded-log cache, then ``prepare_logs``;
    whatever had to be computed is written back to the corresponding cache.

    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame

    """
    print('\tGetting Dataset')
    if use_cache:
        # First level: encoded + labelled frames already stored for this split/encoding/labelling.
        if LabelledLog.objects.filter(split=job.split, encoding=job.encoding, labelling=job.labelling).exists():
            training_df, test_df = get_labelled_logs(job)
        else:
            # Second level: raw logs already loaded for this pair of log paths.
            if job.split.train_log is not None and \
                job.split.test_log is not None and \
                LoadedLog.objects.filter(train_log=job.split.train_log.path, test_log=job.split.test_log.path).exists():
                training_log, test_log, additional_columns = get_loaded_logs(
                    job.split)
            else:
                training_log, test_log, additional_columns = prepare_logs(
                    job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    # A single split is replaced by a duplicated double split pointing at
                    # the two logs created from the prepared train/test parts.
                    job.split = duplicate_orm_row(job.split)
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(
                        int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(EventLog(training_log), train_name + '.xes')
                    test_name = str(int(100 - (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(EventLog(test_log), test_name + '.xes')
                    job.split.additional_columns = str(
                        train_name + test_name)  # TODO: find better naming policy
                    # NOTE(review): only the job is saved here; confirm the split field changes are persisted elsewhere.
                    job.save()
                put_loaded_logs(job.split, training_log, test_log, additional_columns)
            training_df, test_df = encode_label_logs(
                training_log, test_log, job, additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        # Cache bypassed: prepare and encode from scratch, nothing is stored.
        training_log, test_log, additional_columns = prepare_logs(job.split)
        training_df, test_df = encode_label_logs(
            training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
def _calculate_and_evaluate(args) -> dict:
    """Evaluate one hyperparameter configuration and report it as a trial result.

    Builds a predictive model from the global job's model type/method plus
    ``args``, runs it once on copies of the global training/test frames and
    returns the loss derived from the optimizer's performance metric.

    :param args: hyperparameters merged into the model configuration
    :return: dict with ``loss`` and ``status``; on success also ``results``,
        ``model_split`` and ``config``, on failure a fixed loss of 100 with
        empty ``results`` / ``config``
    """
    global trial_number
    if trial_number % 20 == 0:
        print("Trial {}".format(trial_number))
    trial_number += 1

    local_job = global_job
    predictive_model = local_job.predictive_model.predictive_model
    prediction_method = local_job.predictive_model.prediction_method

    model_config = {
        'predictive_model': predictive_model,
        'prediction_method': prediction_method,
        **args
    }
    new_predictive_model = PredictiveModel.init(model_config)
    local_job.predictive_model = duplicate_orm_row(new_predictive_model)
    local_job = duplicate_orm_row(local_job)

    # The metric lives on the optimizer sub-object named after the
    # lower-cased optimization method.
    optimizer = local_job.hyperparameter_optimizer
    performance_metric = getattr(
        optimizer, optimizer.optimization_method.lower()).performance_metric
    multiplier = _get_metric_multiplier(performance_metric)

    # The run happens exactly once and inside the try: a failing configuration
    # must be reported as STATUS_FAIL rather than abort the whole search.
    try:
        results, model_split = run_by_type(training_df.copy(), test_df.copy(), local_job)
        return {
            'loss': -results[performance_metric] * multiplier,
            'status': STATUS_OK,
            'results': results,
            'model_split': model_split,
            'config': model_config
        }
    except Exception:
        # Best-effort by design; KeyboardInterrupt/SystemExit still propagate.
        return {
            'loss': 100,
            'status': STATUS_FAIL,
            'results': {},
            'config': {}
        }
def test_update(self):
    """Run a prediction job, clone it as an UPDATE job and call ``update`` with an incremental-train payload."""
    job = create_test_job()
    prediction_task(job.id)

    job2 = duplicate_orm_row(job)
    job.refresh_from_db()
    job2.incremental_train = job
    job2.type = JobTypes.UPDATE.value
    job2.save()

    initial_job = job2  #.to_dict()

    encoding_config = {
        'padding': False,
        'prefix_length': 1,
        'generation_type': 'only',
        'add_remaining_time': False,
        'add_elapsed_time': False,
        'add_executed_events': False,
        'add_resources_used': False,
        'add_new_traces': False,
        'features': [],
    }
    optimizer_config = {
        'algorithm_type': 'tpe',
        'max_evaluations': 10,
        'performance_metric': 'rmse',
        'type': 'none',
    }
    labelling_config = {
        'type': 'next_activity',
        'attribute_name': '',
        'threshold_type': 'threshold_mean',
        'threshold': 0,
    }
    payload = {
        'type': 'classification',
        'split_id': 1,
        'config': {
            'clusterings': ['noCluster'],
            'encodings': ['simpleIndex'],
            'encoding': encoding_config,
            'create_models': False,
            'methods': ['randomForest'],
            'kmeans': {},
            'incremental_train': [job.id],
            'hyperparameter_optimizer': optimizer_config,
            'labelling': labelling_config,
        }
    }

    generated_job = update(split=job.split, payload=payload)[0]  #.to_dict()
def test_replay(self):
    """``replay_task`` on a double sequential split over the runtime test log yields two requests."""
    training_job = create_test_job()
    runtime_job = duplicate_orm_row(training_job)

    runtime_log = create_test_log(
        log_name='runtime_example.xes',
        log_path='cache/log_cache/test_logs/runtime_test.xes',
    )
    # The same log serves as both the train and the test side of the split.
    runtime_split = create_test_split(
        split_type=SplitTypes.SPLIT_DOUBLE.value,
        split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
        train_log=runtime_log,
        test_log=runtime_log,
    )
    runtime_job.split = runtime_split

    requests = replay_task(runtime_job, training_job)
    self.assertEqual(len(requests), 2)
def save_randomised_set(initial_split_obj):
    """Duplicate a split and point it at ``RETRAIN``-prefixed train/test logs.

    :param initial_split_obj: split to duplicate
    :return: the saved duplicate, with ``additional_columns`` reset to ``None``
    """
    # todo: save new dataset in memory and create split to use it
    retrain_split = duplicate_orm_row(initial_split_obj)  # TODO future bug creates shadows

    train_log, _ = Log.objects.get_or_create(
        name='RETRAIN' + retrain_split.train_log.name,
        path='cache/log_cache/' + 'RETRAIN' + retrain_split.train_log.name,
        properties={},
    )
    test_log, _ = Log.objects.get_or_create(
        name='RETRAIN' + retrain_split.test_log.name,
        path='cache/log_cache/' + 'RETRAIN' + retrain_split.test_log.name,
        properties={},
    )

    retrain_split.train_log = train_log
    retrain_split.test_log = test_log
    retrain_split.additional_columns = None
    retrain_split.save()
    return retrain_split
def get_prediction(request, pk, explanation_target):
    """ Duplicate job ``pk`` as a REPLAY job and return its replay predictions

    :param request: json
    :param pk: primary key of the job to replay
    :param explanation_target: forwarded unchanged to ``replay_predictions``
    :return: Response (200 with the predictions, 404 when the job is unknown)
    """
    try:
        training_initial_job = Job.objects.get(pk=pk)
        replay_job = duplicate_orm_row(training_initial_job)
        replay_job.type = JobTypes.REPLAY.value
        replay_job.status = JobStatuses.CREATED.value
        replay_job.save()
    except Job.DoesNotExist:
        not_found = {'error': 'Job ' + str(pk) + ' not in database'}
        return Response(not_found, status=status.HTTP_404_NOT_FOUND)

    # The original job is read again from the database for the call below.
    predictions = replay_predictions(replay_job, Job.objects.get(pk=pk), explanation_target)
    return Response(predictions, status=status.HTTP_200_OK)
def replay_prediction_task(replay_prediction_job: Job, training_initial_job: Job, log: Log):
    """ Run a replay prediction task, asking the server for a prediction on a portion of a trace

    If the job's encoding prefix length does not match the longest trace in
    ``log``, a prediction job for that length is created and run, and the task
    recurses with a new REPLAY_PREDICT job that keeps the same parent job.

    :param replay_prediction_job: job to execute; status, results, event_number and error are updated
    :param training_initial_job: source job for ``create_prediction_job``
    :param log: iterable of traces, each supporting ``len``
    :raises Exception: any failure is logged, stored on the job, then re-raised
    """
    logger.info("Start replay_prediction task ID {}".format(replay_prediction_job.id))
    try:
        replay_prediction_job.status = JobStatuses.RUNNING.value
        replay_prediction_job.save()

        required_prefix = max(map(len, log))
        if replay_prediction_job.encoding.prefix_length != required_prefix:
            model_job = create_prediction_job(training_initial_job, required_prefix)
            prediction_task(model_job.id)
            model_job.refresh_from_db()

            next_replay_job = duplicate_orm_row(model_job)
            next_replay_job.split = Split.objects.filter(pk=replay_prediction_job.split.id)[0]
            next_replay_job.type = JobTypes.REPLAY_PREDICT.value
            next_replay_job.parent_job = replay_prediction_job.parent_job
            next_replay_job.status = JobStatuses.CREATED.value
            replay_prediction_task(next_replay_job, model_job, log)
            # The finally clause below still saves and publishes this job.
            return

        result_dict, events_for_trace = replay_prediction_calculate(replay_prediction_job, log)
        replay_prediction_job.results = dict(result_dict)
        replay_prediction_job.event_number = dict(events_for_trace)
        replay_prediction_job.status = JobStatuses.COMPLETED.value
        replay_prediction_job.error = ''
    except Exception as e:
        logger.error(e)
        replay_prediction_job.status = JobStatuses.ERROR.value
        replay_prediction_job.error = repr(e)
        raise
    finally:
        replay_prediction_job.save()
        publish(replay_prediction_job)
def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job configuration, loading from cache if possible.
    The cache is used only when ``use_cache`` is set and the job has a predictive model other than time series
    prediction. A cache entry whose file is missing (``FileNotFoundError``) is deleted and the function calls
    itself again.

    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame

    """
    logger.info('\tGetting Dataset')
    if use_cache and \
        (job.predictive_model is not None and
         job.predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value):
        # First level: encoded + labelled frames already stored for this split/encoding/labelling.
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            try:
                training_df, test_df = get_labelled_logs(job)
            except FileNotFoundError:  # cache invalidation
                # Drop the stale rows, then retry; the retry no longer finds a LabelledLog entry.
                LabelledLog.objects.filter(split=job.split,
                                           encoding=job.encoding,
                                           labelling=job.labelling).delete()
                logger.info('\t\tError pre-labeled cache invalidated!')
                return get_encoded_logs(job, use_cache)
        else:
            # Second level: raw logs already loaded for this split.
            if job.split.train_log is not None and \
                job.split.test_log is not None and \
                LoadedLog.objects.filter(split=job.split).exists():
                try:
                    training_log, test_log, additional_columns = get_loaded_logs(job.split)
                except FileNotFoundError:  # cache invalidation
                    LoadedLog.objects.filter(split=job.split).delete()
                    logger.info('\t\tError pre-loaded cache invalidated!')
                    return get_encoded_logs(job, use_cache)
            else:
                training_log, test_log, additional_columns = get_train_test_log(job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    # A single split is replaced by a duplicated double split pointing at
                    # the two logs created from the train/test parts.
                    job.split = duplicate_orm_row(Split.objects.filter(pk=job.split.pk)[0])
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(
                        EventLog(training_log),
                        train_name + '.xes'
                    )
                    test_name = str(int(100 - (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(
                        EventLog(test_log),
                        test_name + '.xes'
                    )
                    job.split.additional_columns = str(train_name + test_name)  # TODO: find better naming policy
                    # NOTE(review): the split is saved but the job is not; confirm the new job.split reference is persisted elsewhere.
                    job.split.save()

                put_loaded_logs(job.split, training_log, test_log, additional_columns)

            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        # Cache bypassed: load and encode from scratch, nothing is stored.
        training_log, test_log, additional_columns = get_train_test_log(job.split)
        training_df, test_df = encode_label_logs(training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
def check_predictive_model_not_overwrite(job: Job) -> None:
    """Give the job a duplicated predictive model when hyperparameter optimisation is enabled.

    Does nothing when the optimizer's method is NONE.

    :param job: job whose ``predictive_model`` is replaced by a duplicate and saved
    """
    optimization_method = job.hyperparameter_optimizer.optimization_method
    if optimization_method == HyperparameterOptimizationMethods.NONE.value:
        return

    current_model = PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0]
    job.predictive_model = duplicate_orm_row(current_model)
    job.predictive_model.save()
    job.save()
def handle(self, *args, **kwargs):
    """Randomise selected column values of a job's encoded logs and retrain on the result.

    Loads the encoded train/test frames of the hard-coded target job, replaces
    the targeted values with random picks from each column's observed values,
    stores the frames as labelled logs of a new prediction job on a duplicated
    split, runs that job and prints the metrics before and after.
    """
    TARGET_JOB = 439  # hard-coded primary key of the job to retrain
    initial_job_obj = Job.objects.filter(pk=TARGET_JOB)[0]

    # todo: return performances
    print('Initial Job:', initial_job_obj.evaluation.classificationmetrics)  # TODO future bug

    training_df_old, test_df_old = get_encoded_logs(initial_job_obj)
    # Work on copies so the originals stay available for the change check below.
    training_df = training_df_old.copy()
    test_df = test_df_old.copy()

    # todo: what should I randomise?
    # Each entry is a list of (column, value) pairs; a row is randomised when it
    # matches every pair of the entry.
    TARGETS = [
        [('prefix_1', 2)],  # <- simple pattern
        [('prefix_2', 3)],  # <- simple pattern
        [
            ('prefix_3', 2),
            ('prefix_4', 3),
        ]  # <- complex pattern
    ]
    for target in TARGETS:
        if len(target) == 1:
            target = target[0]
            for df in [training_df, test_df]:
                # The column is removed and re-added, so it ends up as the last column.
                m_col = df[target[0]]
                del df[target[0]]
                target_values1 = list(set(m_col.values))
                df[target[0]] = m_col.apply(
                    lambda x: x if (x != target[1]) else random.choice(target_values1))
        elif len(target) > 1:
            for df in [training_df, test_df]:
                m_col = df[[column for column, _ in target]]
                possible_values = {}
                for column, _ in target:
                    possible_values[column] = list(set(df[column]))
                    del df[column]
                # Row-wise: keep the row unless every targeted column holds its
                # targeted value, in which case all targeted columns are redrawn.
                df[[column for column, _ in target]] = m_col.apply(
                    lambda x: x if any([x[column] != value for column, value in target])
                    else Series({
                        column: random.choice(possible_values[column])
                        for column, value in target
                    }),
                    axis=1)
        else:
            raise Exception('target list with unexpected value')

    # Sanity check that the randomisation changed something.
    assert not training_df.equals(training_df_old)
    assert not test_df.equals(test_df_old)

    # todo: save new dataset in memory and create split to use it
    initial_split_obj = initial_job_obj.split
    new_split = duplicate_orm_row(initial_split_obj)
    train_log = duplicate_orm_row(new_split.train_log)
    test_log = duplicate_orm_row(new_split.test_log)
    # TODO future bug creates shadows
    # NOTE(review): the renamed logs are not saved in this block; confirm they are persisted elsewhere.
    train_log.name = 'RETRAIN' + train_log.name
    train_log.path = 'cache/log_cache/' + train_log.name
    train_log.properties = {}
    test_log.name = 'RETRAIN' + test_log.name
    test_log.path = 'cache/log_cache/' + test_log.name
    test_log.properties = {}
    new_split.train_log = train_log
    new_split.test_log = test_log
    new_split.additional_columns = None
    new_split.save()

    prediction_job = create_prediction_job(
        initial_job_obj, initial_job_obj.encoding.prefix_length)
    prediction_job.split = new_split
    prediction_job.split.save()
    prediction_job.save()

    # Stores the randomised frames as the labelled logs of the new job.
    put_labelled_logs(prediction_job, training_df, test_df)

    # todo: build model
    prediction_task(prediction_job.id, do_publish_result=False)
    prediction_job.refresh_from_db()

    # todo: return performances
    print('Retrain Job:', prediction_job.evaluation.classificationmetrics)
    print('Done, cheers!')