def test_random(self):
    split = split_single(
        split_ordering=SplitOrderingMethods.SPLIT_RANDOM.value)
    training_log1, _, _ = get_train_test_log(split)
    training_log2, _, _ = get_train_test_log(split)
    training_names1 = trace_names(training_log1)
    training_names2 = trace_names(training_log2)
    # two random orderings of the same split are expected to differ
    self.assertNotEqual(training_names1, training_names2)
def retrieve_proper_encoder(job: Job) -> Encoder:
    # an incremental-training job must reuse the encoder of the job it extends
    if job.incremental_train is not None:
        return retrieve_proper_encoder(job.incremental_train)
    training_log, test_log, additional_columns = get_train_test_log(job.split)
    training_df, _ = encode_label_logs(training_log, test_log, job,
                                       additional_columns=additional_columns,
                                       encode=False)
    return Encoder(training_df, job.encoding)
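# The recursion above walks the incremental-training chain up to the root
# job, since an incremental job must encode its data exactly as the original
# training job did. A minimal sketch of that chain resolution in isolation
# (resolve_base_job is a hypothetical helper, not part of the codebase):
def resolve_base_job(job: Job) -> Job:
    # follow incremental_train links until the original training job is reached
    while job.incremental_train is not None:
        job = job.incremental_train
    return job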
def test_sequential(self):
    split = split_single(
        split_ordering=SplitOrderingMethods.SPLIT_SEQUENTIAL.value)
    training_log, test_log, _ = get_train_test_log(split)
    training_names = trace_names(training_log)
    test_names = trace_names(test_log)
    self.assertListEqual(['3', '2', '1', '6'], training_names)
    self.assertListEqual(['5', '4'], test_names)
def test_strict_temporal(self):
    split = split_single(
        split_ordering=SplitOrderingMethods.SPLIT_STRICT_TEMPORAL.value)
    training_log, test_log, _ = get_train_test_log(split)
    training_names = trace_names(training_log)
    test_names = trace_names(test_log)
    # Modified log to have only one trace here
    self.assertListEqual(['1'], sorted(training_names))
    self.assertListEqual(sorted(['6', '4']), sorted(test_names))
def test_temporal(self):
    split = split_single(
        split_ordering=SplitOrderingMethods.SPLIT_TEMPORAL.value)
    training_log, test_log, _ = get_train_test_log(split)
    training_names = trace_names(training_log)
    test_names = trace_names(test_log)
    self.assertListEqual(sorted(['1', '2', '3', '5']), sorted(training_names))
    self.assertListEqual(sorted(['6', '4']), sorted(test_names))
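# The ordering tests above rely on a trace_names helper that is not shown in
# this snippet. A minimal sketch, assuming each trace stores its identifier
# in the standard XES 'concept:name' attribute:
def trace_names(log) -> list:
    """Return the 'concept:name' attribute of every trace in the log."""
    return [trace.attributes['concept:name'] for trace in log]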
def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """Returns the training and test DataFrames encoded using the given job
    configuration, loading them from cache when possible.

    :param job: job configuration
    :param use_cache: whether to load saved datasets from the cache
    :return: training and testing DataFrames
    """
    logger.info('\tGetting Dataset')
    if use_cache and \
        (job.predictive_model is not None and
         job.predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value):
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            try:
                training_df, test_df = get_labelled_logs(job)
            except FileNotFoundError:  # cache invalidation
                LabelledLog.objects.filter(split=job.split,
                                           encoding=job.encoding,
                                           labelling=job.labelling).delete()
                logger.info('\t\tError: pre-labeled cache invalidated!')
                return get_encoded_logs(job, use_cache)
        else:
            if job.split.train_log is not None and \
               job.split.test_log is not None and \
               LoadedLog.objects.filter(split=job.split).exists():
                try:
                    training_log, test_log, additional_columns = get_loaded_logs(job.split)
                except FileNotFoundError:  # cache invalidation
                    LoadedLog.objects.filter(split=job.split).delete()
                    logger.info('\t\tError: pre-loaded cache invalidated!')
                    return get_encoded_logs(job, use_cache)
            else:
                training_log, test_log, additional_columns = get_train_test_log(job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    search_for_already_existing_split = Split.objects.filter(
                        type=SplitTypes.SPLIT_DOUBLE.value,
                        original_log=job.split.original_log,
                        test_size=job.split.test_size,
                        splitting_method=job.split.splitting_method)
                    if len(search_for_already_existing_split) >= 1:
                        # reuse an equivalent double split instead of creating a new one
                        job.split = search_for_already_existing_split[0]
                        job.split.save()
                        job.save()
                        return get_encoded_logs(job, use_cache=use_cache)
                    else:
                        # job.split = duplicate_orm_row(Split.objects.filter(pk=job.split.pk)[0])  # TODO: replace with simple CREATE
                        job.split = Split.objects.create(
                            type=job.split.type,
                            original_log=job.split.original_log,
                            test_size=job.split.test_size,
                            splitting_method=job.split.splitting_method,
                            train_log=job.split.train_log,
                            test_log=job.split.test_log,
                            additional_columns=job.split.additional_columns)  # TODO: future bug if the object changes
                        job.split.type = SplitTypes.SPLIT_DOUBLE.value
                        train_name = 'SPLITTED_' + job.split.original_log.name.split('.')[0] + \
                                     '_0-' + str(int(100 - (job.split.test_size * 100)))
                        job.split.train_log = create_log(
                            EventLog(training_log), train_name + '.xes')
                        test_name = 'SPLITTED_' + job.split.original_log.name.split('.')[0] + \
                                    '_' + str(int(100 - (job.split.test_size * 100))) + '-100'
                        job.split.test_log = create_log(
                            EventLog(test_log), test_name + '.xes')
                        job.split.additional_columns = str(train_name + test_name)  # TODO: find better naming policy
                        job.split.save()
                put_loaded_logs(job.split, training_log, test_log, additional_columns)
            training_df, test_df = encode_label_logs(
                training_log, test_log, job, additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = get_train_test_log(job.split)
        training_df, test_df = encode_label_logs(
            training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
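# get_encoded_logs applies the same caching pattern twice: try to load a
# cached artefact, and if its backing file has gone missing, delete the stale
# DB row and recompute by recursing. A generic sketch of that pattern
# (load_fn, invalidate_fn and rebuild_fn are hypothetical placeholders):
def load_with_invalidation(load_fn, invalidate_fn, rebuild_fn):
    try:
        return load_fn()     # cache hit: reuse the stored artefact
    except FileNotFoundError:
        invalidate_fn()      # the DB row points at a file that no longer exists
        return rebuild_fn()  # fall back to recomputing from scratch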
def progetto_padova():
    JOB = Job.objects.get_or_create(
        status=JobStatuses.CREATED.value,
        type=JobTypes.PREDICTION.value,
        split=Split.objects.get_or_create(  # this creates the split of the log
            type=SplitTypes.SPLIT_DOUBLE.value,
            train_log=create_log(  # this imports the log
                import_log(BASE_DIR + RELATIVE_TRAIN_PATH),
                RELATIVE_TRAIN_PATH,
                BASE_DIR,
                import_in_cache=False),
            test_log=create_log(  # this imports the log
                import_log(BASE_DIR + RELATIVE_VALIDATION_PATH),
                RELATIVE_VALIDATION_PATH,
                BASE_DIR,
                import_in_cache=False))[0],
        encoding=Encoding.objects.get_or_create(  # this defines the encoding method
            data_encoding=DataEncodings.LABEL_ENCODER.value,
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            add_elapsed_time=False,
            add_remaining_time=False,
            add_executed_events=False,
            add_resources_used=False,
            add_new_traces=False,
            prefix_length=5,
            padding=True,
            task_generation_type=TaskGenerationTypes.ALL_IN_ONE.value,
            features=[])[0],
        labelling=Labelling.objects.get_or_create(  # this defines the label
            type=LabelTypes.ATTRIBUTE_STRING.value,
            attribute_name='label',
            threshold_type=None,
            threshold=None)[0],
        clustering=Clustering.init(ClusteringMethods.NO_CLUSTER.value,
                                   configuration={}),
        predictive_model=PredictiveModel.init(  # this defines the predictive model
            get_prediction_method_config(
                PredictiveModels.CLASSIFICATION.value,
                ClassificationMethods.DECISION_TREE.value,
                payload={
                    'max_depth': 2,
                    'min_samples_split': 2,
                    'min_samples_leaf': 2
                })),
        hyperparameter_optimizer=HyperparameterOptimization.init({  # this defines the hyperparameter optimisation procedure
            'type': HyperparameterOptimizationMethods.HYPEROPT.value,
            'max_evaluations': 10,
            'algorithm_type': HyperOptAlgorithms.TPE.value,
            'performance_metric': HyperOptLosses.AUC.value
        }),
        create_models=True)[0]

    # load log
    train_log, test_log, additional_columns = get_train_test_log(JOB.split)

    # encode
    train_df, test_df = encode_label_logs(train_log, test_log, JOB)

    # train + evaluate
    results, model_split = MODEL[JOB.predictive_model.predictive_model][
        ModelActions.BUILD_MODEL_AND_TEST.value](train_df, test_df,
                                                 _init_clusterer(JOB.clustering, train_df),
                                                 JOB)
    if JOB.create_models:
        save_models(model_split, JOB)

    # predict
    data_df = pd.concat([train_df, test_df])
    results = MODEL[JOB.predictive_model.predictive_model][
        ModelActions.PREDICT.value](JOB, data_df)
    results_proba = MODEL[JOB.predictive_model.predictive_model][
        ModelActions.PREDICT_PROBA.value](JOB, data_df)

    # lime
    exp = Explanation.objects.get_or_create(
        type=ExplanationTypes.LIME.value,
        split=JOB.split,  # the analysed log; it may differ from the training log
        predictive_model=JOB.predictive_model,
        job=JOB)[0]
    error, result = explanation(exp.id, int(EXPLANATION_TARGET))
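# A hedged invocation sketch for the demo above, assuming BASE_DIR,
# RELATIVE_TRAIN_PATH, RELATIVE_VALIDATION_PATH and EXPLANATION_TARGET are
# defined at module level as the function expects:
if __name__ == '__main__':
    progetto_padova()  # runs import -> split -> encode -> train -> predict -> explain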
def test_size(self):
    split = split_single(test_size=0.5)
    training_log, test_log, _ = get_train_test_log(split)
    self.assertEqual(3, len(training_log))
    self.assertEqual(3, len(test_log))
def test_split_double(self):
    training_log, test_log, _ = get_train_test_log(split_double())
    self.assertEqual(4, len(training_log))
    self.assertEqual(2, len(test_log))
def replay_core(replay_job: Job, training_initial_job: Job) -> list:
    """Builds the set of event timestamps, then issues a list of requests that
    simulate the log being replayed as time passes.

    :param replay_job: replay job configuration
    :param training_initial_job: training job configuration
    :return: list of responses to the sent requests
    """
    split = replay_job.split
    log = get_log(split.train_log)
    requests_list = list()

    # copy the log, preserving log- and trace-level attributes
    eventlog = EventLog()
    for key in log.attributes.keys():
        eventlog.attributes[key] = log.attributes[key]
    for trace in log:
        new_trace = Trace(trace)
        for key in trace.attributes:
            new_trace.attributes[key] = trace.attributes[key]
        eventlog.append(new_trace)

    times = sorted(set([event['time:timestamp']
                        for trace in eventlog for event in trace]))
    for t in times[2::int((len(times) - 2) / 5)]:
        filtered_eventlog = timestamp_filter.apply_events(
            eventlog, times[0].replace(tzinfo=None), t.replace(tzinfo=None))
        trace_list = list()
        event_number = dict()
        for trace in filtered_eventlog:
            trace_list.append(trace.attributes['concept:name'])
            event_number[trace.attributes['concept:name']] = len(trace)
        replay_job.case_id = trace_list
        replay_job.event_number = event_number
        replay_job.save()
        try:  # TODO: check logger usage
            logger.info("Sending request for replay_prediction task.")
            r = requests.post(
                url="http://server:8000/runtime/replay_prediction/",
                data=export_log_as_string(filtered_eventlog),
                params={'jobId': replay_job.id,
                        'training_job': training_initial_job.id},
                headers={'Content-Type': 'text/plain', 'charset': 'UTF-8'})
            requests_list.append(str(r))
        except Exception as e:
            requests_list.append(str(e))
            logger.warning(str(e))

    training_log, test_log, additional_columns = get_train_test_log(replay_job.split)
    training_df, _ = encode_label_logs(training_log, test_log, replay_job,
                                       additional_columns=additional_columns)
    gold_values = dict(zip(training_df['trace_id'], training_df['label']))
    parent_id = replay_job.id
    # final_job = duplicate_orm_row(replay_job)  # TODO: replace with simple CREATE
    final_job = Job.objects.create(
        created_date=replay_job.created_date,
        modified_date=replay_job.modified_date,
        error=replay_job.error,
        status=replay_job.status,
        type=replay_job.type,
        create_models=replay_job.create_models,
        case_id=replay_job.case_id,
        event_number=replay_job.event_number,
        gold_value=replay_job.gold_value,
        results=replay_job.results,
        parent_job=replay_job.parent_job,
        split=replay_job.split,
        encoding=replay_job.encoding,
        labelling=replay_job.labelling,
        clustering=replay_job.clustering,
        predictive_model=replay_job.predictive_model,
        evaluation=replay_job.evaluation,
        hyperparameter_optimizer=replay_job.hyperparameter_optimizer,
        incremental_train=replay_job.incremental_train)
    final_job.parent_job = Job.objects.filter(pk=parent_id)[0]
    final_job.gold_value = gold_values
    final_job.type = JobTypes.REPLAY_PREDICT.value
    final_job.save()
    return requests_list
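# The replay loop above samples roughly five growing time windows from the
# sorted set of event timestamps, each window starting at the first event.
# A standalone sketch of that slicing, with plain integers standing in for
# timestamps:
times = sorted(range(100))        # stand-in for the sorted event timestamps
step = int((len(times) - 2) / 5)  # roughly five evenly spaced cut points
windows = [(times[0], t) for t in times[2::step]]
# each (start, end) pair corresponds to one timestamp_filter.apply_events call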