def _split_timeline(self, size: float, one_ts: bool) -> None:
    """
    Split an event log dataframe by time to perform split-validation.

    The preferred method is time splitting, removing incomplete traces.
    If the resulting testing set is smaller than 10% of the log size,
    the fallback method sorts traces by start time and splits taking
    whole traces, whether or not they are fully contained in the
    timeframe.

    Parameters
    ----------
    size : float, validation percentage.
    one_ts : bool, Support only one timestamp.
    """
    # Split log data with the containment-based timeline method first
    splitter = ls.LogSplitter(self.log.data)
    train, valdn = splitter.split_log('timeline_contained', size, one_ts)
    total_events = len(self.log.data)
    # Check size and fall back to whole-trace splitting if necessary
    if len(valdn) < int(total_events * 0.1):
        train, valdn = splitter.split_log('timeline_trace', size, one_ts)
    # Sort key depends on whether the log carries one or two timestamps
    key = 'end_timestamp' if one_ts else 'start_timestamp'
    valdn = pd.DataFrame(valdn)
    train = pd.DataFrame(train)
    # If the log is big, sample the train partition
    train = self._sample_log(train)
    # Save partitions; train is stored back into a copy of the log object
    self.log_valdn = (valdn.sort_values(key, ascending=True)
                      .reset_index(drop=True))
    self.log_train = copy.deepcopy(self.log)
    self.log_train.set_data(train.sort_values(key, ascending=True)
                            .reset_index(drop=True)
                            .to_dict('records'))
def _split_timeline(self, size: float, one_ts: bool) -> None:
    """
    Partition the event log by time for split-validation.

    The containment-based timeline split (which drops incomplete
    traces) is tried first. When the resulting validation set holds
    fewer than 10% of all events, the split falls back to ordering
    traces by start time and taking whole traces regardless of
    whether they fit inside the timeframe.

    Parameters
    ----------
    size : float, validation percentage.
    one_ts : bool, Support only one timestamp.
    """
    splitter = ls.LogSplitter(self.log)
    train, valdn = splitter.split_log('timeline_contained', size, one_ts)
    total_events = len(self.log)
    # Fall back to whole-trace splitting when validation is too small
    if len(valdn) < int(total_events * 0.1):
        train, valdn = splitter.split_log('timeline_trace', size, one_ts)
    key = 'end_timestamp' if one_ts else 'start_timestamp'
    valdn_df = pd.DataFrame(valdn)
    train_df = pd.DataFrame(train)
    # Drop the artificial Start/End events before saving the partitions
    dummies = ['Start', 'End']
    valdn_df = valdn_df[~valdn_df.task.isin(dummies)]
    train_df = train_df[~train_df.task.isin(dummies)]
    self.log_valdn = (valdn_df.sort_values(key, ascending=True)
                      .reset_index(drop=True))
    self.log_train = (train_df.sort_values(key, ascending=True)
                      .reset_index(drop=True))
def _report_split(reference, train, test, check_gap):
    """Print sizes and time bounds of a split and assert its invariants.

    Parameters
    ----------
    reference : list of dicts, the full event log the split came from.
    train, test : list of dicts, the two partitions.
    check_gap : bool, also assert that train ends strictly before test
        starts — this only holds for the contained timeline split, where
        partitions are time-disjoint.
    """
    print(len(reference))
    print(len(train))
    print(len(test))
    log_min = pd.DataFrame(reference).start_timestamp.min()
    log_max = pd.DataFrame(reference).end_timestamp.max()
    train_min = pd.DataFrame(train).start_timestamp.min()
    train_max = pd.DataFrame(train).end_timestamp.max()
    test_min = pd.DataFrame(test).start_timestamp.min()
    test_max = pd.DataFrame(test).end_timestamp.max()
    print(log_min)
    print(train_min)
    print(train_max)
    print(test_min)
    print(test_max)
    print(log_max)
    # The split must cover the full time range of the reference log
    assert (log_min == train_min)
    assert (log_max == test_max)
    assert (train_max < log_max)
    assert (train_max < test_max)
    if check_gap:
        assert (train_max < test_min)


def split_log_test():
    """Exercise LogSplitter's three split methods on the purchasing fixture.

    Reads the PurchasingExample.xes fixture, then checks the
    'timeline_contained', 'timeline_trace' and 'random' split methods,
    printing partition sizes/bounds and asserting time-range invariants.
    """
    # Event log reading
    column_names = {'Case ID': 'caseid',
                    'Activity': 'task',
                    'lifecycle:transition': 'event_type',
                    'Resource': 'user'}
    settings = {'timeformat': '%Y-%m-%dT%H:%M:%S.%f',
                'column_names': column_names,
                'one_timestamp': False,
                'filter_d_attrib': True}
    log = lr.LogReader(
        os.path.join('tests', 'fixtures', 'PurchasingExample.xes'), settings)
    splitter = tl.LogSplitter(log.data)
    # Contained timeline split: partitions must be time-disjoint
    train, test = splitter.split_log('timeline_contained', 0.8,
                                     settings['one_timestamp'])
    _report_split(log.data, train, test, check_gap=True)
    print('##################')
    # Re-splitting the train partition must also work
    splitter2 = tl.LogSplitter(train)
    train, test = splitter2.split_log('timeline_contained', 0.8,
                                      settings['one_timestamp'])
    print(len(train))
    print(len(test))
    print('##################')
    # Whole-trace split: traces may straddle the boundary, so no gap check
    train, test = splitter.split_log('timeline_trace', 0.8,
                                     settings['one_timestamp'])
    _report_split(log.data, train, test, check_gap=False)
    print('##################')
    print(len(pd.DataFrame(log.data).caseid.unique()))
    # Random split returns dataframes directly (no time invariants hold)
    splitter3 = tl.LogSplitter(log.data)
    train, test = splitter3.split_log('random', 0.8,
                                      settings['one_timestamp'])
    print(len(train.caseid.unique()))
    print(len(test.caseid.unique()))