Example #1
0
    def _split_timeline(self, size: float, one_ts: bool) -> None:
        """
        Split an event log dataframe by time to perform split-validation.

        The preferred method is time splitting with incomplete traces
        removed.  If the resulting validation set is smaller than 10% of
        the log size, fall back to sorting traces by start time and
        splitting whole traces, regardless of whether they are fully
        contained in the timeframe.

        Sets ``self.log_valdn`` (sorted DataFrame) and ``self.log_train``
        (deep copy of ``self.log`` with the sampled train records).

        Parameters
        ----------
        size : float
            Validation percentage.
        one_ts : bool
            Support only one timestamp.
        """
        # Split log data
        splitter = ls.LogSplitter(self.log.data)
        train, valdn = splitter.split_log('timeline_contained', size, one_ts)
        total_events = len(self.log.data)
        # Fall back to trace-based splitting if the validation partition
        # came out too small (< 10% of all events)
        if len(valdn) < int(total_events * 0.1):
            train, valdn = splitter.split_log('timeline_trace', size, one_ts)
        # Sort key depends on whether the log carries one or two timestamps
        key = 'end_timestamp' if one_ts else 'start_timestamp'
        valdn = pd.DataFrame(valdn)
        train = pd.DataFrame(train)
        # If the log is big, sample the train partition
        train = self._sample_log(train)
        # Save partitions
        self.log_valdn = (valdn.sort_values(
            key, ascending=True).reset_index(drop=True))
        self.log_train = copy.deepcopy(self.log)
        self.log_train.set_data(
            train.sort_values(
                key, ascending=True).reset_index(drop=True).to_dict('records'))
    def _split_timeline(self, size: float, one_ts: bool) -> None:
        """
        Split an event log dataframe by time to perform split-validation.

        The preferred method is time splitting with incomplete traces
        removed.  If the resulting validation set is smaller than 10% of
        the log size, fall back to sorting traces by start time and
        splitting whole traces, regardless of whether they are fully
        contained in the timeframe.

        Sets ``self.log_valdn`` and ``self.log_train`` as sorted
        DataFrames with artificial 'Start'/'End' events removed.

        Parameters
        ----------
        size : float
            Validation percentage.
        one_ts : bool
            Support only one timestamp.
        """
        # NOTE(review): this definition shares its name with another
        # _split_timeline variant in this file; the later one shadows the
        # earlier — confirm which implementation is intended.
        # Split log data
        splitter = ls.LogSplitter(self.log)
        train, valdn = splitter.split_log('timeline_contained', size, one_ts)
        total_events = len(self.log)
        # Fall back to trace-based splitting if the validation partition
        # came out too small (< 10% of all events)
        if len(valdn) < int(total_events * 0.1):
            train, valdn = splitter.split_log('timeline_trace', size, one_ts)
        # Sort key depends on whether the log carries one or two timestamps
        key = 'end_timestamp' if one_ts else 'start_timestamp'
        valdn = pd.DataFrame(valdn)
        train = pd.DataFrame(train)
        # Drop the artificial 'Start'/'End' dummy events before saving
        valdn = valdn[~valdn.task.isin(['Start', 'End'])]
        train = train[~train.task.isin(['Start', 'End'])]
        self.log_valdn = (valdn.sort_values(
            key, ascending=True).reset_index(drop=True))
        self.log_train = (train.sort_values(
            key, ascending=True).reset_index(drop=True))
def split_log_test():
    """
    Manual smoke test for the LogSplitter strategies.

    Reads the PurchasingExample fixture, then checks that the
    'timeline_contained' split preserves the log's outer time boundaries
    and keeps the train partition strictly before the test partition,
    that 'timeline_trace' preserves the boundaries (overlap allowed,
    since whole traces are kept together), and that 'random' partitions
    whole cases.
    """
    def _bounds(records):
        # Earliest start and latest end timestamp of an event collection.
        frame = pd.DataFrame(records)
        return frame.start_timestamp.min(), frame.end_timestamp.max()

    # Event log reading
    column_names = {
        'Case ID': 'caseid',
        'Activity': 'task',
        'lifecycle:transition': 'event_type',
        'Resource': 'user'
    }
    settings = {
        'timeformat': '%Y-%m-%dT%H:%M:%S.%f',
        'column_names': column_names,
        'one_timestamp': False,
        'filter_d_attrib': True
    }

    log = lr.LogReader(
        os.path.join('tests', 'fixtures', 'PurchasingExample.xes'), settings)

    splitter = tl.LogSplitter(log.data)
    train, test = splitter.split_log('timeline_contained', 0.8,
                                     settings['one_timestamp'])
    print(len(log.data))
    print(len(train))
    print(len(test))

    log_min, log_max = _bounds(log.data)
    train_min, train_max = _bounds(train)
    test_min, test_max = _bounds(test)

    print(log_min)
    print(train_min)
    print(train_max)
    print(test_min)
    print(test_max)
    print(log_max)

    # Contained split: outer boundaries preserved, train strictly
    # precedes test in time.
    assert log_min == train_min
    assert log_max == test_max
    assert train_max < log_max
    assert train_max < test_max
    assert train_max < test_min
    print('##################')
    # The splitter should also work on an already-split partition.
    splitter2 = tl.LogSplitter(train)
    train, test = splitter2.split_log('timeline_contained', 0.8,
                                      settings['one_timestamp'])
    print(len(train))
    print(len(test))

    print('##################')
    train, test = splitter.split_log('timeline_trace', 0.8,
                                     settings['one_timestamp'])
    print(len(log.data))
    print(len(train))
    print(len(test))

    log_min, log_max = _bounds(log.data)
    train_min, train_max = _bounds(train)
    test_min, test_max = _bounds(test)

    print(log_min)
    print(train_min)
    print(train_max)
    print(test_min)
    print(test_max)
    print(log_max)

    # Trace split: whole traces are kept together, so train and test may
    # overlap in time; only the outer boundaries are checked.
    assert log_min == train_min
    assert log_max == test_max
    assert train_max < log_max
    assert train_max < test_max
    print('##################')
    # Random split: whole cases are assigned to either partition.
    print(len(pd.DataFrame(log.data).caseid.unique()))
    splitter3 = tl.LogSplitter(log.data)
    train, test = splitter3.split_log('random', 0.8, settings['one_timestamp'])
    print(len(train.caseid.unique()))
    print(len(test.caseid.unique()))