Ejemplo n.º 1
0
    def _get_variants(self):
        """Enumerate all process variants and their probabilities.

        A variant is a simple path through ``self.graph`` from the start
        event to the end event.  Each variant becomes a Case with the
        start/end markers stripped; its probability assumes a uniform choice
        among the successors of every node on the path.

        :return: tuple of (EventLog of variant cases, list of probabilities).
        """
        # variants; sorted() gives a deterministic ordering of the paths
        variants = sorted(
            nx.all_simple_paths(self.graph,
                                source=self.start_event,
                                target=self.end_event))
        traces = [
            Case(id=i + 1, events=[Event(name=e) for e in v[1:-1]])
            for i, v in enumerate(variants)
        ]

        # probabilities
        # Build the out-degree map once instead of rescanning all edges for
        # every node of every path (the original was O(E) per node).
        num_successors = {node: 0 for node in self.graph.nodes()}
        for source, _ in self.graph.edges():
            num_successors[source] += 1

        # np.prod replaces np.product, which is deprecated and removed in
        # NumPy 2.0.
        probabilities = [
            np.prod([1 / max(1, num_successors[node]) for node in path])
            for path in variants
        ]

        # set globally
        self._variants = EventLog(cases=traces)
        self._variant_probabilities = probabilities

        return self._variants, self._variant_probabilities
Ejemplo n.º 2
0
    def event_log(self):
        """Return the event log object of this dataset.

        Lazily loads the log via ``EventLog.load`` on first access and
        caches it in ``self._event_log``.

        :raises ValueError: if no dataset name has been set.
        """
        if self.dataset_name is None:
            # The original message interpolated the (always-None) name,
            # yielding "dataset None cannot be found"; state the real problem.
            raise ValueError('no dataset name set; cannot load event log')

        if self._event_log is None:
            self._event_log = EventLog.load(self.dataset_name)
        return self._event_log
Ejemplo n.º 3
0
    def load(self, dataset_name):
        """
        Load dataset from disk. If there exists a cached file, load from cache. If no cache file exists, load from
        Event Log and cache it.

        :param dataset_name: name of the dataset to load.
        :raises FileNotFoundError: if neither a cache file nor an event log
            file exists for the given name.
        :return: None
        """
        el_file = EventLogFile(dataset_name)
        self.dataset_name = el_file.name

        # Check for cache
        if el_file.cache_file.exists():
            self._load_dataset_from_cache(el_file.cache_file)

        # Else generate from event log
        elif el_file.path.exists():
            self._event_log = EventLog.load(el_file.path)
            self.from_event_log(self._event_log)
            self._cache_dataset(el_file.cache_file)

        # Previously this fell through silently, leaving the dataset object
        # half-initialized; fail loudly instead.
        else:
            raise FileNotFoundError(
                f'no cache or event log found for dataset {dataset_name}')
Ejemplo n.º 4
0
    'bpic12-0.0-0.json.gz',
    'bpic17-0.0-1.json.gz',
    'bpic17-0.0-2.json.gz'
]

np.random.seed(0)  # fix the RNG state so the injected anomalies are reproducible

# Anomaly percentage(s) to apply to each log.
ps = [0.3]

# Only the clean (p == 0.0) BPIC logs qualify as input.
event_log_paths = [f.path
                   for f in get_event_log_files(EVENTLOG_DIR)
                   if 'bpic' in f.name and f.p == 0.0]

# Cartesian product of logs and anomaly percentages, materialized up front.
combinations = [(path, p) for path in event_log_paths for p in ps]
for event_log_path, p in tqdm(combinations, desc='Add anomalies'):
    event_log_file = EventLogFile(event_log_path)
    event_log = EventLog.from_json(event_log_path)

    anomalies = [
        ReplaceAnomaly(max_replacements=1),
        SkipSequenceAnomaly(max_sequence_size=2),
        ReworkAnomaly(max_distance=5, max_sequence_size=3),
        EarlyAnomaly(max_distance=5, max_sequence_size=2),
        LateAnomaly(max_distance=5, max_sequence_size=2),
        InsertAnomaly(max_inserts=2)
    ]

    #     if event_log.num_event_attributes > 0:
    #         anomalies.append(AttributeAnomaly(max_events=3, max_attributes=min(2, event_log.num_activities)))

    for anomaly in anomalies:
        # This is necessary to initialize the likelihood graph correctly
Ejemplo n.º 5
0
from april.generation import CategoricalAttributeGenerator
from april.generation.anomaly import *
from april.processmining.log import EventLog

# XES source logs and the JSON files they are converted into; the two lists
# are aligned by index.
xes_files = ['large_log.xes.gz', 'small_log.xes.gz']
json_files = ['largelog-0.0-0.json.gz', 'smalllog-0.0-0.json.gz']

# Convert every XES log into the JSON representation used by the pipeline.
for source_name, target_name in tqdm(list(zip(xes_files, json_files))):
    event_log = EventLog.from_xes(os.path.join(BPIC_DIR, source_name))
    event_log.save_json(os.path.join(EVENTLOG_DIR, target_name))

# Add anomalies

# Repeat the anomaly injection with ten different seeds so each run k is
# independently reproducible.
for k in range(10):  # range(0, 10) -> range(10): identical, more idiomatic
    np.random.seed(k)  # This will ensure reproducibility
    # Anomaly percentage(s) to inject.
    ps = [0.3]
    # Only the clean (p == 0.0) logs serve as input.
    event_log_paths = [e.path for e in get_event_log_files(EVENTLOG_DIR)
                       if 'log' in e.name and e.p == 0.0]

    combinations = list(itertools.product(event_log_paths, ps))
    for event_log_path, p in tqdm(combinations, desc='Add anomalies'):
        event_log_file = EventLogFile(event_log_path)
        event_log = EventLog.from_json(event_log_path)
    def generate(self,
                 size,
                 anomalies=None,
                 anomaly_p=None,
                 anomaly_type_p=None,
                 activity_dependency_p=.5,
                 attribute_dependency_p=.5,
                 probability_variance_max=None,
                 seed=None,
                 show_progress='tqdm',
                 likelihood_graph=None):
        """Generate a synthetic event log via random walks over the likelihood graph.

        :param size: number of cases to generate.
        :param anomalies: list of anomaly objects to apply; None means no
            anomalies (the original crashed on the default).
        :param anomaly_p: probability that a case is made anomalous; None
            disables anomalies (the original crashed on the default).
        :param anomaly_type_p: selection probabilities over ``anomalies``.
        :param activity_dependency_p: forwarded to build_likelihood_graph.
        :param attribute_dependency_p: forwarded to build_likelihood_graph.
        :param probability_variance_max: forwarded to build_likelihood_graph.
        :param seed: NumPy RNG seed; None leaves the RNG state untouched.
        :param show_progress: 'tqdm', 'tqdm_notebook', or anything else for
            no progress bar.
        :param likelihood_graph: use this graph instead of building a new one.
        :return: the generated EventLog.
        """

        def random_walk(g):
            """Walk from the start symbol to the end symbol, following the
            edges' 'probability' weights; return the path without markers."""
            node = EventLog.start_symbol

            # Random walk until we reach the end event
            path = []
            while node != EventLog.end_symbol:
                # Skip the start node
                if node != EventLog.start_symbol:
                    path.append(node)

                # Get successors for node
                successors = list(g.successors(node))

                # Retrieve probabilities from the outgoing edges
                p = [g.edges[node, s]['probability'] for s in successors]

                # Renormalize to compensate for floating point rounding; the
                # guard avoids a division by zero on an all-zero row.
                if np.sum(p) != 0:
                    p /= np.sum(p)

                # Choose a random successor based on the probabilities
                node = np.random.choice(successors, p=p)

            return path

        if seed is not None:
            np.random.seed(seed)

        # Treat a missing anomaly list as "no anomalies"; the original raised
        # a TypeError iterating over None below.
        if anomalies is None:
            anomalies = []

        # Build the likelihood graph
        # TODO: Persist the likelihood graph
        if likelihood_graph is not None:
            self.likelihood_graph = likelihood_graph
        else:
            self.build_likelihood_graph(
                activity_dependency_p=activity_dependency_p,
                attribute_dependency_p=attribute_dependency_p,
                probability_variance_max=probability_variance_max,
                seed=seed)

        # Collect the activity names from the likelihood graph, excluding the
        # artificial start/end markers.
        activities = sorted({
            self.likelihood_graph.nodes[node]['value']
            for node in self.likelihood_graph
            if self.likelihood_graph.nodes[node]['name'] == 'name'
            and self.likelihood_graph.nodes[node]['value'] not in
            [EventLog.start_symbol, EventLog.end_symbol]
        })

        # Add metadata to the anomalies (NoneAnomaly included, one loop).
        none_anomaly = NoneAnomaly()
        for anomaly in [none_anomaly] + list(anomalies):
            anomaly.activities = activities
            anomaly.graph = self.likelihood_graph
            anomaly.attributes = self.event_attributes

        # Generate the event log; 'case_iter' avoids shadowing builtin iter().
        if show_progress == 'tqdm':
            from tqdm import tqdm
            case_iter = tqdm(range(size), desc='Generate event log')
        elif show_progress == 'tqdm_notebook':
            from tqdm import tqdm_notebook
            case_iter = tqdm_notebook(range(size), desc='Generate event log')
        else:
            case_iter = range(size)

        # Apply anomalies and add case ids
        cases = []
        for case_id, path in enumerate(
            [random_walk(self.likelihood_graph) for _ in case_iter], start=1):
            # anomaly_p=None disables anomalies; the original crashed here
            # comparing a float against None.
            if anomaly_p is not None and np.random.uniform(0, 1) <= anomaly_p:
                anomaly = np.random.choice(anomalies, p=anomaly_type_p)
            else:
                anomaly = none_anomaly
            case = anomaly.apply_to_path(path)
            case.id = case_id
            cases.append(case)

        event_log = EventLog(cases=cases)

        event_log.attributes['generation_parameters'] = dict(
            size=size,
            attributes=[a.json for a in self.event_attributes],
            anomalies=[a.json for a in anomalies],
            anomaly_p=anomaly_p,
            anomaly_type_p=anomaly_type_p,
            activity_dependency_p=activity_dependency_p,
            attribute_dependency_p=attribute_dependency_p,
            probability_variance_max=probability_variance_max,
            # int(None) raised a TypeError in the original; preserve None.
            seed=int(seed) if seed is not None else None)

        return event_log