Example no. 1
 def test_importExportCSVtoCSV(self):
     # dummy instance assignment to avoid static-method warnings:
     # unittest requires test methods to be instance methods
     self.dummy_variable = "dummy_value"
     df = pd.read_csv(os.path.join(INPUT_DATA_DIR, "running-example.csv"))
     df = dataframe_utils.convert_timestamp_columns_in_df(df)
     event_log = log_conversion.apply(
         df, variant=log_conversion.TO_EVENT_STREAM)
     event_log = sorting.sort_timestamp(event_log)
     event_log = sampling.sample(event_log)
     event_log = index_attribute.insert_event_index_as_event_attribute(
         event_log)
     log = log_conversion.apply(event_log)
     log = sorting.sort_timestamp(log)
     log = sampling.sample(log)
     log = index_attribute.insert_trace_index_as_event_attribute(log)
     event_log_transformed = log_conversion.apply(
         log, variant=log_conversion.TO_EVENT_STREAM)
     df = log_conversion.apply(event_log_transformed,
                               variant=log_conversion.TO_DATA_FRAME)
     df.to_csv(os.path.join(OUTPUT_DATA_DIR,
                            "running-example-exported.csv"))
     df = pd.read_csv(
         os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv"))
     df = dataframe_utils.convert_timestamp_columns_in_df(df)
     event_log_imported_after_export = log_conversion.apply(
         df, variant=log_conversion.TO_EVENT_STREAM)
     log_imported_after_export = log_conversion.apply(
         event_log_imported_after_export)
     self.assertEqual(len(log), len(log_imported_after_export))
     os.remove(os.path.join(OUTPUT_DATA_DIR,
                            "running-example-exported.csv"))
Example no. 2
 def test_applyAlphaMinerToCSV(self):
     # dummy instance assignment to avoid static-method warnings:
     # unittest requires test methods to be instance methods
     self.dummy_variable = "dummy_value"
     # mine two Petri nets from the same log and compare them, verifying
     # that separate miner instances behave consistently
     log1, net1, marking1, fmarking1 = self.obtainPetriNetThroughAlphaMiner(
         os.path.join(INPUT_DATA_DIR, "running-example.csv"))
     log2, net2, marking2, fmarking2 = self.obtainPetriNetThroughAlphaMiner(
         os.path.join(INPUT_DATA_DIR, "running-example.csv"))
     log1 = sorting.sort_timestamp(log1)
     log1 = sampling.sample(log1)
     log1 = index_attribute.insert_trace_index_as_event_attribute(log1)
     log2 = sorting.sort_timestamp(log2)
     log2 = sampling.sample(log2)
     log2 = index_attribute.insert_trace_index_as_event_attribute(log2)
     petri_exporter.export_net(
         net1, marking1,
         os.path.join(OUTPUT_DATA_DIR, "running-example.pnml"))
     os.remove(os.path.join(OUTPUT_DATA_DIR, "running-example.pnml"))
     self.assertEqual(len(net1.places), len(net2.places))
     self.assertEqual(len(net1.transitions), len(net2.transitions))
     self.assertEqual(len(net1.arcs), len(net2.arcs))
     final_marking = petri.petrinet.Marking()
     for p in net1.places:
         if not p.out_arcs:
             final_marking[p] = 1
     aligned_traces = token_replay.apply_log(log1, net1, marking1,
                                             final_marking)
     # tautological assertion: a smoke check that token replay completed
     self.assertEqual(aligned_traces, aligned_traces)
Example no. 3
 def test_importExportCSVtoCSV(self):
     # dummy instance assignment to avoid static-method warnings:
     # unittest requires test methods to be instance methods
     self.dummy_variable = "dummy_value"
     event_log = csv_importer.import_event_stream(
         os.path.join(INPUT_DATA_DIR, "running-example.csv"))
     event_log = sorting.sort_timestamp(event_log)
     event_log = sampling.sample(event_log)
     event_log = index_attribute.insert_event_index_as_event_attribute(
         event_log)
     log = log_conv_fact.apply(event_log)
     log = sorting.sort_timestamp(log)
     log = sampling.sample(log)
     log = index_attribute.insert_trace_index_as_event_attribute(log)
     event_log_transformed = log_conv_fact.apply(
         log, variant=log_conv_fact.TO_EVENT_STREAM)
     csv_exporter.export(
         event_log_transformed,
         os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv"))
     event_log_imported_after_export = csv_importer.import_event_stream(
         os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv"))
     log_imported_after_export = log_conv_fact.apply(
         event_log_imported_after_export)
     self.assertEqual(len(log), len(log_imported_after_export))
     os.remove(os.path.join(OUTPUT_DATA_DIR,
                            "running-example-exported.csv"))
Example no. 4
def __dotted_attribute_selection(log: Union[EventLog, pd.DataFrame],
                                 attributes):
    """
    Default attribute selection for the dotted chart

    Parameters
    -----------------
    log
        Event log
    attributes
        List of attributes (if None, a default selection is computed)

    Returns
    -----------------
    log
        Event log (converted and sorted when the default selection is computed)
    attributes
        List of attributes
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception(
            "the method can be applied only to a traditional event log!")

    if attributes is None:
        from pm4py.util import xes_constants
        from pm4py.objects.log.util import sorting
        from pm4py.objects.conversion.log import converter
        log = converter.apply(log, variant=converter.Variants.TO_EVENT_LOG)
        log = sorting.sort_timestamp(log, xes_constants.DEFAULT_TIMESTAMP_KEY)
        for index, trace in enumerate(log):
            trace.attributes["@@index"] = index
        attributes = ["time:timestamp", "case:@@index", "concept:name"]
    return log, attributes
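
When attributes is None, the helper converts the log, sorts it by timestamp, indexes the traces, and falls back to a timestamp/case-index/activity selection. A sketch of the default outcome (illustration only: the function is private to the dotted-chart module):

# Illustration only; callable under this name inside the defining module.
log, attributes = __dotted_attribute_selection(log, None)
print(attributes)  # ['time:timestamp', 'case:@@index', 'concept:name']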
Example no. 5
def apply(con,
          ref_type="Invoice",
          keep_first=True,
          min_extr_date="2020-01-01 00:00:00",
          gjahr="2020",
          enable_changes=True,
          enable_payments=True,
          allowed_act_doc_types=None,
          allowed_act_changes=None,
          mandt="800"):
    dataframe = o2c_1d_dataframe_extractor.apply(
        con,
        ref_type=ref_type,
        keep_first=keep_first,
        min_extr_date=min_extr_date,
        gjahr=gjahr,
        enable_changes=enable_changes,
        enable_payments=enable_payments,
        allowed_act_doc_types=allowed_act_doc_types,
        allowed_act_changes=allowed_act_changes,
        mandt=mandt)
    log = log_converter.apply(dataframe,
                              parameters={"stream_postprocessing": True})
    log = sorting.sort_timestamp(log, "time:timestamp")
    return log
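
The extractor expects an open database connection as its first argument; a hypothetical invocation (get_sap_connection is a placeholder for the caller-supplied connection factory):

# Hypothetical usage; `get_sap_connection` is a placeholder.
con = get_sap_connection()
log = apply(con, ref_type="Invoice", gjahr="2020",
            min_extr_date="2020-01-01 00:00:00")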
Example no. 6
 def test_importExportCSVtoXES(self):
     # dummy instance assignment to avoid static-method warnings:
     # unittest requires test methods to be instance methods
     self.dummy_variable = "dummy_value"
     event_log = csv_importer.import_event_stream(os.path.join(INPUT_DATA_DIR, "running-example.csv"))
     event_log = sorting.sort_timestamp(event_log)
     event_log = sampling.sample(event_log)
     event_log = index_attribute.insert_event_index_as_event_attribute(event_log)
     log = log_transform.transform_event_stream_to_event_log(event_log)
     log = sorting.sort_timestamp(log)
     log = sampling.sample(log)
     log = index_attribute.insert_trace_index_as_event_attribute(log)
     xes_exporter.export_log(log, os.path.join(OUTPUT_DATA_DIR, "running-example-exported.xes"))
     log_imported_after_export = xes_importer.import_log(
         os.path.join(OUTPUT_DATA_DIR, "running-example-exported.xes"))
     self.assertEqual(len(log), len(log_imported_after_export))
     os.remove(os.path.join(OUTPUT_DATA_DIR, "running-example-exported.xes"))
Example no. 7
    def testCSVConversion(self):
        dirname = os.path.dirname(__file__)
        rootDir = os.path.join(dirname, 'TestFiles', 'ImportExport')
        csvPath = os.path.join(dirname, 'Ressources/example.csv')

        fileCreator = FileUtility(rootDir)
        eventLog = fileCreator.getEventLogFromFile(csvPath)
        eventLog = sorting.sort_timestamp(eventLog)
        self.assertEqual(len(eventLog), 6)
Example no. 8
def apply(con, gjahr="1997", mandt="800", bukrs="1000", **ext_arg):
    dataframe = single_doc_transactions_dataframe.apply(con,
                                                        gjahr=gjahr,
                                                        bukrs=bukrs,
                                                        mandt=mandt)
    log = log_converter.apply(dataframe,
                              parameters={"stream_postprocessing": True})
    log = sorting.sort_timestamp(log, "time:timestamp")
    return log
Example no. 9
def execute_script():
    log = xes_importer.apply(os.path.join("..", "tests", "input_data", "receipt.xes"))
    log = sorting.sort_timestamp(log)
    net, im, fm = inductive_miner.apply(log)
    log1 = EventLog(log[:500])
    log2 = EventLog(log[len(log) - 500:])
    statistics = element_usage_comparison.compare_element_usage_two_logs(net, im, fm, log1, log2)
    gviz = pn_vis.apply(net, im, fm, variant=pn_vis.Variants.FREQUENCY, aggregated_statistics=statistics,
                        parameters={pn_vis.Variants.FREQUENCY.value.Parameters.FORMAT: "svg"})
    pn_vis.view(gviz)
Example no. 10
def uselog(loginput):
    log = xes_import_factory.apply(loginput)
    log = sorting.sort_timestamp(log)
    # print(log)
    dfg = dfg_factory.apply(log)
    dfg_gv = dfg_vis_fact.apply(dfg, log, parameters={"format": "svg"})
    this_data = dfg_to_g6.dfg_to_g6(dfg)

    # dfg_vis_fact.view(dfg_gv)
    return this_data
    # grouplist = get_groups(log)
Example no. 11
 def get_log_obj_type(self, objtype):
     columns = [x for x in self.exploded_dataframe.columns if x.startswith("event_")] + [objtype]
     dataframe = self.exploded_dataframe[columns].dropna(how="any", subset=[objtype])
     dataframe = succint_mdl_to_exploded_mdl.apply(dataframe)
     dataframe = dataframe.rename(columns={"event_activity": "concept:name", "event_timestamp": "time:timestamp",
                                           objtype: "case:concept:name"})
     stream = EventStream(dataframe.to_dict('records'))
     log = log_conv_factory.apply(stream)
     log = sorting.sort_timestamp(log, "time:timestamp")
     exported_log = base64.b64encode(xes_exporter.export_log_as_string(log)).decode("utf-8")
     return self.name + "_" + objtype, "xes", exported_log
Example no. 12
def apply(con,
          gjahr="1997",
          mandt="800",
          bukrs="1000",
          ref_type="Goods receipt"):
    dataframe = doc_flow_transactions_dataframe.apply(con,
                                                      gjahr=gjahr,
                                                      bukrs=bukrs,
                                                      mandt=mandt,
                                                      ref_type=ref_type)
    log = log_converter.apply(dataframe,
                              parameters={"stream_postprocessing": True})
    log = sorting.sort_timestamp(log, "time:timestamp")
    return log
Example no. 13
def apply(log: EventLog, parameters: Optional[Dict[str, Any]] = None) -> Tuple[List[datetime], np.ndarray]:
    """
    Analyse the evolution of the features over time using a locally linear embedding.

    Parameters
    -----------------
    log
        Event log
    parameters
        Variant-specific parameters, including:
        - Parameters.ACTIVITY_KEY => the activity key
        - Parameters.TIMESTAMP_KEY => the timestamp key
        - Parameters.CASE_ID_KEY => the case ID key

    Returns
    ----------------
    x
        Date attributes (starting points of the cases)
    y
        Deviation from the standard behavior (higher absolute values of y signal a higher deviation
        from the standard behavior)
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)

    if type(log) is pd.DataFrame:
        # keep only the needed columns
        case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
        log = log[[case_id_key, activity_key, timestamp_key]]

    log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)
    log = sorting.sort_timestamp(log, timestamp_key)

    x = [trace[0][timestamp_key] for trace in log]
    data, feature_names = log_to_features.apply(log, parameters={"str_ev_attr": [activity_key], "str_evsucc_attr": [activity_key]})

    lle = LocallyLinearEmbedding(n_components=1)
    data = lle.fit_transform(data)
    data = data.flatten()

    y = data
    smooth_amount = 1 + math.floor(math.sqrt(len(y)))
    y = smooth(y, smooth_amount)

    return x, y
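
The returned x (case start timestamps) and y (smoothed deviation scores) are ready to plot; a minimal visualization sketch, assuming matplotlib is installed:

# Minimal sketch: plot the deviation scores over time (assumes matplotlib).
import matplotlib.pyplot as plt

x, y = apply(log)  # log: a pm4py EventLog, as documented above
plt.plot(x, y)
plt.xlabel("case start timestamp")
plt.ylabel("deviation from standard behavior")
plt.show()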
Example no. 14
def execute_script():
    log = xes_importer.apply(
        os.path.join("..", "tests", "input_data", "receipt.xes"))
    log = sorting.sort_timestamp(log)
    net, im, fm = inductive_miner.apply(log)
    log1 = EventLog(log[:500])
    log2 = EventLog(log[len(log) - 500:])
    statistics = element_usage_comparison.compare_element_usage_two_logs(
        net, im, fm, log1, log2)
    gviz = pn_vis_factory.apply(net,
                                im,
                                fm,
                                variant="frequency",
                                aggregated_statistics=statistics,
                                parameters={"format": "svg"})
    pn_vis_factory.view(gviz)
Example no. 15
def apply(con,
          ref_type="EKKO",
          gjahr="2014",
          min_extr_date="2014-01-01 00:00:00",
          mandt="800",
          bukrs="1000",
          extra_els_query=None):
    dataframe = p2p_1d_dataframe.apply(con,
                                       gjahr=gjahr,
                                       ref_type=ref_type,
                                       min_extr_date=min_extr_date,
                                       mandt=mandt,
                                       bukrs=bukrs,
                                       extra_els_query=extra_els_query)
    log = log_converter.apply(dataframe,
                              parameters={"stream_postprocessing": True})
    print("converted dataframe")
    log = sorting.sort_timestamp(log, "time:timestamp")
    return log
Example no. 16
 def test_alphaMinerVisualizationFromXES(self):
     # dummy instance assignment to avoid static-method warnings:
     # unittest requires test methods to be instance methods
     self.dummy_variable = "dummy_value"
     log, net, marking, fmarking = self.obtainPetriNetThroughAlphaMiner(
         os.path.join(INPUT_DATA_DIR, "running-example.xes"))
     log = sorting.sort_timestamp(log)
     log = sampling.sample(log)
     log = index_attribute.insert_trace_index_as_event_attribute(log)
     petri_exporter.apply(net, marking, os.path.join(OUTPUT_DATA_DIR, "running-example.pnml"))
     os.remove(os.path.join(OUTPUT_DATA_DIR, "running-example.pnml"))
     gviz = pn_viz.graphviz_visualization(net)
     self.assertEqual(gviz, gviz)
     final_marking = petri.petrinet.Marking()
     for p in net.places:
         if not p.out_arcs:
             final_marking[p] = 1
     aligned_traces = token_replay.apply(log, net, marking, fmarking)
     self.assertEqual(aligned_traces, aligned_traces)
Example no. 17
def __dotted_attribute_selection(log, attributes):
    """
    Default attribute selection for the dotted chart

    Parameters
    -----------------
    log
        Event log
    attributes
        List of attributes (if None, a default selection is computed)

    Returns
    -----------------
    log
        Event log (converted and sorted when the default selection is computed)
    attributes
        List of attributes
    """
    if attributes is None:
        from pm4py.util import xes_constants
        from pm4py.objects.log.util import sorting
        from pm4py.convert import convert_to_event_log
        log = convert_to_event_log(log)
        log = sorting.sort_timestamp(log, xes_constants.DEFAULT_TIMESTAMP_KEY)
        for index, trace in enumerate(log):
            trace.attributes["@@index"] = index
        attributes = ["time:timestamp", "case:@@index", "concept:name"]
    return log, attributes
Example no. 18
def import_log_from_file_object(f,
                                encoding,
                                file_size=sys.maxsize,
                                parameters=None):
    """
    Import a log object from a (XML) file object

    Parameters
    -----------
    f
        file object
    encoding
        Encoding
    file_size
        Size of the file (measured on disk)
    parameters
        Parameters of the algorithm, including
            Parameters.TIMESTAMP_SORT -> Specify if we should sort log by timestamp
            Parameters.TIMESTAMP_KEY -> If sort is enabled, then sort the log by using this key
            Parameters.REVERSE_SORT -> Specify in which direction the log should be sorted
            Parameters.MAX_TRACES -> Specify the maximum number of traces to import from the log (read in order in the XML file)
            Parameters.MAX_BYTES -> Maximum number of bytes to read
            Parameters.SKIP_BYTES -> Number of bytes to skip
            Parameters.SET_ATTRIBUTES_TO_READ -> Names of the attributes to be parsed.
                                                 If not specified, all the attributes are parsed.

    Returns
    -----------
    log
        Event log
    """
    values_dict = {}
    date_parser = dt_parser.get()

    set_attributes_to_read = exec_utils.get_param_value(
        Parameters.SET_ATTRIBUTES_TO_READ, parameters, None)
    max_no_traces_to_import = exec_utils.get_param_value(
        Parameters.MAX_TRACES, parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT,
                                                parameters, False)
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT,
                                              parameters, False)

    skip_bytes = exec_utils.get_param_value(Parameters.SKIP_BYTES, parameters,
                                            0)
    max_bytes_to_read = exec_utils.get_param_value(Parameters.MAX_BYTES,
                                                   parameters, sys.maxsize)

    if file_size > max_bytes_to_read:
        skip_bytes = file_size - max_bytes_to_read

    log = EventLog()
    tracecount = 0
    trace = None
    event = None

    f.seek(skip_bytes)

    for line in f:
        content = line.decode(encoding).split("\"")
        if len(content) > 0:
            tag = content[0].split("<")[-1]
            if trace is not None:
                if event is not None:
                    if len(content) == 5:
                        key, value = read_attribute_key_value(
                            tag, content, date_parser, values_dict,
                            set_attributes_to_read)
                        if value is not None:
                            event[key] = value
                    elif tag.startswith("/event"):
                        trace.append(event)
                        event = None
                elif tag.startswith("event"):
                    event = Event()
                elif len(content) == 5:
                    key, value = read_attribute_key_value(
                        tag, content, date_parser, values_dict,
                        set_attributes_to_read)
                    if value is not None:
                        trace.attributes[key] = value
                elif tag.startswith("/trace"):
                    log.append(trace)
                    tracecount += 1
                    if tracecount >= max_no_traces_to_import:
                        break
                    trace = None
            elif tag.startswith("trace"):
                trace = Trace()

    if timestamp_sort:
        log = sorting.sort_timestamp(log,
                                     timestamp_key=timestamp_key,
                                     reverse_sort=reverse_sort)

    # sets the activity key as default classifier in the log's properties
    log.properties[
        constants.
        PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_constants.DEFAULT_NAME_KEY
    log.properties[
        constants.
        PARAMETER_CONSTANT_ATTRIBUTE_KEY] = xes_constants.DEFAULT_NAME_KEY
    # sets the default timestamp key
    log.properties[
        constants.
        PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_constants.DEFAULT_TIMESTAMP_KEY
    # sets the default resource key
    log.properties[
        constants.
        PARAMETER_CONSTANT_RESOURCE_KEY] = xes_constants.DEFAULT_RESOURCE_KEY
    # sets the default transition key
    log.properties[
        constants.
        PARAMETER_CONSTANT_TRANSITION_KEY] = xes_constants.DEFAULT_TRANSITION_KEY
    # sets the default group key
    log.properties[
        constants.
        PARAMETER_CONSTANT_GROUP_KEY] = xes_constants.DEFAULT_GROUP_KEY

    return log
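
Since the importer consumes a binary file object, a typical call opens the file itself and passes its on-disk size; a sketch (the file path is a placeholder):

# Sketch of a typical call; the function and Parameters enum are the ones
# defined above, and the path is hypothetical.
import os

path = "running-example.xes"
with open(path, "rb") as f:
    log = import_log_from_file_object(
        f, "utf-8", file_size=os.path.getsize(path),
        parameters={Parameters.TIMESTAMP_SORT: True})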
Example no. 19
def train(log, parameters=None):
    """
    Train the prediction model

    Parameters
    -----------
    log
        Event log
    parameters
        Possible parameters of the algorithm

    Returns
    ------------
    model
        Trained model
    """
    if parameters is None:
        parameters = {}

    parameters["enable_sort"] = False
    activity_key = parameters[
        constants.
        PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    timestamp_key = parameters[
        constants.
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else xes.DEFAULT_TIMESTAMP_KEY
    business_hours = parameters[
        "business_hours"] if "business_hours" in parameters else False
    worktiming = parameters["worktiming"] if "worktiming" in parameters else [
        7, 17
    ]
    weekends = parameters["weekends"] if "weekends" in parameters else [6, 7]

    y_orig = parameters["y_orig"] if "y_orig" in parameters else None

    log = sorting.sort_timestamp(log, timestamp_key)

    str_evsucc_attr = [activity_key]
    if "str_ev_attr" in parameters:
        str_tr_attr = parameters[
            "str_tr_attr"] if "str_tr_attr" in parameters else []
        str_ev_attr = parameters[
            "str_ev_attr"] if "str_ev_attr" in parameters else []
        num_tr_attr = parameters[
            "num_tr_attr"] if "num_tr_attr" in parameters else []
        num_ev_attr = parameters[
            "num_ev_attr"] if "num_ev_attr" in parameters else []
    else:
        str_tr_attr, str_ev_attr, num_tr_attr, num_ev_attr = attributes_filter.select_attributes_from_log_for_tree(
            log)
        if activity_key not in str_ev_attr:
            str_ev_attr.append(activity_key)

    max_trace_length = max(len(x) for x in log)

    if max_trace_length == 1:
        # every trace has a single event: use the log as-is (no prefixes)
        data, feature_names = get_log_representation.get_representation(
            log,
            str_tr_attr,
            str_ev_attr,
            num_tr_attr,
            num_ev_attr,
            str_evsucc_attr=str_evsucc_attr)
        ext_log = log
    else:
        ext_log, change_indexes = get_log_with_log_prefixes(log)
        data, feature_names = get_log_representation.get_representation(
            ext_log,
            str_tr_attr,
            str_ev_attr,
            num_tr_attr,
            num_ev_attr,
            str_evsucc_attr=str_evsucc_attr)

    if y_orig is not None:
        remaining_time = [y for x in y_orig for y in x]
    else:
        if business_hours:
            remaining_time = []
            for trace in ext_log:
                if trace:
                    timestamp_et = trace[-1][timestamp_key]
                    timestamp_st = trace[0][timestamp_key]

                    bh = BusinessHours(timestamp_st.replace(tzinfo=None),
                                       timestamp_et.replace(tzinfo=None),
                                       worktiming=worktiming,
                                       weekends=weekends)
                    remaining_time.append(bh.getseconds())
                else:
                    remaining_time.append(0)
        else:
            remaining_time = []
            for trace in ext_log:
                if trace:
                    remaining_time.append(
                        (trace[-1][timestamp_key] -
                         trace[0][timestamp_key]).total_seconds())
                else:
                    remaining_time.append(0)
    regr = ElasticNet(max_iter=10000, l1_ratio=0.7)
    print(data)
    regr.fit(data, remaining_time)

    return {
        "str_tr_attr": str_tr_attr,
        "str_ev_attr": str_ev_attr,
        "num_tr_attr": num_tr_attr,
        "num_ev_attr": num_ev_attr,
        "str_evsucc_attr": str_evsucc_attr,
        "feature_names": feature_names,
        "remaining_time": remaining_time,
        "regr": regr,
        "variant": "elasticnet"
    }
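
A sketch of training on an event log and inspecting the returned model dictionary (keys as defined in the return statement above):

# Hypothetical usage of train(); `log` is a pm4py EventLog.
model = train(log, parameters={"business_hours": True})
print(model["feature_names"][:10])  # a sample of the extracted features
regr = model["regr"]                # the fitted ElasticNet regressor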
Example no. 20
def import_log(filename, parameters=None):
    """
    Import a log object from an XML file
    containing the traces, the events and their simple attributes

    Parameters
    -----------
    filename
        XES file to parse
    parameters
        Parameters of the algorithm, including
            Parameters.TIMESTAMP_SORT -> Specify if we should sort log by timestamp
            Parameters.TIMESTAMP_KEY -> If sort is enabled, then sort the log by using this key
            Parameters.REVERSE_SORT -> Specify in which direction the log should be sorted
            Parameters.INSERT_TRACE_INDICES -> Specify if trace indexes should be added as event attribute for each event
            Parameters.MAX_TRACES -> Specify the maximum number of traces to import from the log (read in order in the XML file)
            Parameters.MAX_BYTES -> Maximum number of bytes to read
            Parameters.SKYP_BYTES -> Number of bytes to skip


    Returns
    -----------
    log
        Event log
    """
    if parameters is None:
        parameters = {}

    date_parser = dt_parser.get()
    timestamp_sort = param_util.fetch(Parameters.TIMESTAMP_SORT, parameters)
    timestamp_key = param_util.fetch(Parameters.TIMESTAMP_KEY, parameters)
    reverse_sort = param_util.fetch(Parameters.REVERSE_SORT, parameters)
    insert_trace_indexes = param_util.fetch(Parameters.INSERT_TRACE_INDICES, parameters)
    max_no_traces_to_import = param_util.fetch(Parameters.MAX_TRACES, parameters)
    skip_bytes = param_util.fetch(Parameters.SKYP_BYTES, parameters)
    max_bytes_to_read = param_util.fetch(Parameters.MAX_BYTES, parameters)

    file_size = os.stat(filename).st_size

    if file_size > max_bytes_to_read:
        skip_bytes = file_size - max_bytes_to_read

    log = EventLog()
    tracecount = 0
    trace = None
    event = None

    f = open(filename, "r")
    f.seek(skip_bytes)

    for line in f:
        content = line.split("\"")
        if len(content) > 0:
            tag = content[0].split("<")[-1]
            if trace is not None:
                if event is not None:
                    if len(content) == 5:
                        if tag.startswith("string"):
                            event[content[1]] = content[3]
                        elif tag.startswith("date"):
                            event[content[1]] = date_parser.apply(content[3])
                        elif tag.startswith("int"):
                            event[content[1]] = int(content[3])
                        elif tag.startswith("float"):
                            event[content[1]] = float(content[3])
                        else:
                            event[content[1]] = content[3]
                    elif tag.startswith("/event"):
                        trace.append(event)
                        event = None
                elif tag.startswith("event"):
                    event = Event()
                elif len(content) == 5:
                    if tag.startswith("string"):
                        trace.attributes[content[1]] = content[3]
                    elif tag.startswith("date"):
                        trace.attributes[content[1]] = date_parser.apply(content[3])
                    elif tag.startswith("int"):
                        trace.attributes[content[1]] = int(content[3])
                    elif tag.startswith("float"):
                        trace.attributes[content[1]] = float(content[3])
                    else:
                        trace.attributes[content[1]] = content[3]
                elif tag.startswith("/trace"):
                    log.append(trace)
                    tracecount += 1
                    if tracecount >= max_no_traces_to_import:
                        break
                    trace = None
            elif tag.startswith("trace"):
                trace = Trace()
    f.close()

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key, reverse_sort=reverse_sort)
    if insert_trace_indexes:
        log.insert_trace_index_as_event_attribute()

    return log
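
A sketch of a call with timestamp sorting enabled, assuming param_util.fetch resolves entries keyed by the Parameters enum:

# Hypothetical usage; assumes the parameters dict is keyed by the enum.
log = import_log("running-example.xes",
                 parameters={Parameters.TIMESTAMP_SORT: True})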
Example no. 21
def import_log(filename, parameters=None):
    """
    Import a log object from an XML file
    containing the traces, the events and their simple attributes

    Parameters
    -----------
    filename
        XES file to parse
    parameters
        Parameters of the algorithm, including
            timestamp_sort -> Specify if we should sort log by timestamp
            timestamp_key -> If sort is enabled, then sort the log by using this key
            reverse_sort -> Specify in which direction the log should be sorted
            insert_trace_indexes -> Specify if trace indexes should be added as event attribute for each event
            max_no_traces_to_import -> Specify the maximum number of traces to import from the log
            (read in order in the XML file)

    Returns
    -----------
    log
        Event log
    """
    if parameters is None:
        parameters = {}

    timestamp_sort = False
    timestamp_key = "time:timestamp"
    reverse_sort = False
    insert_trace_indexes = False
    max_no_traces_to_import = 1000000000
    skip_bytes = 0
    max_bytes_to_read = 100000000000

    if "timestamp_sort" in parameters:
        timestamp_sort = parameters["timestamp_sort"]
    if "timestamp_key" in parameters:
        timestamp_key = parameters["timestamp_key"]
    if "reverse_sort" in parameters:
        reverse_sort = parameters["reverse_sort"]
    if "insert_trace_indexes" in parameters:
        insert_trace_indexes = parameters["insert_trace_indexes"]
    if "max_no_traces_to_import" in parameters:
        max_no_traces_to_import = parameters["max_no_traces_to_import"]
    if "max_bytes_to_read" in parameters:
        max_bytes_to_read = parameters["max_bytes_to_read"]

    file_size = os.stat(filename).st_size

    if file_size > max_bytes_to_read:
        skip_bytes = file_size - max_bytes_to_read

    log = log_lib.log.EventLog()
    tracecount = 0
    trace = None
    event = None

    f = open(filename, "r")
    f.seek(skip_bytes)

    for line in f:
        content = line.split("\"")
        if len(content) > 0:
            tag = content[0].split("<")[-1]
            if trace is not None:
                if event is not None:
                    if len(content) == 5:
                        if tag.startswith("string"):
                            event[content[1]] = content[3]
                        elif tag.startswith("date"):
                            event[content[1]] = ciso8601.parse_datetime(
                                content[3])
                        elif tag.startswith("int"):
                            event[content[1]] = int(content[3])
                        elif tag.startswith("float"):
                            event[content[1]] = float(content[3])
                        else:
                            event[content[1]] = content[3]
                    elif tag.startswith("/event"):
                        trace.append(event)
                        event = None
                elif tag.startswith("event"):
                    event = log_lib.log.Event()
                elif len(content) == 5:
                    if tag.startswith("string"):
                        trace.attributes[content[1]] = content[3]
                    elif tag.startswith("date"):
                        trace.attributes[content[1]] = ciso8601.parse_datetime(
                            content[3])
                    elif tag.startswith("int"):
                        trace.attributes[content[1]] = int(content[3])
                    elif tag.startswith("float"):
                        trace.attributes[content[1]] = float(content[3])
                    else:
                        trace.attributes[content[1]] = content[3]
                elif tag.startswith("/trace"):
                    log.append(trace)
                    tracecount += 1
                    if tracecount >= max_no_traces_to_import:
                        break
                    trace = None
            elif tag.startswith("trace"):
                trace = log_lib.log.Trace()
    f.close()

    if timestamp_sort:
        log = sorting.sort_timestamp(log,
                                     timestamp_key=timestamp_key,
                                     reverse_sort=reverse_sort)
    if insert_trace_indexes:
        log.insert_trace_index_as_event_attribute()

    return log
Example no. 22
    def calc_FCB_anonymity(self,
                           log_name1,
                           log_name2,
                           event_attributes,
                           life_cycle,
                           all_life_cycle,
                           sensitive,
                           time_accuracy,
                           n,
                           bk_length,
                           result_log_name="",
                           results_dir="",
                           from_time_days=0,
                           to_time_days=0,
                           multiprocess=True):

        log1 = xes_importer_factory.apply(log_name1)
        log2 = xes_importer_factory.apply(log_name2)
        log1 = sorting.sort_timestamp(log1)
        log2 = sorting.sort_timestamp(log2)
        utils = Utils()

        simple_log, traces, sensitive_values, df = utils.create_simple_log_adv(
            log1, event_attributes, life_cycle, all_life_cycle, sensitive,
            time_accuracy, from_time_days, to_time_days)
        # new_event_log = utils.createEventLog(log,simple_log,event_attributes,life_cycle,all_life_cycle, sensitive,time_accuracy)
        # xes_exporter.export_log(new_event_log, "EL1.xes")

        simple_log2, traces2, sensitive_values2, df2 = utils.create_simple_log_adv(
            log2, event_attributes, life_cycle, all_life_cycle, sensitive,
            time_accuracy, from_time_days, to_time_days)
        # new_event_log = utils.createEventLog(log,simple_log2,event_attributes,life_cycle,all_life_cycle, sensitive,time_accuracy)
        # xes_exporter.export_log(new_event_log, "EL2.xes")

        activities1 = utils.get_unique_act(traces)
        activities2 = utils.get_unique_act(traces2)

        uniq_activities = activities2.union(activities1)

        map_dict_act_chr, map_dict_chr_act, uniq_char = utils.map_act_char(
            uniq_activities)

        simple_log_char = utils.convert_simple_log_act_to_char(
            simple_log, map_dict_act_chr)
        df_char = utils.convert_lof_dataframe_act_to_char(df, map_dict_act_chr)

        simple_log_char2 = utils.convert_simple_log_act_to_char(
            simple_log2, map_dict_act_chr)
        df_char2 = utils.convert_lof_dataframe_act_to_char(
            df2, map_dict_act_chr)

        df_char.replace(
            np.nan, '--', inplace=True
        )  # this will consider nan values as a sensitive attribute!
        df_char2.replace(
            np.nan, '--', inplace=True
        )  # this will consider nan values as a sensitive attribute!

        # utils.add_fake_activities(uniq_char,map_dict_act_chr, map_dict_chr_act, bk_length)

        bk_candidate_iter = itertools.product(uniq_char, repeat=bk_length)
        bk_candidate = list(bk_candidate_iter)

        result_file = ""
        if results_dir != "" and result_log_name != "":
            if not os.path.exists(results_dir):
                os.makedirs(results_dir)
            file_name = "Result_" + result_log_name + "_bk_length_" + str(
                bk_length) + "_n_" + str(n) + ".xlsx"
            result_file = os.path.join(results_dir, file_name)

        columns = ['bk', 'R1-K', 'R2-K', 'FA', 'CA', 'BA']
        df_result = pd.DataFrame(columns=columns)
        FA_list = []
        BA_list = []
        CA_list = []
        R1_KA_list = []
        R2_KA_list = []
        results = []

        if multiprocess:
            pool = mp.Pool()
            workers = []
            workers_number = os.cpu_count()
            data_chunks = self.chunkIt(bk_candidate, workers_number)
            for worker in range(workers_number):
                print("In worker %d out of %d" % (worker + 1, workers_number))
                workers.append(
                    pool.apply_async(
                        self.FCB_anonymity_worker,
                        args=(data_chunks[worker], df_char, df_char2, n,
                              sensitive, map_dict_act_chr, map_dict_chr_act)))
            for work in workers:
                results.append(work.get())
            pool.close()
            pool.join()

        else:
            result = self.FCB_anonymity(bk_candidate, df_char, df_char2, n,
                                        sensitive, map_dict_act_chr,
                                        map_dict_chr_act)
            results.append(result)

        for result in results:
            for key, value in result.items():
                if key == 'df_result':
                    df_result = pd.concat([df_result, value], sort=False)
                elif key == 'FA':
                    FA_list.append(value)
                elif key == 'CA':
                    CA_list.append(value)
                elif key == 'BA':
                    BA_list.append(value)
                elif key == 'R1_KA':
                    R1_KA_list.append(value)
                elif key == 'R2_KA':
                    R2_KA_list.append(value)
        FA = min(FA_list)
        CA = min(CA_list)
        BA = min(BA_list)
        R1_KA = min(R1_KA_list)
        R2_KA = min(R2_KA_list)

        df_result_last_row = {
            'bk': "Event Log",
            'R1-K': R1_KA,
            'R2-K': R2_KA,
            'FA': FA,
            'CA': CA,
            'BA': BA
        }
        df_result = pd.concat([df_result, pd.DataFrame([df_result_last_row])],
                              ignore_index=True)

        if result_file != "":
            # ExcelWriter.save() was removed in recent pandas; the context
            # manager saves and closes the file instead
            with ExcelWriter(result_file) as writer:
                df_result.to_excel(
                    writer,
                    sheet_name='bk_length_' + str(bk_length) + "-n_" + str(n))

        last_line = "Result for Event Log, R1-KA:%d, R2-KA:%d, FA:%d, CA:%d, BA:%d" % (
            R1_KA, R2_KA, FA, CA, BA)
        print(last_line)

        return R1_KA, R2_KA, FA, CA, BA
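
A hypothetical invocation of the method above; the log paths, attribute names, and sensitive attributes are all placeholders, and analyzer stands for an instance of the enclosing class:

# Hypothetical invocation; every argument shown is a placeholder.
r1_ka, r2_ka, fa, ca, ba = analyzer.calc_FCB_anonymity(
    "log_before.xes", "log_after.xes",
    event_attributes=["concept:name"], life_cycle=["complete"],
    all_life_cycle=False, sensitive=["disease"], time_accuracy="hours",
    n=1, bk_length=3, multiprocess=False)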
Example no. 23
def import_from_context(context, num_traces, parameters=None):
    """
    Import a XES log from an iterparse context

    Parameters
    --------------
    context
        Iterparse context
    num_traces
        Number of traces of the XES log
    parameters
        Parameters of the algorithm

    Returns
    --------------
    log
        Event log
    """
    if parameters is None:
        parameters = {}

    max_no_traces_to_import = exec_utils.get_param_value(Parameters.MAX_TRACES, parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT, parameters, False)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT, parameters, False)
    show_progress_bar = exec_utils.get_param_value(Parameters.SHOW_PROGRESS_BAR, parameters, True)

    date_parser = dt_parser.get()
    progress = None
    if pkgutil.find_loader("tqdm") and show_progress_bar:
        from tqdm.auto import tqdm
        progress = tqdm(total=num_traces, desc="parsing log, completed traces :: ")

    log = None
    trace = None
    event = None

    tree = {}

    for tree_event, elem in context:
        if tree_event == _EVENT_START:  # starting to read
            parent = tree[elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_DATE):
                try:
                    dt = date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), dt, tree)
                except TypeError:
                    logging.info("failed to parse date: " + str(elem.get(xes_constants.KEY_VALUE)))
                except ValueError:
                    logging.info("failed to parse date: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_EVENT):
                if event is not None:
                    raise SyntaxError('file contains <event> in another <event> tag')
                event = Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError('file contains <trace> in another <trace> tag')
                trace = Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse float: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse int: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(xes_constants.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse boolean: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_LIST):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), None, tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_EXTENSION):
                if log is None:
                    raise SyntaxError('extension found outside of <log> tag')
                if elem.get(xes_constants.KEY_NAME) is not None and elem.get(
                        xes_constants.KEY_PREFIX) is not None and elem.get(xes_constants.KEY_URI) is not None:
                    log.extensions[elem.get(xes_constants.KEY_NAME)] = {
                        xes_constants.KEY_PREFIX: elem.get(xes_constants.KEY_PREFIX),
                        xes_constants.KEY_URI: elem.get(xes_constants.KEY_URI)}
                continue

            elif elem.tag.endswith(xes_constants.TAG_GLOBAL):
                if log is None:
                    raise SyntaxError('global found outside of <log> tag')
                if elem.get(xes_constants.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(xes_constants.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(xes_constants.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(xes_constants.TAG_CLASSIFIER):
                if log is None:
                    raise SyntaxError('classifier found outside of <log> tag')
                if elem.get(xes_constants.KEY_KEYS) is not None:
                    classifier_value = elem.get(xes_constants.KEY_KEYS)
                    if "'" in classifier_value:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = [x for x in classifier_value.split("'")
                                                                             if x.strip()]
                    else:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = classifier_value.split()
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                if log is not None:
                    raise SyntaxError('file contains > 1 <log> tags')
                log = EventLog()
                tree[elem] = log.attributes
                continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                log.append(trace)

                if progress is not None:
                    progress.update()

                trace = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                continue

    # gracefully close progress bar
    if progress is not None:
        progress.close()
    del context, progress

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key, reverse_sort=reverse_sort)

    # sets the activity key as default classifier in the log's properties
    log.properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_constants.DEFAULT_NAME_KEY
    log.properties[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = xes_constants.DEFAULT_NAME_KEY
    # sets the default timestamp key
    log.properties[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_constants.DEFAULT_TIMESTAMP_KEY
    # sets the default resource key
    log.properties[constants.PARAMETER_CONSTANT_RESOURCE_KEY] = xes_constants.DEFAULT_RESOURCE_KEY
    # sets the default transition key
    log.properties[constants.PARAMETER_CONSTANT_TRANSITION_KEY] = xes_constants.DEFAULT_TRANSITION_KEY
    # sets the default group key
    log.properties[constants.PARAMETER_CONSTANT_GROUP_KEY] = xes_constants.DEFAULT_GROUP_KEY

    return log
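
The context argument is typically an lxml iterparse over the XES file, built exactly as in Example no. 24 below; a sketch:

# Sketch: count the traces up front, then build the iterparse context;
# _EVENT_START and _EVENT_END are the lxml event names used above.
from lxml import etree

num_traces = sum(1 for _ in etree.iterparse(
    filename, events=[_EVENT_START], tag="{*}trace"))
context = etree.iterparse(filename, events=[_EVENT_START, _EVENT_END])
log = import_from_context(context, num_traces)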
Example no. 24
def import_log(filename, parameters=None):
    """
    Imports an XES file into a log object

    Parameters
    ----------
    filename:
        Absolute filename
    parameters
        Parameters of the algorithm, including
            Parameters.TIMESTAMP_SORT -> Specify if we should sort log by timestamp
            Parameters.TIMESTAMP_KEY -> If sort is enabled, then sort the log by using this key
            Parameters.REVERSE_SORT -> Specify in which direction the log should be sorted
            Parameters.INSERT_TRACE_INDICES -> Specify if trace indexes should be added as event attribute for each event
            Parameters.MAX_TRACES -> Specify the maximum number of traces to import from the log (read in order in the XML file)

    Returns
    -------
    log : :class:`pm4py.log.log.EventLog`
        A log
    """

    parameters = dict() if parameters is None else parameters

    insert_trace_indexes = param_util.fetch(Parameters.INSERT_TRACE_INDICES,
                                            parameters)
    max_no_traces_to_import = param_util.fetch(Parameters.MAX_TRACES,
                                               parameters)

    date_parser = dt_parser.get()
    context = etree.iterparse(filename, events=[_EVENT_START, _EVENT_END])

    # checking whether the log declares a namespace before looking for traces
    # might be more effort than it is worth; one can assume logs use the
    # standard namespace described by XES and match trace starts with
    # tag="{http://www.xes-standard.org}trace", or use the {*} syntax to
    # match a trace element in any namespace

    # count the number of traces and set up the progress bar
    no_trace = sum(1 for _ in etree.iterparse(
        filename, events=[_EVENT_START], tag="{*}trace"))

    # tqdm is optional: use it only if it is installed
    progress = None
    if pkgutil.find_loader("tqdm"):
        from tqdm.auto import tqdm
        progress = tqdm(total=no_trace,
                        desc="parsing log, completed traces :: ")

    log = None
    trace = None
    event = None

    tree = {}
    for tree_event, elem in context:
        if tree_event == _EVENT_START:  # starting to read
            parent = tree[
                elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE),
                                             tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_DATE):
                try:
                    dt = date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             dt, tree)
                except TypeError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                except ValueError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_EVENT):
                if event is not None:
                    raise SyntaxError(
                        'file contains <event> in another <event> tag')
                event = Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError(
                        'file contains <trace> in another <trace> tag')
                trace = Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree)
                    except ValueError:
                        logging.info("failed to parse float: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree)
                    except ValueError:
                        logging.info("failed to parse int: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(xes_constants.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree)
                    except ValueError:
                        logging.info("failed to parse boolean: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_LIST):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             None, tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE),
                                             tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_EXTENSION):
                if log is None:
                    raise SyntaxError('extension found outside of <log> tag')
                if elem.get(xes_constants.KEY_NAME) is not None and elem.get(
                        xes_constants.KEY_PREFIX) is not None and elem.get(
                            xes_constants.KEY_URI) is not None:
                    log.extensions[elem.get(xes_constants.KEY_NAME)] = {
                        xes_constants.KEY_PREFIX:
                        elem.get(xes_constants.KEY_PREFIX),
                        xes_constants.KEY_URI:
                        elem.get(xes_constants.KEY_URI)
                    }
                continue

            elif elem.tag.endswith(xes_constants.TAG_GLOBAL):
                if log is None:
                    raise SyntaxError('global found outside of <log> tag')
                if elem.get(xes_constants.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(xes_constants.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(
                        xes_constants.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(xes_constants.TAG_CLASSIFIER):
                if log is None:
                    raise SyntaxError('classifier found outside of <log> tag')
                if elem.get(xes_constants.KEY_KEYS) is not None:
                    classifier_value = elem.get(xes_constants.KEY_KEYS)
                    if "'" in classifier_value:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = [
                            x for x in classifier_value.split("'")
                            if x.strip()
                        ]
                    else:
                        log.classifiers[elem.get(xes_constants.KEY_NAME
                                                 )] = classifier_value.split()
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                if log is not None:
                    raise SyntaxError('file contains > 1 <log> tags')
                log = EventLog()
                tree[elem] = log.attributes
                continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                log.append(trace)

                # update progress bar as we have a completed trace
                if progress is not None:
                    progress.update()

                trace = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                continue

    # gracefully close progress bar
    if progress is not None:
        progress.close()
    del context, progress

    if Parameters.TIMESTAMP_SORT in parameters and parameters[
            Parameters.TIMESTAMP_SORT]:
        log = sorting.sort_timestamp(
            log,
            timestamp_key=param_util.fetch(Parameters.TIMESTAMP_KEY,
                                           parameters),
            reverse_sort=param_util.fetch(Parameters.REVERSE_SORT, parameters))
    if insert_trace_indexes:
        # the flag refers to trace indexes, so insert the trace index
        # (insert_event_index_as_event_attribute targets event streams)
        log = index_attribute.insert_trace_index_as_event_attribute(log)

    return log
Example no. 25
def import_data(directory,
                file_name,
                separator=";",
                quote=None,
                case_id="concept:name",
                activity="activity",
                time_stamp="time:timestamp",
                target="label",
                num_cases=None):
    """
    Loads data from a file and returns an XLog/pm4py log object.
    Expects xes file with standard attributes and the target variable named "event: Label".
    Expects csv file with attributes "case_id", "activity", "timestamp" and "label".
    :param directory: name of path [str].
    :param file_name: name of file [str].
    :param separator: separator for csv file [char].
    :param quote: boolean flag [bool].
    :param case_id: identifier for cases [str].
    :param activity: identifier for activities [str].
    :param time_stamp: identifier for time stamps [str].
    :param target: identifier for target [str].
    :param num_cases: boolean flag [bool].
    :return: event log [EventLog].
    """

    extension = os.path.splitext(file_name)[1]
    print(os.getcwd())
    if extension == '.csv':
        data_dir = os.path.join(directory, file_name)

        # Specify column names
        CASEID_GLUE = case_id
        ACTIVITY_KEY = activity
        TIMEST_KEY = time_stamp

        parameters = {
            constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
            constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY,
            constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: TIMEST_KEY,
            'sep': separator,
            'quotechar': quote,
            'timest_columns': TIMEST_KEY
        }

        # Load pm4py event stream
        event_stream = csv_importer.import_event_stream(data_dir,
                                                        parameters=parameters)

        # Transform event stream to log object
        log = conversion_factory.apply(event_stream, parameters=parameters)

        # Sort log by time_stamp
        log = sorting.sort_timestamp(log, timestamp_key=TIMEST_KEY)

        # Rename to xes standard
        for trace in log:
            for event in trace:
                event["caseid"] = event[case_id]
                event["concept:name"] = event[activity]
                event["time:timestamp"] = event[time_stamp]
                event["label"] = event[target]

    elif extension == '.xes':
        data_dir = os.path.join(directory, file_name)
        log = xes_import_factory.apply(data_dir)
        print(log)
        for trace in log:
            for event in trace:
                # copy the event-level target onto the trace attributes;
                # trace["label"] = ... would raise TypeError on a pm4py Trace
                trace.attributes["label"] = event[target]
    else:
        raise TypeError('File type not supported.')

    # Filter out cases where the label is not set (i.e. is NaN); the helper
    # call is left commented out:
    # util.apply(log)
    # Limit the number of cases in the event log if requested
    if num_cases is not None:
        log = log[:num_cases]
    print("Event log loaded")

    return log
Example no. 26
def average_duration_activity(
        log: EventLog,
        t1: Union[datetime, str],
        t2: Union[datetime, str],
        r: str,
        a: str,
        parameters: Optional[Dict[str, Any]] = None) -> float:
    """
    The average duration of instances of a given activity completed during a given time slot by a given resource.

    Metric RBI 4.3 in Pika, Anastasiia, et al.
    "Mining resource profiles from event logs." ACM Transactions on Management Information Systems (TMIS) 8.1 (2017): 1-30.

    Parameters
    -----------------
    log
        Event log
    t1
        Left endpoint of the time slot
    t2
        Right endpoint of the time slot
    r
        Resource
    a
        Activity

    Returns
    ----------------
    metric
        Value of the metric
    """
    if parameters is None:
        parameters = {}

    t1 = get_dt_from_string(t1)
    t2 = get_dt_from_string(t2)

    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    resource_key = exec_utils.get_param_value(
        Parameters.RESOURCE_KEY, parameters,
        xes_constants.DEFAULT_RESOURCE_KEY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters, None)

    from pm4py.objects.log.util import sorting
    log = sorting.sort_timestamp(log, timestamp_key)
    from pm4py.objects.log.util import interval_lifecycle
    log = interval_lifecycle.to_interval(log, parameters=parameters)
    if start_timestamp_key is None:
        log = __insert_start_from_previous_event(log, parameters=parameters)
        start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY

    log = converter.apply(log, variant=converter.Variants.TO_EVENT_STREAM)
    log = [
        x for x in log if x[resource_key] == r and x[activity_key] == a
        and x[timestamp_key] >= t1 and x[timestamp_key] < t2
    ]

    return float(
        mean(x[timestamp_key].timestamp() - x[start_timestamp_key].timestamp()
             for x in log))
Example no. 27
def __compute_workload(
        log: EventLog,
        resource: Optional[str] = None,
        activity: Optional[str] = None,
        parameters: Optional[Dict[str, Any]] = None) -> Dict[Tuple, int]:
    """
    Computes the workload of resources/activities, associating each event
    with the number of concurrently executed events

    Parameters
    ---------------
    log
        event log
    resource
        (if provided) Resource on which we want to compute the workload
    activity
        (if provided) Activity on which we want to compute the workload

    Returns
    ---------------
    workload_dict
        Dictionary associating each event with the number of concurrent events
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    resource_key = exec_utils.get_param_value(
        Parameters.RESOURCE_KEY, parameters,
        xes_constants.DEFAULT_RESOURCE_KEY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters, None)

    from pm4py.objects.log.util import sorting
    log = sorting.sort_timestamp(log, timestamp_key)
    from pm4py.objects.log.util import interval_lifecycle
    log = interval_lifecycle.to_interval(log, parameters=parameters)
    if start_timestamp_key is None:
        log = __insert_start_from_previous_event(log, parameters=parameters)
        start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY
    events = converter.apply(log, variant=converter.Variants.TO_EVENT_STREAM)
    if resource is not None:
        events = [x for x in events if x[resource_key] == resource]
    if activity is not None:
        events = [x for x in events if x[activity_key] == activity]
    events = [(x[start_timestamp_key].timestamp(),
               x[timestamp_key].timestamp(), x[resource_key], x[activity_key])
              for x in events]
    events = sorted(events)
    from intervaltree import IntervalTree, Interval
    tree = IntervalTree()
    ev_map = {}
    # small epsilon so that zero-length events still form valid intervals
    k = 0.000001
    for ev in events:
        tree.add(Interval(ev[0], ev[1] + k))
    for ev in events:
        # workload of an event = number of intervals overlapping its span
        ev_map[ev] = len(tree[ev[0]:ev[1] + k])
    return ev_map
Example no. 28
def import_log(filename, parameters=None):
    """
    Imports an XES file into a log_skeleton object

    Parameters
    ----------
    filename:
        Absolute filename
    parameters
        Parameters of the algorithm, including
            Parameters.TIMESTAMP_SORT -> Specify if we should sort log_skeleton by timestamp
            Parameters.TIMESTAMP_KEY -> If sort is enabled, then sort the log_skeleton by using this key
            Parameters.REVERSE_SORT -> Specify in which direction the log_skeleton should be sorted
            Parameters.MAX_TRACES -> Specify the maximum number of traces to import from the log_skeleton (read in order in the XML file)

    Returns
    -------
    log_skeleton : :class:`pm4py.log_skeleton.log_skeleton.EventLog`
        A log_skeleton
    """
    from lxml import etree

    if parameters is None:
        parameters = {}

    max_no_traces_to_import = exec_utils.get_param_value(
        Parameters.MAX_TRACES, parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT,
                                                parameters, False)
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT,
                                              parameters, False)

    date_parser = dt_parser.get()

    # count number of traces and setup progress bar
    no_trace = count_traces(filename)

    context = etree.iterparse(filename, events=[_EVENT_START, _EVENT_END])

    # make tqdm optional
    progress = None
    if pkgutil.find_loader("tqdm"):
        from tqdm.auto import tqdm
        progress = tqdm(total=no_trace,
                        desc="parsing log, completed traces :: ")

    log = None
    trace = None
    event = None

    tree = {}
    for tree_event, elem in context:
        if tree_event == _EVENT_START:  # starting to read
            parent = tree[
                elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE),
                                             tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_DATE):
                try:
                    dt = date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             dt, tree)
                except TypeError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                except ValueError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_EVENT):
                if event is not None:
                    raise SyntaxError(
                        'file contains <event> in another <event> tag')
                event = Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError(
                        'file contains <trace> in another <trace> tag')
                trace = Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree)
                    except ValueError:
                        logging.info("failed to parse float: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree)
                    except ValueError:
                        logging.info("failed to parse int: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(xes_constants.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree)
                    except ValueError:
                        logging.info("failed to parse boolean: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_LIST):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             None, tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE),
                                             tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_EXTENSION):
                if log is None:
                    raise SyntaxError('extension found outside of <log> tag')
                if elem.get(xes_constants.KEY_NAME) is not None and elem.get(
                        xes_constants.KEY_PREFIX) is not None and elem.get(
                            xes_constants.KEY_URI) is not None:
                    log.extensions[elem.get(xes_constants.KEY_NAME)] = {
                        xes_constants.KEY_PREFIX:
                        elem.get(xes_constants.KEY_PREFIX),
                        xes_constants.KEY_URI:
                        elem.get(xes_constants.KEY_URI)
                    }
                continue

            elif elem.tag.endswith(xes_constants.TAG_GLOBAL):
                if log is None:
                    raise SyntaxError('global found outside of <log> tag')
                if elem.get(xes_constants.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(xes_constants.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(
                        xes_constants.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(xes_constants.TAG_CLASSIFIER):
                if log is None:
                    raise SyntaxError('classifier found outside of <log> tag')
                if elem.get(xes_constants.KEY_KEYS) is not None:
                    classifier_value = elem.get(xes_constants.KEY_KEYS)
                    if "'" in classifier_value:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = [
                            x for x in classifier_value.split("'")
                            if x.strip()
                        ]
                    else:
                        log.classifiers[elem.get(xes_constants.KEY_NAME
                                                 )] = classifier_value.split()
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                if log is not None:
                    raise SyntaxError('file contains > 1 <log> tags')
                log = EventLog()
                tree[elem] = log.attributes
                continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                log.append(trace)

                # update progress bar as we have a completed trace
                if progress is not None:
                    progress.update()

                trace = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                continue

    # gracefully close progress bar
    if progress is not None:
        progress.close()
    del context, progress

    if timestamp_sort:
        log = sorting.sort_timestamp(log,
                                     timestamp_key=timestamp_key,
                                     reverse_sort=reverse_sort)

    return log
Example no. 29
def import_log(filename, parameters=None):
    """
    Imports an XES file into a log object

    Parameters
    ----------
    filename:
        Absolute filename
    parameters
        Parameters of the algorithm, including
            timestamp_sort -> Specify if we should sort log by timestamp
            timestamp_key -> If sort is enabled, then sort the log by using this key
            reverse_sort -> Specify in which direction the log should be sorted
            insert_trace_indexes -> Specify if trace indexes should be added as event attribute for each event
            max_no_traces_to_import -> Specify the maximum number of traces to import from the log
            (read in order in the XML file)

    Returns
    -------
    log : :class:`pm4py.log.log.EventLog`
        A log
    """

    if parameters is None:
        parameters = {}

    timestamp_sort = False
    timestamp_key = "time:timestamp"
    reverse_sort = False
    insert_trace_indexes = False
    max_no_traces_to_import = 1000000000

    if "timestamp_sort" in parameters:
        timestamp_sort = parameters["timestamp_sort"]
    if "timestamp_key" in parameters:
        timestamp_key = parameters["timestamp_key"]
    if "reverse_sort" in parameters:
        reverse_sort = parameters["reverse_sort"]
    if "insert_trace_indexes" in parameters:
        insert_trace_indexes = parameters["insert_trace_indexes"]
    if "max_no_traces_to_import" in parameters:
        max_no_traces_to_import = parameters["max_no_traces_to_import"]

    context = etree.iterparse(filename, events=['start', 'end'])

    log = None
    trace = None
    event = None

    tree = {}

    for tree_event, elem in context:
        if tree_event == EVENT_START:  # starting to read
            parent = tree[
                elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(log_lib.util.xes.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(
                        elem, parent, elem.get(log_lib.util.xes.KEY_KEY),
                        elem.get(log_lib.util.xes.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_DATE):
                try:
                    dt = ciso8601.parse_datetime(
                        elem.get(log_lib.util.xes.KEY_VALUE))
                    tree = __parse_attribute(
                        elem, parent, elem.get(log_lib.util.xes.KEY_KEY), dt,
                        tree)
                except TypeError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(log_lib.util.xes.KEY_VALUE)))
                except ValueError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(log_lib.util.xes.KEY_VALUE)))
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_EVENT):
                if event is not None:
                    raise SyntaxError(
                        'file contains <event> in another <event> tag')
                event = log_lib.log.Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError(
                        'file contains <trace> in another <trace> tag')
                trace = log_lib.log.Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(log_lib.util.xes.KEY_VALUE))
                        tree = __parse_attribute(
                            elem, parent, elem.get(log_lib.util.xes.KEY_KEY),
                            val, tree)
                    except ValueError:
                        logging.info("failed to parse float: " +
                                     str(elem.get(log_lib.util.xes.KEY_VALUE)))
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(log_lib.util.xes.KEY_VALUE))
                        tree = __parse_attribute(
                            elem, parent, elem.get(log_lib.util.xes.KEY_KEY),
                            val, tree)
                    except ValueError:
                        logging.info("failed to parse int: " +
                                     str(elem.get(log_lib.util.xes.KEY_VALUE)))
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(log_lib.util.xes.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(
                            elem, parent, elem.get(log_lib.util.xes.KEY_KEY),
                            val, tree)
                    except ValueError:
                        logging.info("failed to parse boolean: " +
                                     str(elem.get(log_lib.util.xes.KEY_VALUE)))
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_LIST):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(
                        elem, parent, elem.get(log_lib.util.xes.KEY_KEY), None,
                        tree)
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(
                        elem, parent, elem.get(log_lib.util.xes.KEY_KEY),
                        elem.get(log_lib.util.xes.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_EXTENSION):
                if log is None:
                    raise SyntaxError('extension found outside of <log> tag')
                if elem.get(
                        log_lib.util.xes.KEY_NAME) is not None and elem.get(
                            log_lib.util.xes.KEY_PREFIX
                        ) is not None and elem.get(
                            log_lib.util.xes.KEY_URI) is not None:
                    log.extensions[elem.get(log_lib.util.xes.KEY_NAME)] = {
                        log_lib.util.xes.KEY_PREFIX:
                        elem.get(log_lib.util.xes.KEY_PREFIX),
                        log_lib.util.xes.KEY_URI:
                        elem.get(log_lib.util.xes.KEY_URI)
                    }
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_GLOBAL):
                if log is None:
                    raise SyntaxError('global found outside of <log> tag')
                if elem.get(log_lib.util.xes.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(log_lib.util.xes.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(
                        log_lib.util.xes.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_CLASSIFIER):
                if log is None:
                    raise SyntaxError('classifier found outside of <log> tag')
                if elem.get(log_lib.util.xes.KEY_KEYS) is not None:
                    classifier_value = elem.get(log_lib.util.xes.KEY_KEYS)
                    if "'" in classifier_value:
                        log.classifiers[elem.get(
                            log_lib.util.xes.KEY_NAME)] = [
                                x for x in classifier_value.split("'")
                                if x.strip()
                            ]
                    else:
                        log.classifiers[elem.get(log_lib.util.xes.KEY_NAME
                                                 )] = classifier_value.split()
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_LOG):
                if log is not None:
                    raise SyntaxError('file contains > 1 <log> tags')
                log = log_lib.log.EventLog()
                tree[elem] = log.attributes
                continue

        elif tree_event == EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(log_lib.util.xes.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_TRACE):
                log.append(trace)
                trace = None
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_LOG):
                continue

    del context

    if timestamp_sort:
        log = sorting.sort_timestamp(log,
                                     timestamp_key=timestamp_key,
                                     reverse_sort=reverse_sort)
    if insert_trace_indexes:
        log.insert_trace_index_as_event_attribute()

    return log
Example no. 30
def apply(df0, classifier_function=None, parameters=None):
    if parameters is None:
        parameters = {}

    if classifier_function is None:
        classifier_function = lambda x: x["event_activity"]

    min_acti_freq = parameters["min_acti_freq"] if "min_acti_freq" in parameters else 0
    min_edge_freq = parameters["min_edge_freq"] if "min_edge_freq" in parameters else 0

    df = df0.copy()
    df = general.preprocess(df, parameters=parameters)

    df = clean_frequency.apply(df, min_acti_freq=min_acti_freq)
    df = clean_arc_frequency.apply(df, min_freq=min_edge_freq)

    models = {}

    obj_types = [x for x in df.columns if not x.startswith("event_")]
    activities_repeated = Counter()
    activities = set()
    edges = Counter()
    start_activities = dict()
    end_activities = dict()
    acti_spec = Counter()

    for ot in obj_types:
        start_activities[ot] = set()
        end_activities[ot] = set()

        new_df = df[["event_id", "event_activity", "event_timestamp", ot]].dropna(subset=[ot])
        new_df = new_df.sort_values("event_timestamp")
        new_df = new_df.rename(
            columns={ot: "case:concept:name", "event_timestamp": "time:timestamp"})
        log = new_df.to_dict("records")
        for ev in log:
            ev["event_objtype"] = ot
            ev["concept:name"] = classifier_function(ev)
            del ev["event_objtype"]
            del ev["event_activity"]
            activities.add((ev["event_id"], ev["concept:name"]))

        log = EventStream(log)
        this_activities = set(x["concept:name"] for x in log)
        for act in this_activities:
            activities_repeated[act] += 1
        log = log_conv_factory.apply(log, variant=log_conv_factory.TO_EVENT_LOG)
        log = sorting.sort_timestamp(log, "time:timestamp")

        for trace in log:
            if trace:
                start_activities[ot].add(trace[0]["concept:name"])
                end_activities[ot].add(trace[-1]["concept:name"])
                for i in range(len(trace) - 1):
                    ev0 = trace[i]
                    ev1 = trace[i + 1]
                    edges[(ot, ev0["concept:name"], ev1["concept:name"],
                           ev0["event_id"], ev1["event_id"],
                           trace.attributes["concept:name"],
                           ev0["time:timestamp"],
                           ev1["time:timestamp"])] += 1
                    acti_spec[(ot, ev0["concept:name"], ev0["event_id"],
                               trace.attributes["concept:name"],
                               ev0["time:timestamp"])] += 1
                # the last event of the trace also contributes to acti_spec
                acti_spec[(ot, trace[-1]["concept:name"],
                           trace[-1]["event_id"],
                           trace.attributes["concept:name"],
                           trace[-1]["time:timestamp"])] += 1

        models[ot] = alpha_miner.apply(log, parameters=parameters)

    activities_repeated = set(x for x in activities_repeated if activities_repeated[x] > 1)
    activities = dict(Counter(list(x[1] for x in activities)))

    return {"type": "petri", "models": models, "activities": activities, "activities_repeated": activities_repeated,
            "edges": edges, "acti_spec": acti_spec}