コード例 #1
0
    def __init__(self, path, parameters=None):
        """
        Initialize the iterable log_skeleton object

        Parameters
        -------------
        path
            Path to the XES log_skeleton
        """
        if parameters is None:
            parameters = {}
        self.path = path
        self.acceptance_condition = exec_utils.get_param_value(
            Parameters.ACCEPTANCE_CONDITION, parameters, lambda x: True)
        self.date_parser = dt_parser.get()
        self.reset()
コード例 #2
0
ファイル: line_by_line.py プロジェクト: luisfsts/pm4py-source
def import_log(filename, parameters=None):
    """
    Import a log object from a XML file
    containing the traces, the events and the simple attributes of them

    Parameters
    -----------
    filename
        XES file to parse
    parameters
        Parameters of the algorithm, including
            Parameters.TIMESTAMP_SORT -> Specify if we should sort log by timestamp
            Parameters.TIMESTAMP_KEY -> If sort is enabled, then sort the log by using this key
            Parameters.REVERSE_SORT -> Specify in which direction the log should be sorted
            Parameters.INSERT_TRACE_INDICES -> Specify if trace indexes should be added as event attribute for each event
            Parameters.MAX_TRACES -> Specify the maximum number of traces to import from the log (read in order in the XML file)
            Parameters.MAX_BYTES -> Maximum number of bytes to read
            Parameters.SKYP_BYTES -> Number of bytes to skip


    Returns
    -----------
    xes
        XES file
    """
    if parameters is None:
        parameters = {}

    date_parser = dt_parser.get()
    timestamp_sort = param_util.fetch(Parameters.TIMESTAMP_SORT, parameters)
    timestamp_key = param_util.fetch(Parameters.TIMESTAMP_KEY, parameters)
    reverse_sort = param_util.fetch(Parameters.REVERSE_SORT, parameters)
    insert_trace_indexes = param_util.fetch(Parameters.INSERT_TRACE_INDICES, parameters)
    max_no_traces_to_import = param_util.fetch(Parameters.MAX_TRACES, parameters)
    skip_bytes = param_util.fetch(Parameters.SKYP_BYTES, parameters)
    max_bytes_to_read = param_util.fetch(Parameters.MAX_BYTES, parameters)

    file_size = os.stat(filename).st_size

    if file_size > max_bytes_to_read:
        skip_bytes = file_size - max_bytes_to_read

    log = EventLog()
    tracecount = 0
    trace = None
    event = None

    f = open(filename, "r")
    f.seek(skip_bytes)

    for line in f:
        content = line.split("\"")
        if len(content) > 0:
            tag = content[0].split("<")[-1]
            if trace is not None:
                if event is not None:
                    if len(content) == 5:
                        if tag.startswith("string"):
                            event[content[1]] = content[3]
                        elif tag.startswith("date"):
                            event[content[1]] = date_parser.apply(content[3])
                        elif tag.startswith("int"):
                            event[content[1]] = int(content[3])
                        elif tag.startswith("float"):
                            event[content[1]] = float(content[3])
                        else:
                            event[content[1]] = content[3]
                    elif tag.startswith("/event"):
                        trace.append(event)
                        event = None
                elif tag.startswith("event"):
                    event = Event()
                elif len(content) == 5:
                    if tag.startswith("string"):
                        trace.attributes[content[1]] = content[3]
                    elif tag.startswith("date"):
                        trace.attributes[content[1]] = date_parser.apply(content[3])
                    elif tag.startswith("int"):
                        trace.attributes[content[1]] = int(content[3])
                    elif tag.startswith("float"):
                        trace.attributes[content[1]] = float(content[3])
                    else:
                        trace.attributes[content[1]] = content[3]
                elif tag.startswith("/trace"):
                    log.append(trace)
                    tracecount += 1
                    if tracecount > max_no_traces_to_import:
                        break
                    trace = None
            elif tag.startswith("trace"):
                trace = Trace()
    f.close()

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key, reverse_sort=reverse_sort)
    if insert_trace_indexes:
        log.insert_trace_index_as_event_attribute()

    return log
コード例 #3
0
def from_dict_to_event(event_dict):
    timestamp_field_name = "time:timestamp"
    if timestamp_field_name in event_dict.keys():
        event_dict[timestamp_field_name] = dt_parser.get().apply(
            event_dict[timestamp_field_name])
    return Event(event_dict)
コード例 #4
0
def import_from_context(context, num_traces, parameters=None):
    """
    Import a XES log from an iterparse context

    Parameters
    --------------
    context
        Iterparse context
    num_traces
        Number of traces of the XES log
    parameters
        Parameters of the algorithm

    Returns
    --------------
    log
        Event log
    """
    if parameters is None:
        parameters = {}

    max_no_traces_to_import = exec_utils.get_param_value(Parameters.MAX_TRACES, parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT, parameters, False)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT, parameters, False)
    show_progress_bar = exec_utils.get_param_value(Parameters.SHOW_PROGRESS_BAR, parameters, True)

    date_parser = dt_parser.get()
    progress = None
    if pkgutil.find_loader("tqdm") and show_progress_bar:
        from tqdm.auto import tqdm
        progress = tqdm(total=num_traces, desc="parsing log, completed traces :: ")

    log = None
    trace = None
    event = None

    tree = {}

    for tree_event, elem in context:
        if tree_event == _EVENT_START:  # starting to read
            parent = tree[elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_DATE):
                try:
                    dt = date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), dt, tree)
                except TypeError:
                    logging.info("failed to parse date: " + str(elem.get(xes_constants.KEY_VALUE)))
                except ValueError:
                    logging.info("failed to parse date: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_EVENT):
                if event is not None:
                    raise SyntaxError('file contains <event> in another <event> tag')
                event = Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError('file contains <trace> in another <trace> tag')
                trace = Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse float: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse int: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(xes_constants.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse boolean: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_LIST):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), None, tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_EXTENSION):
                if log is None:
                    raise SyntaxError('extension found outside of <log> tag')
                if elem.get(xes_constants.KEY_NAME) is not None and elem.get(
                        xes_constants.KEY_PREFIX) is not None and elem.get(xes_constants.KEY_URI) is not None:
                    log.extensions[elem.get(xes_constants.KEY_NAME)] = {
                        xes_constants.KEY_PREFIX: elem.get(xes_constants.KEY_PREFIX),
                        xes_constants.KEY_URI: elem.get(xes_constants.KEY_URI)}
                continue

            elif elem.tag.endswith(xes_constants.TAG_GLOBAL):
                if log is None:
                    raise SyntaxError('global found outside of <log> tag')
                if elem.get(xes_constants.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(xes_constants.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(xes_constants.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(xes_constants.TAG_CLASSIFIER):
                if log is None:
                    raise SyntaxError('classifier found outside of <log> tag')
                if elem.get(xes_constants.KEY_KEYS) is not None:
                    classifier_value = elem.get(xes_constants.KEY_KEYS)
                    if "'" in classifier_value:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = [x for x in classifier_value.split("'")
                                                                             if x.strip()]
                    else:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = classifier_value.split()
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                if log is not None:
                    raise SyntaxError('file contains > 1 <log> tags')
                log = EventLog()
                tree[elem] = log.attributes
                continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                log.append(trace)

                if progress is not None:
                    progress.update()

                trace = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                continue

    # gracefully close progress bar
    if progress is not None:
        progress.close()
    del context, progress

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key, reverse_sort=reverse_sort)

    # sets the activity key as default classifier in the log's properties
    log.properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_constants.DEFAULT_NAME_KEY
    log.properties[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = xes_constants.DEFAULT_NAME_KEY
    # sets the default timestamp key
    log.properties[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_constants.DEFAULT_TIMESTAMP_KEY
    # sets the default resource key
    log.properties[constants.PARAMETER_CONSTANT_RESOURCE_KEY] = xes_constants.DEFAULT_RESOURCE_KEY
    # sets the default transition key
    log.properties[constants.PARAMETER_CONSTANT_TRANSITION_KEY] = xes_constants.DEFAULT_TRANSITION_KEY
    # sets the default group key
    log.properties[constants.PARAMETER_CONSTANT_GROUP_KEY] = xes_constants.DEFAULT_GROUP_KEY

    return log
コード例 #5
0
def import_log(filename, parameters=None):
    """
    Imports an XES file into a log object

    Parameters
    ----------
    filename:
        Absolute filename
    parameters
        Parameters of the algorithm, including
            Parameters.TIMESTAMP_SORT -> Specify if we should sort log by timestamp
            Parameters.TIMESTAMP_KEY -> If sort is enabled, then sort the log by using this key
            Parameters.REVERSE_SORT -> Specify in which direction the log should be sorted
            Parameters.INSERT_TRACE_INDICES -> Specify if trace indexes should be added as event attribute for each event
            Parameters.MAX_TRACES -> Specify the maximum number of traces to import from the log (read in order in the XML file)

    Returns
    -------
    log : :class:`pm4py.log.log.EventLog`
        A log
    """

    parameters = dict() if parameters is None else parameters

    insert_trace_indexes = param_util.fetch(Parameters.INSERT_TRACE_INDICES,
                                            parameters)
    max_no_traces_to_import = param_util.fetch(Parameters.MAX_TRACES,
                                               parameters)

    date_parser = dt_parser.get()
    context = etree.iterparse(filename, events=[_EVENT_START, _EVENT_END])

    # check to see if log has a namespace before looking for traces  (but this might be more effort than worth)
    # but you could just assume that log use on the standard namespace desbried in XES
    # to only find elements that start a trace use tag="{http://www.xes-standard.org}trace"
    # or just use the {*} syntax to match to all namespaces with a trace element

    #count number of traces and setup progress bar
    no_trace = sum([
        1 for trace in etree.iterparse(
            filename, events=[_EVENT_START], tag="{*}trace")
    ])

    # make tqdm facultative
    progress = None
    if pkgutil.find_loader("tqdm"):
        from tqdm.auto import tqdm
        progress = tqdm(total=no_trace,
                        desc="parsing log, completed traces :: ")

    log = None
    trace = None
    event = None

    tree = {}
    for tree_event, elem in context:
        if tree_event == _EVENT_START:  # starting to read
            parent = tree[
                elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE),
                                             tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_DATE):
                try:
                    dt = date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             dt, tree)
                except TypeError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                except ValueError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_EVENT):
                if event is not None:
                    raise SyntaxError(
                        'file contains <event> in another <event> tag')
                event = Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError(
                        'file contains <trace> in another <trace> tag')
                trace = Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree)
                    except ValueError:
                        logging.info("failed to parse float: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree)
                    except ValueError:
                        logging.info("failed to parse int: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(xes_constants.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree)
                    except ValueError:
                        logging.info("failed to parse boolean: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_LIST):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             None, tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE),
                                             tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_EXTENSION):
                if log is None:
                    raise SyntaxError('extension found outside of <log> tag')
                if elem.get(xes_constants.KEY_NAME) is not None and elem.get(
                        xes_constants.KEY_PREFIX) is not None and elem.get(
                            xes_constants.KEY_URI) is not None:
                    log.extensions[elem.get(xes_constants.KEY_NAME)] = {
                        xes_constants.KEY_PREFIX:
                        elem.get(xes_constants.KEY_PREFIX),
                        xes_constants.KEY_URI:
                        elem.get(xes_constants.KEY_URI)
                    }
                continue

            elif elem.tag.endswith(xes_constants.TAG_GLOBAL):
                if log is None:
                    raise SyntaxError('global found outside of <log> tag')
                if elem.get(xes_constants.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(xes_constants.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(
                        xes_constants.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(xes_constants.TAG_CLASSIFIER):
                if log is None:
                    raise SyntaxError('classifier found outside of <log> tag')
                if elem.get(xes_constants.KEY_KEYS) is not None:
                    classifier_value = elem.get(xes_constants.KEY_KEYS)
                    if "'" in classifier_value:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = [
                            x for x in classifier_value.split("'")
                            if x.strip()
                        ]
                    else:
                        log.classifiers[elem.get(xes_constants.KEY_NAME
                                                 )] = classifier_value.split()
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                if log is not None:
                    raise SyntaxError('file contains > 1 <log> tags')
                log = EventLog()
                tree[elem] = log.attributes
                continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                log.append(trace)

                #update progress bar as we have a completed trace
                if progress is not None:
                    progress.update()

                trace = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                continue

    #gracefully close progress bar
    if progress is not None:
        progress.close()
    del context, progress

    if Parameters.TIMESTAMP_SORT in parameters and parameters[
            Parameters.TIMESTAMP_SORT]:
        log = sorting.sort_timestamp(
            log,
            timestamp_key=param_util.fetch(Parameters.TIMESTAMP_KEY,
                                           parameters),
            reverse_sort=param_util.fetch(Parameters.REVERSE_SORT, parameters))
    if insert_trace_indexes:
        log = index_attribute.insert_event_index_as_event_attribute(log)

    return log
コード例 #6
0
ファイル: line_by_line.py プロジェクト: pm4py/pm4py-core
def import_log_from_file_object(f,
                                encoding,
                                file_size=sys.maxsize,
                                parameters=None):
    """
    Import a log object from a (XML) file object

    Parameters
    -----------
    f
        file object
    encoding
        Encoding
    file_size
        Size of the file (measured on disk)
    parameters
        Parameters of the algorithm, including
            Parameters.TIMESTAMP_SORT -> Specify if we should sort log by timestamp
            Parameters.TIMESTAMP_KEY -> If sort is enabled, then sort the log by using this key
            Parameters.REVERSE_SORT -> Specify in which direction the log should be sorted
            Parameters.MAX_TRACES -> Specify the maximum number of traces to import from the log (read in order in the XML file)
            Parameters.MAX_BYTES -> Maximum number of bytes to read
            Parameters.SKYP_BYTES -> Number of bytes to skip
            Parameters.SET_ATTRIBUTES_TO_READ -> Names of the attributes that should be parsed. If not specified,
                                                then, all the attributes are parsed.

    Returns
    -----------
    log
        Log file
    """
    values_dict = {}
    date_parser = dt_parser.get()

    set_attributes_to_read = exec_utils.get_param_value(
        Parameters.SET_ATTRIBUTES_TO_READ, parameters, None)
    max_no_traces_to_import = exec_utils.get_param_value(
        Parameters.MAX_TRACES, parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT,
                                                parameters, False)
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT,
                                              parameters, False)

    skip_bytes = exec_utils.get_param_value(Parameters.SKIP_BYTES, parameters,
                                            False)
    max_bytes_to_read = exec_utils.get_param_value(Parameters.MAX_BYTES,
                                                   parameters, sys.maxsize)

    if file_size > max_bytes_to_read:
        skip_bytes = file_size - max_bytes_to_read

    log = EventLog()
    tracecount = 0
    trace = None
    event = None

    f.seek(skip_bytes)

    for line in f:
        content = line.decode(encoding).split("\"")
        if len(content) > 0:
            tag = content[0].split("<")[-1]
            if trace is not None:
                if event is not None:
                    if len(content) == 5:
                        key, value = read_attribute_key_value(
                            tag, content, date_parser, values_dict,
                            set_attributes_to_read)
                        if value is not None:
                            event[key] = value
                    elif tag.startswith("/event"):
                        trace.append(event)
                        event = None
                elif tag.startswith("event"):
                    event = Event()
                elif len(content) == 5:
                    key, value = read_attribute_key_value(
                        tag, content, date_parser, values_dict,
                        set_attributes_to_read)
                    if value is not None:
                        trace.attributes[key] = value
                elif tag.startswith("/trace"):
                    log.append(trace)
                    tracecount += 1
                    if tracecount > max_no_traces_to_import:
                        break
                    trace = None
            elif tag.startswith("trace"):
                trace = Trace()

    if timestamp_sort:
        log = sorting.sort_timestamp(log,
                                     timestamp_key=timestamp_key,
                                     reverse_sort=reverse_sort)

    # sets the activity key as default classifier in the log's properties
    log.properties[
        constants.
        PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_constants.DEFAULT_NAME_KEY
    log.properties[
        constants.
        PARAMETER_CONSTANT_ATTRIBUTE_KEY] = xes_constants.DEFAULT_NAME_KEY
    # sets the default timestamp key
    log.properties[
        constants.
        PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_constants.DEFAULT_TIMESTAMP_KEY
    # sets the default resource key
    log.properties[
        constants.
        PARAMETER_CONSTANT_RESOURCE_KEY] = xes_constants.DEFAULT_RESOURCE_KEY
    # sets the default transition key
    log.properties[
        constants.
        PARAMETER_CONSTANT_TRANSITION_KEY] = xes_constants.DEFAULT_TRANSITION_KEY
    # sets the default group key
    log.properties[
        constants.
        PARAMETER_CONSTANT_GROUP_KEY] = xes_constants.DEFAULT_GROUP_KEY

    return log
コード例 #7
0
ファイル: iterparse.py プロジェクト: hieule23/pm4py-core
def import_log(filename, parameters=None):
    """
    Imports an XES file into a log_skeleton object

    Parameters
    ----------
    filename:
        Absolute filename
    parameters
        Parameters of the algorithm, including
            Parameters.TIMESTAMP_SORT -> Specify if we should sort log_skeleton by timestamp
            Parameters.TIMESTAMP_KEY -> If sort is enabled, then sort the log_skeleton by using this key
            Parameters.REVERSE_SORT -> Specify in which direction the log_skeleton should be sorted
            Parameters.MAX_TRACES -> Specify the maximum number of traces to import from the log_skeleton (read in order in the XML file)

    Returns
    -------
    log_skeleton : :class:`pm4py.log_skeleton.log_skeleton.EventLog`
        A log_skeleton
    """
    from lxml import etree

    if parameters is None:
        parameters = {}

    max_no_traces_to_import = exec_utils.get_param_value(
        Parameters.MAX_TRACES, parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT,
                                                parameters, False)
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT,
                                              parameters, False)

    date_parser = dt_parser.get()

    # count number of traces and setup progress bar
    no_trace = count_traces(filename)

    context = etree.iterparse(filename, events=[_EVENT_START, _EVENT_END])

    # make tqdm facultative
    progress = None
    if pkgutil.find_loader("tqdm"):
        from tqdm.auto import tqdm
        progress = tqdm(total=no_trace,
                        desc="parsing log_skeleton, completed traces :: ")

    log = None
    trace = None
    event = None

    tree = {}
    for tree_event, elem in context:
        if tree_event == _EVENT_START:  # starting to read
            parent = tree[
                elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE),
                                             tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_DATE):
                try:
                    dt = date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             dt, tree)
                except TypeError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                except ValueError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_EVENT):
                if event is not None:
                    raise SyntaxError(
                        'file contains <event> in another <event> tag')
                event = Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError(
                        'file contains <trace> in another <trace> tag')
                trace = Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree)
                    except ValueError:
                        logging.info("failed to parse float: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree)
                    except ValueError:
                        logging.info("failed to parse int: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(xes_constants.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree)
                    except ValueError:
                        logging.info("failed to parse boolean: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_LIST):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             None, tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE),
                                             tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_EXTENSION):
                if log is None:
                    raise SyntaxError(
                        'extension found outside of <log_skeleton> tag')
                if elem.get(xes_constants.KEY_NAME) is not None and elem.get(
                        xes_constants.KEY_PREFIX) is not None and elem.get(
                            xes_constants.KEY_URI) is not None:
                    log.extensions[elem.get(xes_constants.KEY_NAME)] = {
                        xes_constants.KEY_PREFIX:
                        elem.get(xes_constants.KEY_PREFIX),
                        xes_constants.KEY_URI:
                        elem.get(xes_constants.KEY_URI)
                    }
                continue

            elif elem.tag.endswith(xes_constants.TAG_GLOBAL):
                if log is None:
                    raise SyntaxError(
                        'global found outside of <log_skeleton> tag')
                if elem.get(xes_constants.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(xes_constants.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(
                        xes_constants.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(xes_constants.TAG_CLASSIFIER):
                if log is None:
                    raise SyntaxError(
                        'classifier found outside of <log_skeleton> tag')
                if elem.get(xes_constants.KEY_KEYS) is not None:
                    classifier_value = elem.get(xes_constants.KEY_KEYS)
                    if "'" in classifier_value:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = [
                            x for x in classifier_value.split("'")
                            if x.strip()
                        ]
                    else:
                        log.classifiers[elem.get(xes_constants.KEY_NAME
                                                 )] = classifier_value.split()
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                if log is not None:
                    raise SyntaxError('file contains > 1 <log_skeleton> tags')
                log = EventLog()
                tree[elem] = log.attributes
                continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                log.append(trace)

                # update progress bar as we have a completed trace
                if progress is not None:
                    progress.update()

                trace = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                continue

    # gracefully close progress bar
    if progress is not None:
        progress.close()
    del context, progress

    if timestamp_sort:
        log = sorting.sort_timestamp(log,
                                     timestamp_key=timestamp_key,
                                     reverse_sort=reverse_sort)

    return log