Example #1
    def get_case_duration(self,
                          session,
                          process,
                          use_transition,
                          no_samples,
                          max_ret_items=100000):
        all_slaves = list(self.slaves.keys())

        threads = []
        points = []

        for slave in all_slaves:
            slave_host = self.slaves[slave][1]
            slave_port = str(self.slaves[slave][2])

            m = CaseDurationRequest(session, slave_host, slave_port,
                                    use_transition, no_samples, process)
            m.max_ret_items = max_ret_items

            m.start()

            threads.append(m)

        for thread in threads:
            thread.join()

            points = points + thread.content["points"]

        points = sorted(points)
        if len(points) > max_ret_items:
            points = points_subset.pick_chosen_points_list(
                max_ret_items, points)

        return points
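Every example in this listing funnels its result through points_subset.pick_chosen_points_list(no_points, points), which caps a sorted list at a fixed number of roughly equally spaced elements. A minimal sketch of an equivalent helper (a hypothetical stand-in, not pm4py's actual implementation):

# Hypothetical stand-in for points_subset.pick_chosen_points_list:
# keep no_points elements spread uniformly over the input list.
def pick_points_uniformly(no_points, points):
    step = len(points) / no_points
    return [points[int(i * step)] for i in range(no_points)]

print(pick_points_uniformly(4, list(range(100))))  # [0, 25, 50, 75]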
Example #2
# Imports needed by this snippet; pick_chosen_points_list is assumed
# to come from pm4py's points_subset module.
from scipy.stats import gaussian_kde
import numpy as np
import pandas as pd
from pm4py.util.points_subset import pick_chosen_points_list


def get_kde_date_attribute(values, parameters=None):
    """
    Gets the KDE estimation for the distribution of the values of a date attribute

    Parameters
    -------------
    values
        Values of the date attribute
    parameters
        Possible parameters of the algorithm, including:
            graph_points -> number of points to include in the graph


    Returns
    --------------
    x
        X-axis values to represent
    y
        Y-axis values to represent
    """
    if parameters is None:
        parameters = {}

    graph_points = parameters.get("graph_points", 200)
    points_to_sample = parameters.get("points_to_sample", 400)
    red_values = pick_chosen_points_list(points_to_sample, values)
    int_values = sorted(
        [x.replace(tzinfo=None).timestamp() for x in red_values])
    density = gaussian_kde(int_values)
    xs = np.linspace(min(int_values), max(int_values), graph_points)
    xs_transf = pd.to_datetime(xs * 10 ** 9)

    return [xs_transf, density(xs)]
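Assuming scipy, numpy and pandas are installed, the same pipeline can be exercised end to end on synthetic dates; everything here besides gaussian_kde, np.linspace and pd.to_datetime is illustrative:

from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde

# synthetic date values clustered around two days
base = datetime(2021, 1, 1)
values = [base + timedelta(hours=h)
          for h in list(range(0, 24)) + list(range(72, 96))]

# dates -> epoch seconds -> KDE -> dates again, as in get_kde_date_attribute
int_values = sorted(v.timestamp() for v in values)
density = gaussian_kde(int_values)
xs = np.linspace(min(int_values), max(int_values), 50)
x_axis = pd.to_datetime(xs * 10 ** 9)  # nanoseconds back to datetimes
y_axis = density(xs)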
Example #3
def sample_dataframe(df, parameters=None):
    """
    Sample a dataframe on a given number of cases

    Parameters
    --------------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.CASE_ID_KEY
        - Parameters.MAX_NO_CASES

    Returns
    -------------
    sampled_df
        Sampled dataframe
    """
    if parameters is None:
        parameters = {}

    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                             parameters,
                                             constants.CASE_CONCEPT_NAME)
    max_no_cases = exec_utils.get_param_value(Parameters.MAX_NO_CASES,
                                              parameters, 100)

    case_ids = list(df[case_id_key].unique())
    case_id_to_retain = points_subset.pick_chosen_points_list(
        min(max_no_cases, len(case_ids)), case_ids)

    return df[df[case_id_key].isin(case_id_to_retain)]
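The same case-level sampling can be reproduced with plain pandas. This sketch hard-codes pm4py's default case id column, case:concept:name, and applies the uniform-pick idea directly:

import pandas as pd

df = pd.DataFrame({
    "case:concept:name": ["c1", "c1", "c2", "c2", "c3"],
    "concept:name": ["A", "B", "A", "C", "A"],
})

# keep at most max_no_cases cases, chosen uniformly over the distinct ids
max_no_cases = 2
case_ids = list(df["case:concept:name"].unique())
n = min(max_no_cases, len(case_ids))
retained = [case_ids[int(i * len(case_ids) / n)] for i in range(n)]
sampled_df = df[df["case:concept:name"].isin(retained)]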
Example #4
    def get_events_per_time(self, session, process, use_transition, no_samples, max_ret_items=100000,
                            timestamp_key=xes.DEFAULT_TIMESTAMP_KEY):
        all_slaves = list(self.slaves.keys())

        threads = []
        points = []

        for slave in all_slaves:
            slave_host = self.slaves[slave][1]
            slave_port = str(self.slaves[slave][2])

            m = EventsPerTimeRequest(session, slave_host, slave_port, use_transition, no_samples, process)
            m.max_ret_items = max_ret_items
            m.timestamp_key = timestamp_key

            m.start()

            threads.append(m)

        for thread in threads:
            thread.join()

            points = points + thread.content["points"]

        points = sorted(points)
        if len(points) > max_ret_items:
            points = points_subset.pick_chosen_points_list(max_ret_items, points)

        return points
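Examples #1 and #4 share one fan-out pattern: start one request thread per slave, then join each thread and concatenate its content["points"]. A self-contained sketch of that pattern, with a Worker class standing in for the request classes:

import threading

class Worker(threading.Thread):
    # illustrative stand-in for CaseDurationRequest / EventsPerTimeRequest
    def __init__(self, host):
        super().__init__()
        self.host = host
        self.content = {}

    def run(self):
        # a real worker would query the slave here
        self.content["points"] = [len(self.host)]

threads = [Worker(h) for h in ("slave-a", "slave-b")]
for t in threads:
    t.start()

points = []
for t in threads:
    t.join()  # join before reading content, exactly as in the examples
    points = points + t.content["points"]

print(sorted(points))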
Example #5
def apply(log, list_activities, sample_size, parameters=None):
    """
    Finds the performance spectrum provided a log
    and a list of activities

    Parameters
    -------------
    log
        Log
    list_activities
        List of activities interesting for the performance spectrum (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including the activity key and the timestamp key

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    if parameters is None:
        parameters = {}

    activity_key = parameters.get(constants.PARAMETER_CONSTANT_ACTIVITY_KEY,
                                  xes.DEFAULT_NAME_KEY)
    timestamp_key = parameters.get(constants.PARAMETER_CONSTANT_TIMESTAMP_KEY,
                                   xes.DEFAULT_TIMESTAMP_KEY)

    log = sorting.sort_timestamp_log(log, timestamp_key=timestamp_key)
    parameters[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    log = basic_filter.filter_log_events_attr(log,
                                              list_activities,
                                              parameters=parameters)

    points = []

    for trace in log:
        for i in range(len(trace) - len(list_activities) + 1):
            acti_comb = [
                event[activity_key]
                for event in trace[i:i + len(list_activities)]
            ]

            if acti_comb == list_activities:
                timest_comb = [
                    event[timestamp_key].timestamp()
                    for event in trace[i:i + len(list_activities)]
                ]

                points.append(timest_comb)

    points = sorted(points, key=lambda x: x[0])

    if len(points) > sample_size:
        points = points_subset.pick_chosen_points_list(sample_size, points)

    return points
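Stripped of the log objects, the matching core is a sliding window that compares a slice of activity names against list_activities and, on a hit, records the window's timestamps. With synthetic (activity, timestamp) pairs:

trace = [("A", 1.0), ("B", 2.0), ("C", 3.0), ("A", 4.0), ("B", 6.0)]
list_activities = ["A", "B"]

points = []
for i in range(len(trace) - len(list_activities) + 1):
    window = trace[i:i + len(list_activities)]
    if [a for a, _ in window] == list_activities:
        points.append([t for _, t in window])

print(points)  # [[1.0, 2.0], [4.0, 6.0]]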
Example #6
def get_numeric_attribute_values(path,
                                 log_name,
                                 managed_logs,
                                 parameters=None):
    if parameters is None:
        parameters = {}

    no_samples = parameters.get(PARAMETER_NO_SAMPLES, DEFAULT_MAX_NO_SAMPLES)
    use_transition = parameters.get(PARAMETER_USE_TRANSITION,
                                    DEFAULT_USE_TRANSITION)
    activity_key = DEFAULT_NAME_KEY if not use_transition else "@@classifier"
    filters = parameters.get(FILTERS, [])
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key

    max_no_of_points_to_sample = parameters.get("max_no_of_points_to_sample",
                                                100000)

    attribute_key = parameters["attribute_key"]

    folder = os.path.join(path, log_name)
    columns = get_columns_to_import(filters, [attribute_key],
                                    use_transition=use_transition)

    parquet_list = parquet_importer.get_list_parquet(folder)

    overall_list = []
    count = 0
    for index, pq in enumerate(parquet_list):
        pq_basename = Path(pq).name
        # process a parquet only when it belongs to the managed logs
        # (mirrors get_events_per_time_first below)
        if pq_basename in managed_logs:
            count = count + 1

            df = get_filtered_parquet(pq,
                                      columns,
                                      filters,
                                      use_transition=use_transition,
                                      parameters=parameters)
            df = df.dropna()

            if len(df) > max_no_of_points_to_sample:
                df = df.sample(n=max_no_of_points_to_sample)

            values = list(df[attribute_key])

            overall_list = overall_list + values

            if count >= no_samples:
                break

    overall_list = sorted(overall_list)
    if len(overall_list) > max_no_of_points_to_sample:
        overall_list = points_subset.pick_chosen_points_list(
            max_no_of_points_to_sample, overall_list)

    return overall_list
Example #7
def get_events_per_time_first(path, log_name, managed_logs, parameters=None):
    if parameters is None:
        parameters = {}

    no_samples = parameters.get(PARAMETER_NO_SAMPLES, DEFAULT_MAX_NO_SAMPLES)
    use_transition = parameters.get(PARAMETER_USE_TRANSITION,
                                    DEFAULT_USE_TRANSITION)
    activity_key = DEFAULT_NAME_KEY if not use_transition else PARAMETER_PM4PYWS_CLASSIFIER
    filters = parameters.get(FILTERS, [])
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key

    max_no_of_points_to_sample = parameters.get("max_no_of_points_to_sample",
                                                100000)

    folder = os.path.join(path, log_name)
    columns = get_columns_to_import(filters,
                                    [CASE_CONCEPT_NAME, DEFAULT_TIMESTAMP_KEY],
                                    use_transition=use_transition)

    parquet_list = parquet_importer.get_list_parquet(folder)

    overall_list = []
    count = 0
    for index, pq in enumerate(parquet_list):
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1

            df = get_filtered_parquet(pq,
                                      columns,
                                      filters,
                                      use_transition=use_transition,
                                      parameters=parameters)
            df = df.groupby(CASE_CONCEPT_NAME).first()

            if len(df) > max_no_of_points_to_sample:
                df = df.sample(n=max_no_of_points_to_sample)

            date_values = [
                x.timestamp() for x in list(df[DEFAULT_TIMESTAMP_KEY])
            ]
            overall_list = overall_list + date_values

            if count >= no_samples:
                break

    overall_list = sorted(overall_list)
    if len(overall_list) > max_no_of_points_to_sample:
        overall_list = points_subset.pick_chosen_points_list(
            max_no_of_points_to_sample, overall_list)

    return overall_list
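The groupby(CASE_CONCEPT_NAME).first() step reduces the event table to one row per case, keeping each case's first event; this yields the case start timestamps provided events are already ordered by time within each case. A small pandas sketch with pm4py's default column names:

import pandas as pd

df = pd.DataFrame({
    "case:concept:name": ["c1", "c1", "c2", "c2"],
    "time:timestamp": pd.to_datetime(
        ["2021-01-01 08:00", "2021-01-01 09:00",
         "2021-01-02 10:00", "2021-01-02 11:30"]),
})

# one row per case: the first event of each (assumes timestamp order)
first_events = df.groupby("case:concept:name").first()
date_values = [x.timestamp() for x in first_events["time:timestamp"]]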
Example #8
def get_case_duration(path, log_name, managed_logs, parameters=None):
    if parameters is None:
        parameters = {}

    no_samples = parameters.get(PARAMETER_NO_SAMPLES, DEFAULT_MAX_NO_SAMPLES)
    use_transition = parameters.get(PARAMETER_USE_TRANSITION,
                                    DEFAULT_USE_TRANSITION)
    activity_key = DEFAULT_NAME_KEY if not use_transition else "@@classifier"
    filters = parameters.get(FILTERS, [])
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key

    max_no_of_points_to_sample = parameters.get("max_no_of_points_to_sample",
                                                100000)

    folder = os.path.join(path, log_name)
    columns = get_columns_to_import(filters,
                                    [CASE_CONCEPT_NAME, DEFAULT_TIMESTAMP_KEY],
                                    use_transition=use_transition)

    parquet_list = parquet_importer.get_list_parquet(folder)

    overall_list = []
    count = 0
    for index, pq in enumerate(parquet_list):
        pq_basename = Path(pq).name
        # process a parquet only when it belongs to the managed logs
        if pq_basename in managed_logs:
            count = count + 1

            df = get_filtered_parquet(pq,
                                      columns,
                                      filters,
                                      use_transition=use_transition,
                                      parameters=parameters)

            cases = case_statistics.get_cases_description(
                df, parameters=parameters)
            duration_values = [x["caseDuration"] for x in cases.values()]

            overall_list = overall_list + duration_values

            if count >= no_samples:
                break

    overall_list = sorted(overall_list)
    if len(overall_list) > max_no_of_points_to_sample:
        overall_list = points_subset.pick_chosen_points_list(
            max_no_of_points_to_sample, overall_list)

    return overall_list
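case_statistics.get_cases_description returns per-case statistics including caseDuration, i.e. last-event timestamp minus first-event timestamp. A plain-pandas sketch of the same quantity (column names follow pm4py defaults; the rest is illustrative):

import pandas as pd

df = pd.DataFrame({
    "case:concept:name": ["c1", "c1", "c2", "c2"],
    "time:timestamp": pd.to_datetime(
        ["2021-01-01 08:00", "2021-01-01 10:00",
         "2021-01-02 09:00", "2021-01-02 09:45"]),
})

grouped = df.groupby("case:concept:name")["time:timestamp"]
duration_values = (grouped.max() - grouped.min()).dt.total_seconds().tolist()
print(duration_values)  # [7200.0, 2700.0]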
Example #9
def get_kde_date_attribute(values, parameters=None):
    """
    Gets the KDE estimation for the distribution of the values of a date attribute

    Parameters
    -------------
    values
        Values of the date attribute
    parameters
        Possible parameters of the algorithm, including:
            graph_points -> number of points to include in the graph


    Returns
    --------------
    x
        X-axis values to represent
    y
        Y-axis values to represent
    """
    if pkgutil.find_loader("scipy") and pkgutil.find_loader(
            "numpy") and pkgutil.find_loader("pandas"):
        from scipy.stats import gaussian_kde
        import numpy as np
        import pandas as pd

        if parameters is None:
            parameters = {}

        graph_points = exec_utils.get_param_value(Parameters.GRAPH_POINTS,
                                                  parameters, 200)
        points_to_sample = exec_utils.get_param_value(
            Parameters.POINT_TO_SAMPLE, parameters, 400)

        red_values = pick_chosen_points_list(points_to_sample, values)
        int_values = sorted(
            [x.replace(tzinfo=None).timestamp() for x in red_values])
        density = gaussian_kde(int_values)
        xs = np.linspace(min(int_values), max(int_values), graph_points)
        xs_transf = pd.to_datetime(xs * 10**9)

        return [xs_transf, density(xs)]
    else:
        msg = "scipy is not available. graphs cannot be built!"
        logging.error(msg)
        raise Exception(msg)
Example #10
def apply(
    dataframe: pd.DataFrame,
    list_activities: List[str],
    sample_size: int,
    parameters: Optional[Dict[Union[str, Parameters], Any]] = None
) -> List[Dict[str, Any]]:
    """
    Finds the disconnected performance spectrum provided a dataframe
    and a list of activities

    Parameters
    -------------
    dataframe
        Dataframe
    list_activities
        List of activities interesting for the performance spectrum (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including:
            - Parameters.ACTIVITY_KEY
            - Parameters.TIMESTAMP_KEY
            - Parameters.CASE_ID_KEY

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters,
                                              constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, xes.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               xes.DEFAULT_TIMESTAMP_KEY)

    sort_log_required = exec_utils.get_param_value(
        Parameters.SORT_LOG_REQUIRED, parameters, True)

    dataframe = dataframe[[case_id_glue, activity_key, timestamp_key]]
    dataframe = dataframe[dataframe[activity_key].isin(list_activities)]
    dataframe = pandas_utils.insert_index(dataframe,
                                          constants.DEFAULT_EVENT_INDEX_KEY)
    if sort_log_required:
        dataframe = dataframe.sort_values(
            [case_id_glue, timestamp_key, constants.DEFAULT_EVENT_INDEX_KEY])
    dataframe[timestamp_key] = dataframe[timestamp_key].astype(
        np.int64) / 10**9

    all_patterns = [(len(list_activities) - i,
                     gen_patterns(list_activities, len(list_activities) - i))
                    for i in range(len(list_activities) - 1)]

    def key(k, n):
        return k + str(n)

    def to_points(match, l):
        return {
            'case_id': match[key(case_id_glue, 0)],
            'points': [(match[key(activity_key, i)],
                        match[key(timestamp_key, i)]) for i in range(l)]
        }

    points = []
    for l, patterns in all_patterns:
        # concat shifted and suffixed dataframes to get a dataframe that allows to check for the patterns
        dfs = [dataframe.add_suffix(str(i)).shift(-i) for i in range(l)]
        df_merged = pd.concat(dfs, axis=1)

        indices = [shift_index(dfs[i].index, i) for i in range(len(dfs))]
        mindex = pd.MultiIndex.from_arrays(indices)
        df_merged = df_merged.set_index(mindex)

        for i in range(l - 1):
            df_merged = df_merged[df_merged[key(case_id_glue, i)] == df_merged[
                key(case_id_glue, i + 1)]]

        column_list = [key(activity_key, i) for i in range(l)]
        matches = df_merged[np.isin(df_merged[column_list].sum(axis=1),
                                    patterns)]
        points.extend([to_points(m, l) for m in matches.to_dict('records')])
        # drop rows of this match to not discover subsets of this match again
        dataframe = dataframe.drop(
            [int(i) for indices in matches.index for i in indices[:-1]])

    points = sorted(points,
                    key=lambda p: min(p['points'], key=lambda xy: xy[1])[1])
    if len(points) > sample_size:
        points = points_subset.pick_chosen_points_list(sample_size, points)

    return points
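The add_suffix(str(i)).shift(-i) trick aligns each event with its i-th successor: after pd.concat, every row holds a window of l consecutive events side by side, and the case-id equality filter keeps only windows that do not cross case boundaries. A compact sketch of that alignment for l = 2:

import pandas as pd

df = pd.DataFrame({"case": ["c1", "c1", "c2"], "act": ["A", "B", "A"]})

# pair each event with its direct successor
l = 2
dfs = [df.add_suffix(str(i)).shift(-i) for i in range(l)]
merged = pd.concat(dfs, axis=1)

# keep only windows that stay within one case
merged = merged[merged["case0"] == merged["case1"]]
print(merged[["act0", "act1"]])  # a single row: A followed by B in case c1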
Example #11
    def __init__(self, trace: Trace, sync_net: PetriNet, sync_im: Marking, sync_fm: Marking,
                 parameters: Optional[Dict[Any, Any]] = None):
        """
        Constructor

        Parameters
        ---------------
        trace
            Trace
        sync_net
            Synchronous product net
        sync_im
            Initial marking
        sync_fm
            Final marking
        parameters
            Parameters of the algorithm, including:
            - Parameters.CASE_ID_KEY => attribute to use as case identifier
            - Parameters.ACTIVITY_KEY => attribute to use as activity
            - Parameters.COSTS => (if provided) the cost function (otherwise the default cost function is applied)
            - Parameters.SPLIT_IDX => (if provided) the split points as indices of elements of the trace
                (e.g. for ["A", "B", "C", "D", "E"], specifying [1,3] as split points means splitting at "B" and "D").
                If not provided, some split points at uniform distances are found.
            - Parameters.MAX_K_VALUE => the maximum number of split points that is allowed (trim the specified indexes
                if necessary).
            - Parameters.INCIDENCE_MATRIX => (if provided) the incidence matrix associated to the sync product net
            - Parameters.A => (if provided) the A numpy matrix of the incidence matrix
            - Parameters.CONSUMPTION_MATRIX => (if provided) the consumption matrix associated to the sync product net
            - Parameters.C => (if provided) the C numpy matrix of the consumption matrix
            - Parameters.FULL_BOOTSTRAP_REQUIRED => The preset/postset of places/transitions need to be inserted
        """
        if parameters is None:
            parameters = {}

        activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
        max_k_value = exec_utils.get_param_value(Parameters.MAX_K_VALUE, parameters, 5)
        costs = exec_utils.get_param_value(Parameters.COSTS, parameters, None)
        split_idx = exec_utils.get_param_value(Parameters.SPLIT_IDX, parameters, None)
        self.full_bootstrap_required = exec_utils.get_param_value(Parameters.FULL_BOOTSTRAP_REQUIRED, parameters, True)

        self.trace = [x[activity_key] for x in trace]
        if costs is None:
            costs = align_utils.construct_standard_cost_function(sync_net, align_utils.SKIP)
        if split_idx is None:
            split_idx = [i for i in range(1, len(trace))]
        self.split_idx = split_idx
        if len(self.split_idx) > max_k_value:
            self.split_idx = points_subset.pick_chosen_points_list(max_k_value, self.split_idx)
        self.k = len(self.split_idx) if len(self.split_idx) > 1 else 2
        self.sync_net = sync_net
        self.ini = sync_im
        self.fin = sync_fm
        self.costs = costs
        self.incidence_matrix = exec_utils.get_param_value(Parameters.INCIDENCE_MATRIX, parameters,
                                                           IncidenceMatrix(self.sync_net))
        self.consumption_matrix = exec_utils.get_param_value(Parameters.CONSUMPTION_MATRIX, parameters,
                                                             ConsumptionMatrix(self.sync_net))
        self.A = exec_utils.get_param_value(Parameters.A, parameters, np.asmatrix(self.incidence_matrix.a_matrix))
        self.C = exec_utils.get_param_value(Parameters.C, parameters, np.asmatrix(self.consumption_matrix.c_matrix))

        self.__build_entities()
Example #12
def apply(
    log: EventLog,
    list_activities: List[str],
    sample_size: int,
    parameters: Optional[Dict[Union[str, Parameters], Any]] = None
) -> List[Dict[str, Any]]:
    """
    Finds the disconnected performance spectrum provided a log
    and a list of activities

    Parameters
    -------------
    log
        Log
    list_activities
        List of activities interesting for the performance spectrum (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including:
            - Parameters.ACTIVITY_KEY
            - Parameters.TIMESTAMP_KEY

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    if parameters is None:
        parameters = {}

    sort_log_required = exec_utils.get_param_value(
        Parameters.SORT_LOG_REQUIRED, parameters, True)

    all_acti_combs = set(
        tuple(list_activities[j:j + i])
        for i in range(2, len(list_activities) + 1)
        for j in range(0, len(list_activities) - i + 1))
    two_acti_combs = set((list_activities[i], list_activities[i + 1])
                         for i in range(len(list_activities) - 1))

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, xes.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               xes.DEFAULT_TIMESTAMP_KEY)
    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                             parameters,
                                             xes.DEFAULT_TRACEID_KEY)

    parameters[Parameters.ATTRIBUTE_KEY] = activity_key
    log = basic_filter.filter_log_events_attr(log,
                                              list_activities,
                                              parameters=parameters)
    if sort_log_required:
        log = sorting.sort_timestamp_log(log, timestamp_key=timestamp_key)

    points = []
    for trace in log:
        matches = [(i, i + 1) for i in range(len(trace) - 1)
                   if (trace[i][activity_key],
                       trace[i + 1][activity_key]) in two_acti_combs]

        i = 0
        while i < len(matches) - 1:
            # the combined activity sequence must be materialized as a tuple:
            # a bare generator would never test equal to the tuples stored
            # in all_acti_combs, so adjacent matches would never be merged
            match_act = tuple(trace[mi][activity_key]
                              for mi in (matches[i] + matches[i + 1][1:]))
            if matches[i][-1] == matches[i + 1][0] and match_act in all_acti_combs:
                matches[i] = matches[i] + matches[i + 1][1:]
                del matches[i + 1]
                i = 0
            else:
                i += 1

        if matches:
            matches = set(matches)
            timest_comb = [{
                'points': [(trace[i][activity_key],
                            trace[i][timestamp_key].timestamp())
                           for i in match]
            } for match in matches]
            for p in timest_comb:
                p['case_id'] = trace.attributes[case_id_key]

            points += timest_comb

    points = sorted(points,
                    key=lambda p: min(p['points'], key=lambda xy: xy[1])[1])

    if len(points) > sample_size:
        points = points_subset.pick_chosen_points_list(sample_size, points)

    return points
Example #13
def apply(
    log: EventLog,
    list_activities: List[str],
    sample_size: int,
    parameters: Optional[Dict[Union[str, Parameters], Any]] = None
) -> List[List[float]]:
    """
    Finds the performance spectrum provided a log
    and a list of activities

    Parameters
    -------------
    log
        Log
    list_activities
        List of activities interesting for the performance spectrum (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including:
            - Parameters.ACTIVITY_KEY
            - Parameters.TIMESTAMP_KEY

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, xes.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               xes.DEFAULT_TIMESTAMP_KEY)
    sort_log_required = exec_utils.get_param_value(
        Parameters.SORT_LOG_REQUIRED, parameters, True)

    parameters[Parameters.ATTRIBUTE_KEY] = activity_key
    log = basic_filter.filter_log_events_attr(log,
                                              list_activities,
                                              parameters=parameters)
    if sort_log_required:
        log = sorting.sort_timestamp_log(log, timestamp_key=timestamp_key)

    points = []

    for trace in log:
        for i in range(len(trace) - len(list_activities) + 1):
            acti_comb = [
                event[activity_key]
                for event in trace[i:i + len(list_activities)]
            ]

            if acti_comb == list_activities:
                timest_comb = [
                    event[timestamp_key].timestamp()
                    for event in trace[i:i + len(list_activities)]
                ]

                points.append(timest_comb)

    points = sorted(points, key=lambda x: x[0])

    if len(points) > sample_size:
        points = points_subset.pick_chosen_points_list(sample_size, points)

    return points