def get_case_duration(self, session, process, use_transition, no_samples, max_ret_items=100000):
    all_slaves = list(self.slaves.keys())
    threads = []
    points = []
    # fan out one asynchronous request per slave
    for slave in all_slaves:
        slave_host = self.slaves[slave][1]
        slave_port = str(self.slaves[slave][2])
        m = CaseDurationRequest(session, slave_host, slave_port, use_transition, no_samples, process)
        m.max_ret_items = max_ret_items
        m.start()
        threads.append(m)
    # collect the responses, then merge and subsample
    for thread in threads:
        thread.join()
        points = points + thread.content["points"]
    points = sorted(points)
    if len(points) > max_ret_items:
        points = points_subset.pick_chosen_points_list(max_ret_items, points)
    return points

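For reference, a minimal standalone sketch of the merge-and-subsample step that closes this getter (the same pattern recurs in the events-per-time getter below); the point lists stand in for hypothetical slave responses, and points_subset is assumed importable from pm4py.util as in the code above.

from pm4py.util import points_subset  # assumed location of the helper used above

# hypothetical per-slave responses (thread.content["points"] in the real code)
points_from_slave_a = [3.0, 1.0, 5.0]
points_from_slave_b = [2.0, 4.0, 6.0]
max_ret_items = 4

merged = sorted(points_from_slave_a + points_from_slave_b)
if len(merged) > max_ret_items:
    # evenly spaced subsample, as in the getter above
    merged = points_subset.pick_chosen_points_list(max_ret_items, merged)
print(merged)
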
# imports needed by this snippet; pick_chosen_points_list as in pm4py.util.points_subset
import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde
from pm4py.util.points_subset import pick_chosen_points_list


def get_kde_date_attribute(values, parameters=None):
    """
    Gets the KDE estimation for the distribution of the values of a date attribute

    Parameters
    -------------
    values
        Values of the date attribute
    parameters
        Possible parameters of the algorithm, including:
            graph_points -> number of points to include in the graph
            points_to_sample -> number of values to sample before estimating the density

    Returns
    --------------
    x
        X-axis values to represent
    y
        Y-axis values to represent
    """
    if parameters is None:
        parameters = {}

    graph_points = parameters["graph_points"] if "graph_points" in parameters else 200
    points_to_sample = parameters["points_to_sample"] if "points_to_sample" in parameters else 400
    red_values = pick_chosen_points_list(points_to_sample, values)
    int_values = sorted([x.replace(tzinfo=None).timestamp() for x in red_values])
    density = gaussian_kde(int_values)
    xs = np.linspace(min(int_values), max(int_values), graph_points)
    xs_transf = pd.to_datetime(xs * 10 ** 9)
    return [xs_transf, density(xs)]

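A usage sketch under the assumption that the imports above resolve; the timestamps are synthetic.

import datetime

values = [datetime.datetime(2021, 1, 1) + datetime.timedelta(days=i) for i in range(50)]
x, y = get_kde_date_attribute(values, parameters={"graph_points": 100})
# x: pandas DatetimeIndex with 100 graph points, y: estimated density at those points
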
def sample_dataframe(df, parameters=None):
    """
    Samples a dataframe on a given number of cases

    Parameters
    --------------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.CASE_ID_KEY
        - Parameters.MAX_NO_CASES

    Returns
    -------------
    sampled_df
        Sampled dataframe
    """
    if parameters is None:
        parameters = {}

    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    max_no_cases = exec_utils.get_param_value(Parameters.MAX_NO_CASES, parameters, 100)

    case_ids = list(df[case_id_key].unique())
    case_id_to_retain = points_subset.pick_chosen_points_list(min(max_no_cases, len(case_ids)), case_ids)

    return df[df[case_id_key].isin(case_id_to_retain)]

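A usage sketch on a toy dataframe with the default pm4py column names; Parameters.MAX_NO_CASES is assumed to be the enum member read through exec_utils above.

import pandas as pd

df = pd.DataFrame({
    "case:concept:name": ["c1", "c1", "c2", "c2", "c3"],
    "concept:name": ["A", "B", "A", "B", "A"],
})
# retains all events of (at most) two evenly spaced cases
sampled = sample_dataframe(df, parameters={Parameters.MAX_NO_CASES: 2})
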
def get_events_per_time(self, session, process, use_transition, no_samples, max_ret_items=100000,
                        timestamp_key=xes.DEFAULT_TIMESTAMP_KEY):
    all_slaves = list(self.slaves.keys())
    threads = []
    points = []
    for slave in all_slaves:
        slave_host = self.slaves[slave][1]
        slave_port = str(self.slaves[slave][2])
        m = EventsPerTimeRequest(session, slave_host, slave_port, use_transition, no_samples, process)
        m.max_ret_items = max_ret_items
        m.timestamp_key = timestamp_key
        m.start()
        threads.append(m)
    for thread in threads:
        thread.join()
        points = points + thread.content["points"]
    points = sorted(points)
    if len(points) > max_ret_items:
        points = points_subset.pick_chosen_points_list(max_ret_items, points)
    return points

def apply(log, list_activities, sample_size, parameters=None):
    """
    Finds the performance spectrum provided a log and a list of activities

    Parameters
    -------------
    log
        Log
    list_activities
        List of activities interesting for the performance spectrum (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including the activity key and the timestamp key

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    if parameters is None:
        parameters = {}

    activity_key = parameters[
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    timestamp_key = parameters[
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else xes.DEFAULT_TIMESTAMP_KEY

    log = sorting.sort_timestamp_log(log, timestamp_key=timestamp_key)
    parameters[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    log = basic_filter.filter_log_events_attr(log, list_activities, parameters=parameters)

    points = []
    for trace in log:
        # slide a window of len(list_activities) over the trace and keep exact matches
        for i in range(len(trace) - len(list_activities) + 1):
            acti_comb = [event[activity_key] for event in trace[i:i + len(list_activities)]]
            if acti_comb == list_activities:
                timest_comb = [event[timestamp_key].timestamp() for event in trace[i:i + len(list_activities)]]
                points.append(timest_comb)

    points = sorted(points, key=lambda x: x[0])
    if len(points) > sample_size:
        points = points_subset.pick_chosen_points_list(sample_size, points)

    return points

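A hedged usage sketch; the EventLog construction below assumes the pm4py object model, and the import path may differ across pm4py versions.

import datetime
from pm4py.objects.log.obj import EventLog, Trace, Event  # path varies across pm4py versions

trace = Trace()
for j, act in enumerate(["register request", "decide"]):
    trace.append(Event({"concept:name": act,
                        "time:timestamp": datetime.datetime(2021, 1, 1, 12, j)}))
log = EventLog([trace])

points = apply(log, ["register request", "decide"], sample_size=1000, parameters={})
# one point per occurrence of the pattern: [[t_register, t_decide]] as POSIX timestamps
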
def get_numeric_attribute_values(path, log_name, managed_logs, parameters=None):
    if parameters is None:
        parameters = {}

    no_samples = parameters[PARAMETER_NO_SAMPLES] if PARAMETER_NO_SAMPLES in parameters else DEFAULT_MAX_NO_SAMPLES
    use_transition = parameters[
        PARAMETER_USE_TRANSITION] if PARAMETER_USE_TRANSITION in parameters else DEFAULT_USE_TRANSITION
    activity_key = DEFAULT_NAME_KEY if not use_transition else "@@classifier"
    filters = parameters[FILTERS] if FILTERS in parameters else []
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    max_no_of_points_to_sample = parameters[
        "max_no_of_points_to_sample"] if "max_no_of_points_to_sample" in parameters else 100000
    attribute_key = parameters["attribute_key"]

    folder = os.path.join(path, log_name)
    columns = get_columns_to_import(filters, [attribute_key], use_transition=use_transition)

    parquet_list = parquet_importer.get_list_parquet(folder)
    overall_list = []
    count = 0
    for index, pq in enumerate(parquet_list):
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1
            df = get_filtered_parquet(pq, columns, filters, use_transition=use_transition, parameters=parameters)
            df = df.dropna()
            if len(df) > max_no_of_points_to_sample:
                df = df.sample(n=max_no_of_points_to_sample)
            values = list(df[attribute_key])
            overall_list = overall_list + values
            if count >= no_samples:
                break

    overall_list = sorted(overall_list)
    if len(overall_list) > max_no_of_points_to_sample:
        overall_list = points_subset.pick_chosen_points_list(max_no_of_points_to_sample, overall_list)

    return overall_list

def get_events_per_time_first(path, log_name, managed_logs, parameters=None):
    if parameters is None:
        parameters = {}

    no_samples = parameters[PARAMETER_NO_SAMPLES] if PARAMETER_NO_SAMPLES in parameters else DEFAULT_MAX_NO_SAMPLES
    use_transition = parameters[
        PARAMETER_USE_TRANSITION] if PARAMETER_USE_TRANSITION in parameters else DEFAULT_USE_TRANSITION
    activity_key = DEFAULT_NAME_KEY if not use_transition else PARAMETER_PM4PYWS_CLASSIFIER
    filters = parameters[FILTERS] if FILTERS in parameters else []
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    max_no_of_points_to_sample = parameters[
        "max_no_of_points_to_sample"] if "max_no_of_points_to_sample" in parameters else 100000

    folder = os.path.join(path, log_name)
    columns = get_columns_to_import(filters, [CASE_CONCEPT_NAME, DEFAULT_TIMESTAMP_KEY],
                                    use_transition=use_transition)

    parquet_list = parquet_importer.get_list_parquet(folder)
    overall_list = []
    count = 0
    for index, pq in enumerate(parquet_list):
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1
            df = get_filtered_parquet(pq, columns, filters, use_transition=use_transition, parameters=parameters)
            df = df.groupby(CASE_CONCEPT_NAME).first()
            if len(df) > max_no_of_points_to_sample:
                df = df.sample(n=max_no_of_points_to_sample)
            date_values = [x.timestamp() for x in list(df[DEFAULT_TIMESTAMP_KEY])]
            overall_list = overall_list + date_values
            if count >= no_samples:
                break

    overall_list = sorted(overall_list)
    if len(overall_list) > max_no_of_points_to_sample:
        overall_list = points_subset.pick_chosen_points_list(max_no_of_points_to_sample, overall_list)

    return overall_list

def get_case_duration(path, log_name, managed_logs, parameters=None):
    if parameters is None:
        parameters = {}

    no_samples = parameters[PARAMETER_NO_SAMPLES] if PARAMETER_NO_SAMPLES in parameters else DEFAULT_MAX_NO_SAMPLES
    use_transition = parameters[
        PARAMETER_USE_TRANSITION] if PARAMETER_USE_TRANSITION in parameters else DEFAULT_USE_TRANSITION
    activity_key = DEFAULT_NAME_KEY if not use_transition else "@@classifier"
    filters = parameters[FILTERS] if FILTERS in parameters else []
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    max_no_of_points_to_sample = parameters[
        "max_no_of_points_to_sample"] if "max_no_of_points_to_sample" in parameters else 100000

    folder = os.path.join(path, log_name)
    columns = get_columns_to_import(filters, [CASE_CONCEPT_NAME, DEFAULT_TIMESTAMP_KEY],
                                    use_transition=use_transition)

    parquet_list = parquet_importer.get_list_parquet(folder)
    overall_list = []
    count = 0
    for index, pq in enumerate(parquet_list):
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1
            df = get_filtered_parquet(pq, columns, filters, use_transition=use_transition, parameters=parameters)
            cases = case_statistics.get_cases_description(df, parameters=parameters)
            duration_values = [x["caseDuration"] for x in cases.values()]
            overall_list = overall_list + duration_values
            if count >= no_samples:
                break

    overall_list = sorted(overall_list)
    if len(overall_list) > max_no_of_points_to_sample:
        overall_list = points_subset.pick_chosen_points_list(max_no_of_points_to_sample, overall_list)

    return overall_list

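The three parquet readers above share one two-stage sampling idea: cap the number of values taken per chunk, then take an evenly spaced subsample of the sorted concatenation. A standalone sketch with plain lists standing in for the per-parquet dataframes:

from pm4py.util import points_subset  # assumed location of the helper used above

max_points = 5
chunks = [[9.0, 1.0, 5.0, 2.0], [7.0, 3.0, 8.0], [4.0, 6.0]]  # hypothetical per-parquet values

overall = []
for chunk in chunks:
    overall += chunk[:max_points]  # per-chunk cap (df.sample(n=...) in the real code)

overall = sorted(overall)
if len(overall) > max_points:
    overall = points_subset.pick_chosen_points_list(max_points, overall)
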
def get_kde_date_attribute(values, parameters=None):
    """
    Gets the KDE estimation for the distribution of the values of a date attribute

    Parameters
    -------------
    values
        Values of the date attribute
    parameters
        Possible parameters of the algorithm, including:
            Parameters.GRAPH_POINTS -> number of points to include in the graph

    Returns
    --------------
    x
        X-axis values to represent
    y
        Y-axis values to represent
    """
    if pkgutil.find_loader("scipy") and pkgutil.find_loader("numpy") and pkgutil.find_loader("pandas"):
        from scipy.stats import gaussian_kde
        import numpy as np
        import pandas as pd

        if parameters is None:
            parameters = {}

        graph_points = exec_utils.get_param_value(Parameters.GRAPH_POINTS, parameters, 200)
        points_to_sample = exec_utils.get_param_value(Parameters.POINT_TO_SAMPLE, parameters, 400)
        red_values = pick_chosen_points_list(points_to_sample, values)
        int_values = sorted([x.replace(tzinfo=None).timestamp() for x in red_values])
        density = gaussian_kde(int_values)
        xs = np.linspace(min(int_values), max(int_values), graph_points)
        xs_transf = pd.to_datetime(xs * 10 ** 9)
        return [xs_transf, density(xs)]
    else:
        msg = "scipy, numpy and/or pandas are not available. graphs cannot be built!"
        logging.error(msg)
        raise Exception(msg)

def apply(dataframe: pd.DataFrame, list_activities: List[str], sample_size: int,
          parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> List[Dict[str, Any]]:
    """
    Finds the disconnected performance spectrum provided a dataframe and a list of activities

    Parameters
    -------------
    dataframe
        Dataframe
    list_activities
        List of activities interesting for the performance spectrum (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY
        - Parameters.TIMESTAMP_KEY
        - Parameters.CASE_ID_KEY

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes.DEFAULT_TIMESTAMP_KEY)
    sort_log_required = exec_utils.get_param_value(Parameters.SORT_LOG_REQUIRED, parameters, True)

    dataframe = dataframe[[case_id_glue, activity_key, timestamp_key]]
    dataframe = dataframe[dataframe[activity_key].isin(list_activities)]
    dataframe = pandas_utils.insert_index(dataframe, constants.DEFAULT_EVENT_INDEX_KEY)
    if sort_log_required:
        dataframe = dataframe.sort_values([case_id_glue, timestamp_key, constants.DEFAULT_EVENT_INDEX_KEY])
    dataframe[timestamp_key] = dataframe[timestamp_key].astype(np.int64) / 10 ** 9

    # patterns of decreasing length: the full activity list first, then its shorter prefixes/suffixes
    all_patterns = [(len(list_activities) - i, gen_patterns(list_activities, len(list_activities) - i))
                    for i in range(len(list_activities) - 1)]

    def key(k, n):
        return k + str(n)

    def to_points(match, l):
        return {'case_id': match[key(case_id_glue, 0)],
                'points': [(match[key(activity_key, i)], match[key(timestamp_key, i)]) for i in range(l)]}

    points = []
    for l, patterns in all_patterns:
        # concat shifted and suffixed dataframes to get a dataframe that allows to check for the patterns
        dfs = [dataframe.add_suffix(str(i)).shift(-i) for i in range(l)]
        df_merged = pd.concat(dfs, axis=1)
        indices = [shift_index(dfs[i].index, i) for i in range(len(dfs))]
        mindex = pd.MultiIndex.from_arrays(indices)
        df_merged = df_merged.set_index(mindex)

        # keep only rows where all l events belong to the same case
        for i in range(l - 1):
            df_merged = df_merged[df_merged[key(case_id_glue, i)] == df_merged[key(case_id_glue, i + 1)]]

        column_list = [key(activity_key, i) for i in range(l)]
        matches = df_merged[np.isin(df_merged[column_list].sum(axis=1), patterns)]
        points.extend([to_points(m, l) for m in matches.to_dict('records')])
        # drop rows of this match to not discover subsets of this match again
        dataframe = dataframe.drop([int(i) for indices in matches.index for i in indices[:-1]])

    points = sorted(points, key=lambda x: min(x['points'], key=lambda x: x[1])[1])

    if len(points) > sample_size:
        points = points_subset.pick_chosen_points_list(sample_size, points)

    return points

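A usage sketch on a toy dataframe with the default pm4py column names (illustrative data only; assumes the helpers gen_patterns, shift_index and pandas_utils from the surrounding module are available):

import pandas as pd

df = pd.DataFrame({
    "case:concept:name": ["c1"] * 3 + ["c2"] * 2,
    "concept:name": ["A", "B", "C", "A", "B"],
    "time:timestamp": pd.to_datetime(
        ["2021-01-01 10:00", "2021-01-01 11:00", "2021-01-01 12:00",
         "2021-01-02 10:00", "2021-01-02 11:00"]),
})

points = apply(df, ["A", "B", "C"], sample_size=100)
# entries look like {'case_id': 'c1', 'points': [('A', ts), ('B', ts), ('C', ts)]};
# the incomplete A -> B segment of case c2 is reported as a disconnected match
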
def __init__(self, trace: Trace, sync_net: PetriNet, sync_im: Marking, sync_fm: Marking,
             parameters: Optional[Dict[Any, Any]] = None):
    """
    Constructor

    Parameters
    ---------------
    trace
        Trace
    sync_net
        Synchronous product net
    sync_im
        Initial marking
    sync_fm
        Final marking
    parameters
        Parameters of the algorithm, including:
        - Parameters.CASE_ID_KEY => attribute to use as case identifier
        - Parameters.ACTIVITY_KEY => attribute to use as activity
        - Parameters.COSTS => (if provided) the cost function (otherwise the default cost function is applied)
        - Parameters.SPLIT_IDX => (if provided) the split points as indices of elements of the trace
          (e.g. for ["A", "B", "C", "D", "E"], specifying [1, 3] as split points means splitting at "B" and "D").
          If not provided, some split points at uniform distances are found.
        - Parameters.MAX_K_VALUE => the maximum number of split points that is allowed (trims the specified
          indices if necessary)
        - Parameters.INCIDENCE_MATRIX => (if provided) the incidence matrix associated to the sync product net
        - Parameters.A => (if provided) the A numpy matrix of the incidence matrix
        - Parameters.CONSUMPTION_MATRIX => (if provided) the consumption matrix associated to the sync product net
        - Parameters.C => (if provided) the C numpy matrix of the consumption matrix
        - Parameters.FULL_BOOTSTRAP_REQUIRED => whether the preset/postset of places/transitions need to be inserted
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    max_k_value = exec_utils.get_param_value(Parameters.MAX_K_VALUE, parameters, 5)
    costs = exec_utils.get_param_value(Parameters.COSTS, parameters, None)
    split_idx = exec_utils.get_param_value(Parameters.SPLIT_IDX, parameters, None)
    self.full_bootstrap_required = exec_utils.get_param_value(Parameters.FULL_BOOTSTRAP_REQUIRED, parameters, True)

    self.trace = [x[activity_key] for x in trace]
    if costs is None:
        costs = align_utils.construct_standard_cost_function(sync_net, align_utils.SKIP)
    if split_idx is None:
        split_idx = [i for i in range(1, len(trace))]
    self.split_idx = split_idx
    if len(self.split_idx) > max_k_value:
        self.split_idx = points_subset.pick_chosen_points_list(max_k_value, self.split_idx)
    self.k = len(self.split_idx) if len(self.split_idx) > 1 else 2

    self.sync_net = sync_net
    self.ini = sync_im
    self.fin = sync_fm
    self.costs = costs

    self.incidence_matrix = exec_utils.get_param_value(Parameters.INCIDENCE_MATRIX, parameters,
                                                       IncidenceMatrix(self.sync_net))
    self.consumption_matrix = exec_utils.get_param_value(Parameters.CONSUMPTION_MATRIX, parameters,
                                                         ConsumptionMatrix(self.sync_net))
    self.A = exec_utils.get_param_value(Parameters.A, parameters, np.asmatrix(self.incidence_matrix.a_matrix))
    self.C = exec_utils.get_param_value(Parameters.C, parameters, np.asmatrix(self.consumption_matrix.c_matrix))

    self.__build_entities()

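To make the MAX_K_VALUE trimming above concrete, a small sketch of how the candidate split points collapse to evenly spaced ones (points_subset assumed importable from pm4py.util, as used above):

from pm4py.util import points_subset

split_idx = list(range(1, 20))  # one candidate split per position of a 20-event trace
max_k_value = 5
trimmed = points_subset.pick_chosen_points_list(max_k_value, split_idx)
# at most max_k_value evenly spaced split indices survive, e.g. [1, 4, 8, 12, 16]
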
def apply(log: EventLog, list_activities: List[str], sample_size: int,
          parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> List[Dict[str, Any]]:
    """
    Finds the disconnected performance spectrum provided a log and a list of activities

    Parameters
    -------------
    log
        Log
    list_activities
        List of activities interesting for the performance spectrum (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY
        - Parameters.TIMESTAMP_KEY
        - Parameters.CASE_ID_KEY

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    if parameters is None:
        parameters = {}

    sort_log_required = exec_utils.get_param_value(Parameters.SORT_LOG_REQUIRED, parameters, True)

    # all contiguous sub-sequences (length >= 2) of the activity list, and the
    # directly-following pairs used to seed the matching
    all_acti_combs = set(tuple(list_activities[j:j + i]) for i in range(2, len(list_activities) + 1)
                         for j in range(0, len(list_activities) - i + 1))
    two_acti_combs = set((list_activities[i], list_activities[i + 1]) for i in range(len(list_activities) - 1))

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes.DEFAULT_TIMESTAMP_KEY)
    case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, xes.DEFAULT_TRACEID_KEY)

    parameters[Parameters.ATTRIBUTE_KEY] = activity_key
    log = basic_filter.filter_log_events_attr(log, list_activities, parameters=parameters)
    if sort_log_required:
        log = sorting.sort_timestamp_log(log, timestamp_key=timestamp_key)

    points = []
    for trace in log:
        matches = [(i, i + 1) for i in range(len(trace) - 1)
                   if (trace[i][activity_key], trace[i + 1][activity_key]) in two_acti_combs]

        i = 0
        while i < len(matches) - 1:
            # materialize the candidate activity sequence as a tuple so that the
            # membership test against all_acti_combs compares by value
            matchAct = tuple(trace[mi][activity_key] for mi in (matches[i] + matches[i + 1][1:]))
            if matches[i][-1] == matches[i + 1][0] and matchAct in all_acti_combs:
                # merge overlapping matches into a longer one and restart the scan
                matches[i] = matches[i] + matches[i + 1][1:]
                del matches[i + 1]
                i = 0
            else:
                i += 1

        if matches:
            matches = set(matches)
            timest_comb = [{'points': [(trace[i][activity_key], trace[i][timestamp_key].timestamp()) for i in match]}
                           for match in matches]
            for p in timest_comb:
                p['case_id'] = trace.attributes[case_id_key]
            points += timest_comb

    points = sorted(points, key=lambda x: min(x['points'], key=lambda x: x[1])[1])

    if len(points) > sample_size:
        points = points_subset.pick_chosen_points_list(sample_size, points)

    return points

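A usage sketch showing the disconnected semantics: even when the pattern cannot complete, the partial segment is still reported (the EventLog import path may differ across pm4py versions).

import datetime
from pm4py.objects.log.obj import EventLog, Trace, Event  # path varies across pm4py versions

trace = Trace(attributes={"concept:name": "c1"})
for j, act in enumerate(["A", "B"]):  # "C" never occurs in this case
    trace.append(Event({"concept:name": act,
                        "time:timestamp": datetime.datetime(2021, 1, 1, 9 + j)}))
log = EventLog([trace])

points = apply(log, ["A", "B", "C"], sample_size=100)
# the partial A -> B segment is still reported:
# [{'points': [('A', <ts>), ('B', <ts>)], 'case_id': 'c1'}]
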
def apply(log: EventLog, list_activities: List[str], sample_size: int,
          parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> List[List[float]]:
    """
    Finds the performance spectrum provided a log and a list of activities

    Parameters
    -------------
    log
        Log
    list_activities
        List of activities interesting for the performance spectrum (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY
        - Parameters.TIMESTAMP_KEY

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes.DEFAULT_TIMESTAMP_KEY)
    sort_log_required = exec_utils.get_param_value(Parameters.SORT_LOG_REQUIRED, parameters, True)

    parameters[Parameters.ATTRIBUTE_KEY] = activity_key
    log = basic_filter.filter_log_events_attr(log, list_activities, parameters=parameters)
    if sort_log_required:
        log = sorting.sort_timestamp_log(log, timestamp_key=timestamp_key)

    points = []
    for trace in log:
        # slide a window of len(list_activities) over the trace and keep exact matches
        for i in range(len(trace) - len(list_activities) + 1):
            acti_comb = [event[activity_key] for event in trace[i:i + len(list_activities)]]
            if acti_comb == list_activities:
                timest_comb = [event[timestamp_key].timestamp() for event in trace[i:i + len(list_activities)]]
                points.append(timest_comb)

    points = sorted(points, key=lambda x: x[0])

    if len(points) > sample_size:
        points = points_subset.pick_chosen_points_list(sample_size, points)

    return points