Example #1
def apply(
    log: Union[EventLog, EventStream, pd.DataFrame],
    parameters: Optional[Dict[Union[str, Parameters], Any]] = None
) -> Tuple[Dict[Tuple[str, str], int], Dict[Tuple[str, str], float]]:
    """
    Applies the correlation miner (splits the log in smaller chunks)

    Parameters
    ---------------
    log
        Log object
    parameters
        Parameters of the algorithm

    Returns
    ---------------
    dfg
        Frequency DFG
    performance_dfg
        Performance DFG
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    sample_size = exec_utils.get_param_value(Parameters.SAMPLE_SIZE,
                                             parameters, 100000)

    PS_matrixes = []
    duration_matrixes = []

    if type(log) is pd.DataFrame:
        # keep only the columns needed by the correlation miner before conversion
        log = log[list(set([activity_key, timestamp_key,
                            start_timestamp_key]))]
        log = log.sort_values([timestamp_key, start_timestamp_key])
        activities_counter = dict(log[activity_key].value_counts())
        activities = sorted(list(activities_counter.keys()))
    else:
        log = converter.apply(log,
                              variant=converter.Variants.TO_EVENT_STREAM,
                              parameters={
                                  "deepcopy": False,
                                  "include_case_attributes": False
                              })
        activities_counter = Counter(x[activity_key] for x in log)
        activities = sorted(list(activities_counter.keys()))

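    # walk through the log in chunks of at most sample_size events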
    prev = 0
    while prev < len(log):
        sample = log[prev:min(len(log), prev + sample_size)]
        transf_stream, activities_grouped, activities = classic.preprocess_log(
            sample, activities=activities, parameters=parameters)
        PS_matrix, duration_matrix = classic.get_PS_dur_matrix(
            activities_grouped, activities, parameters=parameters)
        PS_matrixes.append(PS_matrix)
        duration_matrixes.append(duration_matrix)

        prev = prev + sample_size

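    # aggregate over the chunks: average the PS matrices, take the element-wise
    # maximum of the duration matrices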
    PS_matrix = np.zeros((len(activities), len(activities)))
    duration_matrix = np.zeros((len(activities), len(activities)))
    for z in range(len(PS_matrixes)):
        PS_matrix = PS_matrix + PS_matrixes[z]
        duration_matrix = np.maximum(duration_matrix, duration_matrixes[z])
    PS_matrix = PS_matrix / float(len(PS_matrixes))

    return classic.resolve_lp_get_dfg(PS_matrix, duration_matrix, activities,
                                      activities_counter)
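A minimal usage sketch for this function. It assumes the code above is the chunk-splitting variant of pm4py's correlation miner, exposed as pm4py.algo.discovery.correlation_mining.variants.classic_split (the module path is an assumption); the DataFrame, its XES-style column names and the tiny SAMPLE_SIZE are purely illustrative.

import pandas as pd
from pm4py.algo.discovery.correlation_mining.variants import classic_split  # assumed module path

# hypothetical event table with the default XES column names
df = pd.DataFrame({
    "concept:name": ["register", "check", "pay", "register", "check", "pay"],
    "time:timestamp": pd.to_datetime([
        "2023-01-01 08:00", "2023-01-01 09:00", "2023-01-01 10:00",
        "2023-01-02 08:00", "2023-01-02 09:30", "2023-01-02 11:00"]),
})

# force tiny chunks so the splitting logic is actually exercised
dfg, performance_dfg = classic_split.apply(
    df, parameters={classic_split.Parameters.SAMPLE_SIZE: 3})

print(dfg)              # frequency DFG: {(act_a, act_b): frequency}
print(performance_dfg)  # performance DFG: {(act_a, act_b): estimated duration}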
Example #2
def correlation_miner(path, log_name, managed_logs, parameters=None):
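    """
    Applies the correlation miner to the parquet partitions of a managed log
    and returns the aggregated PS and duration matrices as JSON strings.
    """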
    if parameters is None:
        parameters = {}

    activities = parameters.get("activities", None)
    complete_timestamp = parameters.get("complete_timestamp", DEFAULT_TIMESTAMP_KEY)
    start_timestamp = parameters.get("start_timestamp", DEFAULT_TIMESTAMP_KEY)

    from pm4py.algo.discovery.correlation_mining.variants import classic

    no_samples = parameters.get(PARAMETER_NO_SAMPLES, DEFAULT_MAX_NO_SAMPLES)
    use_transition = parameters.get(PARAMETER_USE_TRANSITION, DEFAULT_USE_TRANSITION)
    activity_key = DEFAULT_NAME_KEY if not use_transition else PARAMETER_PM4PYWS_CLASSIFIER
    filters = parameters.get(FILTERS, [])
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    parameters[classic.Parameters.ACTIVITY_KEY] = activity_key
    parameters[classic.Parameters.TIMESTAMP_KEY] = complete_timestamp
    parameters[classic.Parameters.START_TIMESTAMP_KEY] = start_timestamp

    folder = os.path.join(path, log_name)
    columns = get_columns_to_import(filters, [activity_key, complete_timestamp, start_timestamp], use_transition=use_transition)

    parquet_list = parquet_importer.get_list_parquet(folder)

    PS_matrixes = []
    duration_matrixes = []

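    # compute one PS/duration matrix per managed parquet partition (at most no_samples partitions)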
    count = 0
    for pq in parquet_list:
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1

            df = get_filtered_parquet(pq, columns, filters, use_transition=use_transition, parameters=parameters)

            transf_stream, activities_grouped, activities = classic.preprocess_log(df, activities=activities,
                                                                                   parameters=parameters)

            PS_matrix, duration_matrix = classic.get_PS_dur_matrix(activities_grouped, activities,
                                                                   parameters=parameters)

            PS_matrixes.append(PS_matrix)
            duration_matrixes.append(duration_matrix)

            if count >= no_samples:
                break

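    # aggregate over the partitions: average the PS matrices, take the element-wise
    # maximum of the duration matrices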
    PS_matrix = np.zeros((len(activities), len(activities)))
    duration_matrix = np.zeros((len(activities), len(activities)))

    for z in range(len(PS_matrixes)):
        PS_matrix = PS_matrix + PS_matrixes[z]
        duration_matrix = np.maximum(duration_matrix, duration_matrixes[z])
    PS_matrix = PS_matrix / float(len(PS_matrixes))

    PS_matrix = PS_matrix.tolist()
    duration_matrix = duration_matrix.tolist()

    return {"PS_matrix": json.dumps(PS_matrix), "duration_matrix": json.dumps(duration_matrix)}
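Since this service function returns the matrices serialized as JSON strings, a caller can turn them back into numpy arrays. A small sketch; the hard-coded payload below is illustrative and stands in for the dictionary returned by correlation_miner.

import json
import numpy as np

# result = correlation_miner(path, log_name, managed_logs, parameters)
result = {"PS_matrix": "[[0.0, 1.0], [0.0, 0.0]]",
          "duration_matrix": "[[0.0, 3600.0], [0.0, 0.0]]"}

PS_matrix = np.array(json.loads(result["PS_matrix"]))
duration_matrix = np.array(json.loads(result["duration_matrix"]))
print(PS_matrix.shape, duration_matrix.shape)  # (2, 2) (2, 2)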