Example #1
def filter_start_activities(log, activities, retain=True):
    """
    Filter cases having a start activity in the provided list

    Parameters
    --------------
    log
        Log object
    activities
        List of start activities
    retain
        If True, retain the traces whose start activity is in the given list; if False, drop them

    Returns
    --------------
    filtered_log
        Filtered log object
    """
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.start_activities import start_activities_filter
        return start_activities_filter.apply(
            log,
            activities,
            parameters={start_activities_filter.Parameters.POSITIVE: retain})
    else:
        from pm4py.algo.filtering.log.start_activities import start_activities_filter
        return start_activities_filter.apply(
            log,
            activities,
            parameters={start_activities_filter.Parameters.POSITIVE: retain})
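
A minimal usage sketch for the function above (the file name and activity label are hypothetical):

import pm4py

log = pm4py.read_xes("running-example.xes")  # hypothetical XES file
# keep only the cases that start with "register request"
filtered_log = pm4py.filter_start_activities(log, {"register request"}, retain=True)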
Example #2
def filter_start_activities(log: Union[EventLog, pd.DataFrame], activities: Union[Set[str], List[str]], retain: bool = True) -> \
Union[EventLog, pd.DataFrame]:
    """
    Filter cases having a start activity in the provided list

    Parameters
    --------------
    log
        Log object
    activities
        List of start activities
    retain
        If True, retain the traces whose start activity is in the given list; if False, drop them

    Returns
    --------------
    filtered_log
        Filtered log object
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception(
            "the method can be applied only to a traditional event log!")

    parameters = get_properties(log)
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.start_activities import start_activities_filter
        parameters[start_activities_filter.Parameters.POSITIVE] = retain
        return start_activities_filter.apply(log,
                                             activities,
                                             parameters=parameters)
    else:
        from pm4py.algo.filtering.log.start_activities import start_activities_filter
        parameters[start_activities_filter.Parameters.POSITIVE] = retain
        return start_activities_filter.apply(log,
                                             activities,
                                             parameters=parameters)
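
The same function also accepts a pandas DataFrame carrying the standard pm4py columns; a small sketch, with made-up data, that drops (retain=False) every case starting with activity "A":

import pandas as pd
import pm4py

df = pd.DataFrame({
    "case:concept:name": ["1", "1", "2", "2"],
    "concept:name": ["A", "B", "B", "C"],
    "time:timestamp": pd.to_datetime(
        ["2021-01-01", "2021-01-02", "2021-01-01", "2021-01-03"], utc=True),
})
# case "1" starts with "A" and is dropped; case "2" is kept
df_filtered = pm4py.filter_start_activities(df, {"A"}, retain=False)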
Example #3
def filter_start_activities(log, admitted_start_activities):
    """
    Filter cases having a start activity in the provided list

    Parameters
    --------------
    log
        Log object
    admitted_start_activities
        List of admitted start activities

    Returns
    --------------
    filtered_log
        Filtered log object
    """
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.start_activities import start_activities_filter
        return start_activities_filter.apply(log, admitted_start_activities)
    else:
        from pm4py.algo.filtering.log.start_activities import start_activities_filter
        return start_activities_filter.apply(log, admitted_start_activities)
Example #4
def apply(dataframe, filter, parameters=None):
    """
    Apply a filter to the current log (start activities filter)

    Parameters
    ------------
    dataframe
        Pandas dataframe
    filter
        Filter to apply; its second element (filter[1]) holds the list of admitted start activities
    parameters
        Parameters of the algorithm

    Returns
    ------------
    dataframe
        Pandas dataframe
    """
    if parameters is None:
        parameters = {}

    return start_activities_filter.apply(dataframe,
                                         filter[1],
                                         parameters=parameters)
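
A sketch of how this wrapper might be invoked; the descriptor shape is inferred from the filter[1] access above, and the name in position 0 is hypothetical:

# hypothetical descriptor: (filter name, list of admitted start activities)
filtered_df = apply(dataframe, ("start_activities", ["Create Order", "Create Delivery"]), parameters={})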
Example #5
def execute_script():
    aa = time.time()
    dataframe = csv_import_adapter.import_dataframe_from_path_wo_timeconversion(
        inputLog, sep=',')
    dataframe = csv_import_adapter.convert_caseid_column_to_str(
        dataframe, case_id_glue=CASEID_GLUE)
    dataframe = csv_import_adapter.convert_timestamp_columns_in_df(
        dataframe, timest_format=TIMEST_FORMAT, timest_columns=TIMEST_COLUMNS)
    dataframe = dataframe.sort_values([CASEID_GLUE, TIMEST_KEY])
    dataframe_fa = attributes_filter.filter_df_keeping_spno_activities(
        dataframe,
        activity_key=ACTIVITY_KEY,
        max_no_activities=MAX_NO_ACTIVITIES)
    bb = time.time()
    print("importing log time=", (bb - aa))

    parameters_cde = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: TIMEST_KEY,
        "sort_by_column": "caseDuration",
        "sort_ascending": False,
        "max_ret_cases": 1000
    }
    cases_desc = case_statistics.get_cases_description(
        dataframe, parameters=parameters_cde)

    print(cases_desc)
    bb2 = time.time()
    print("calculating and printing cases_desc = ", (bb2 - bb))
    calculate_process_schema_from_df(dataframe_fa, "NOFILTERS_FREQUENCY.svg",
                                     "NOFILTERS_PERFORMANCE.svg")
    GENERATED_IMAGES.append("NOFILTERS_FREQUENCY.svg")
    GENERATED_IMAGES.append("NOFILTERS_PERFORMANCE.svg")
    if DELETE_VARIABLES:
        del dataframe_fa
    cc = time.time()
    print(
        "saving initial Inductive Miner process schema along with frequency metrics=",
        (cc - bb2))

    dataframe_cp = case_filter.filter_on_case_performance(
        dataframe,
        case_id_glue=CASEID_GLUE,
        timestamp_key=TIMEST_KEY,
        min_case_performance=100000,
        max_case_performance=10000000)
    dataframe_cp_fa = attributes_filter.filter_df_keeping_spno_activities(
        dataframe_cp,
        activity_key=ACTIVITY_KEY,
        max_no_activities=MAX_NO_ACTIVITIES)
    if DELETE_VARIABLES:
        del dataframe_cp
    calculate_process_schema_from_df(dataframe_cp_fa,
                                     "FILTER_CP_FREQUENCY.svg",
                                     "FILTER_CP_PERFORMANCE.svg")
    GENERATED_IMAGES.append("FILTER_CP_FREQUENCY.svg")
    GENERATED_IMAGES.append("FILTER_CP_PERFORMANCE.svg")
    if DELETE_VARIABLES:
        del dataframe_cp_fa
    dd = time.time()
    print("filtering on case performance and generating process schema=",
          (dd - cc))

    if ENABLE_ATTRIBUTE_FILTER:
        parameters_att = {
            constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
            constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: ATTRIBUTE_TO_FILTER,
            constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ATTRIBUTE_TO_FILTER,
            "positive": True
        }
        dataframe_att = attributes_filter.apply(dataframe,
                                                ATTRIBUTE_VALUES_TO_FILTER,
                                                parameters=parameters_att)
        # dataframe_att = attributes_filter.apply_auto_filter(dataframe, parameters=parameters_att)
        print(
            "all the activities in the log",
            attributes_filter.get_attribute_values(dataframe_att,
                                                   ACTIVITY_KEY))
        dataframe_att_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_att,
            activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_att
        calculate_process_schema_from_df(dataframe_att_fa,
                                         "FILTER_ATT_FREQUENCY.svg",
                                         "FILTER_ATT_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_ATT_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_ATT_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_att_fa
        ee = time.time()
        print("filtering on attribute values and generating process schema=",
              (ee - dd))

    ee = time.time()
    parameters_sa = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    parameters_ea = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    start_act = start_activities_filter.get_start_activities(
        dataframe, parameters=parameters_sa)
    print("start activities in the log = ", start_act)
    end_act = end_activities_filter.get_end_activities(
        dataframe, parameters=parameters_ea)
    print("end activities in the log = ", end_act)
    ff = time.time()
    print("finding start and end activities along with their count", (ff - ee))

    if ENABLE_STARTACT_FILTER:
        dataframe_sa = start_activities_filter.apply(dataframe,
                                                     STARTACT_TO_FILTER,
                                                     parameters=parameters_sa)
        # dataframe_sa = start_activities_filter.apply_auto_filter(dataframe, parameters=parameters_sa)
        start_act = start_activities_filter.get_start_activities(
            dataframe_sa, parameters=parameters_sa)
        print("start activities in the filtered log = ", start_act)
        dataframe_sa_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_sa,
            activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_sa
        calculate_process_schema_from_df(dataframe_sa_fa,
                                         "FILTER_SA_FREQUENCY.svg",
                                         "FILTER_SA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_SA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_SA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_sa_fa
    gg = time.time()
    if ENABLE_STARTACT_FILTER:
        print("filtering start activities time=", (gg - ff))

    if ENABLE_ENDACT_FILTER:
        dataframe_ea = end_activities_filter.apply(dataframe,
                                                   ENDACT_TO_FILTER,
                                                   parameters=parameters_ea)
        # dataframe_ea = end_activities_filter.apply_auto_filter(dataframe, parameters=parameters_ea)
        end_act = end_activities_filter.get_end_activities(
            dataframe_ea, parameters=parameters_ea)
        print("end activities in the filtered log = ", end_act)
        dataframe_ea_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_ea,
            activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_ea
        calculate_process_schema_from_df(dataframe_ea_fa,
                                         "FILTER_EA_FREQUENCY.svg",
                                         "FILTER_EA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_EA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_EA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_ea_fa
    hh = time.time()
    if ENABLE_ENDACT_FILTER:
        print("filtering end activities time=", (hh - gg))

    if REMOVE_GENERATED_IMAGES:
        for image in GENERATED_IMAGES:
            os.remove(image)
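
In the case-performance step above, min_case_performance and max_case_performance are plausibly interpreted in seconds, consistent with pm4py's case duration statistics; a quick translation of the script's bounds into days:

SECONDS_PER_DAY = 24 * 60 * 60
print(100000 / SECONDS_PER_DAY)    # lower bound used above, roughly 1.2 days
print(10000000 / SECONDS_PER_DAY)  # upper bound used above, roughly 115.7 days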
Example #6
                     "obj_parent": parents[el] if el in parents else "",
                     "obj_type": nodes[el]
                 }
                 fill_event(e)
                 trace.append(Event(e))
                 for s in G.neighbors(el):
                     curr_nodes.append(s)
                     parents[s] = el
             i = i + 1
         trace = insert_missing_events(trace)
         trace = sorted(trace, key=lambda x: x[Shared.timestamp_column])
         trace1 = Trace(trace)
         trace1.attributes["concept:name"] = o
         log.append(trace1)
 df = log_conv_factory.apply(log, variant=log_conv_factory.TO_DATAFRAME)
 df = start_activities_filter.apply(df, [target_type])
 unique_values = set(df[Shared.activity_column].unique())
 activities = {x: x for x in unique_values}
 activities["C"] = "Create Order"
 activities["J"] = "Create Delivery"
 activities["Q"] = "WMS Transfer Order"
 activities["R"] = "Goods Movement"
 activities["M"] = "Create Invoice"
 activities["L"] = "Create Debit Memo Request"
 activities["P"] = "Create Debit Memo"
 activities["U"] = "Create Pro Forma Invoice"
 activities["H"] = "Create Returns Document"
 activities.update(Shared.tcodes)
 df[Shared.activity_column] = df[Shared.activity_column].map(activities)
 df = df.dropna(subset=[Shared.activity_column])
 df = df[[x for x in df.columns if "named:" not in x]]
Example #7
tstct = tstct[tstct["SPRSL"] == "E"]
tstct = tstct[["TCODE", "TTEXT"]]
stream = tstct.to_dict("records")
for row in stream:
    activities[row["TCODE"]] = row["TTEXT"]
bkpf = pd.read_csv(os.path.join(dir, "bkpf_old.tsv"),
                   sep="\t",
                   dtype={
                       "BELNR": str,
                       "AWKEY": str,
                       "XBLNR": str,
                       "BUKRS": str
                   })
bkpf["time:timestamp"] = bkpf["CPUDT"] + " " + bkpf["CPUTM"]
bkpf["time:timestamp"] = pd.to_datetime(bkpf["time:timestamp"],
                                        format="%d.%m.%Y %H:%M:%S")
bkpf["case:concept:name"] = "C_" + bkpf["BELNR"]
bkpf["concept:name"] = bkpf["TCODE"].map(activities)
bkpf["org:resource"] = bkpf["USNAM"]
bkpf = bkpf.dropna(subset=["concept:name"])
bkpf = bkpf.dropna(subset=["org:resource"])
bkpf = bkpf.sort_values("time:timestamp")
bkpf = bkpf.reset_index()
bkpf = bkpf[[x for x in bkpf.columns if "named:" not in x]]
#print(start_activities_filter.get_start_activities(bkpf))
bkpf = start_activities_filter.apply(bkpf, ["Create Billing Document"])
#print(end_activities_filter.get_end_activities(bkpf))
bkpf = end_activities_filter.apply(bkpf, ["Post Document"])
parquet_exporter.apply(bkpf, "bkpf.parquet")
bkpf.to_csv("bkpf.csv", index=False)
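
A quick sanity check one could run after the export, re-reading the parquet written above (the expectation follows from the start-activities filter applied before exporting):

import pandas as pd

df = pd.read_parquet("bkpf.parquet")
# events were sorted by timestamp before export, so the first row per case is its start event
starts = df.groupby("case:concept:name", sort=False).first()["concept:name"].unique()
print(starts)  # expected to contain only "Create Billing Document"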