def test_importExportCSVtoCSV(self):
    """Round-trip a CSV log through stream/log conversion, export, and re-import."""
    # unittest requires instance methods; touch an attribute to avoid
    # static-method warnings
    self.dummy_variable = "dummy_value"
    frame = pd.read_csv(os.path.join(INPUT_DATA_DIR, "running-example.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    # dataframe -> event stream: sort, sample, index events
    stream = log_conversion.apply(frame, variant=log_conversion.TO_EVENT_STREAM)
    stream = sorting.sort_timestamp(stream)
    stream = sampling.sample(stream)
    stream = index_attribute.insert_event_index_as_event_attribute(stream)
    # event stream -> event log: sort, sample, index traces
    log = log_conversion.apply(stream)
    log = sorting.sort_timestamp(log)
    log = sampling.sample(log)
    log = index_attribute.insert_trace_index_as_event_attribute(log)
    # back down to a dataframe and export to CSV
    stream_back = log_conversion.apply(log, variant=log_conversion.TO_EVENT_STREAM)
    frame_back = log_conversion.apply(stream_back, variant=log_conversion.TO_DATA_FRAME)
    export_path = os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv")
    frame_back.to_csv(export_path)
    # re-import the exported CSV and compare the number of traces
    reimported = pd.read_csv(export_path)
    reimported = dataframe_utils.convert_timestamp_columns_in_df(reimported)
    stream_reimported = log_conversion.apply(reimported, variant=log_conversion.TO_EVENT_STREAM)
    log_reimported = log_conversion.apply(stream_reimported)
    self.assertEqual(len(log), len(log_reimported))
    os.remove(export_path)
def apply(file_path, return_obj_dataframe=False, parameters=None):
    """
    Imports an event log (and optionally the object tables) from a SQLite database.

    Parameters
    ----------------
    file_path
        Path to the SQLite database file
    return_obj_dataframe
        If True, also reads every object table listed in the ``object_types``
        table and returns the objects as a second dataframe
    parameters
        Parameters of the importer (currently unused)

    Returns
    ----------------
    df
        Events dataframe: ``case_``-prefixed columns lose the prefix, every
        other column gains an ``event_`` prefix
    obj_df
        Only when ``return_obj_dataframe`` is True: concatenation of all the
        object tables, columns prefixed with ``object_`` plus ``object_type``
    """
    if parameters is None:
        parameters = {}
    db = sqlite3.connect(file_path)
    df = pd.read_sql_query("SELECT * FROM event_log", db)
    # normalize column names: case-level attributes are unprefixed,
    # event-level attributes gain an 'event_' prefix
    rep_dict = {}
    for x in df.columns:
        if x.startswith("case_"):
            rep_dict[x] = x.split("case_")[1]
        else:
            rep_dict[x] = "event_" + x
    df = df.rename(columns=rep_dict)
    # drop events missing any mandatory attribute
    df = df.dropna(subset=["event_id"])
    df = df.dropna(subset=["event_activity"])
    df = df.dropna(subset=["event_timestamp"])
    df = dataframe_utils.convert_timestamp_columns_in_df(df)
    # (removed: dead commented-out debug code that applied eval() to the
    # object-type columns — never executed, and eval on stored data is unsafe)
    if return_obj_dataframe:
        ot_df = pd.read_sql_query("SELECT * FROM object_types", db)
        ot_df = ot_df.dropna(subset=["NAME"])
        obj_df_list = []
        for ot in list(ot_df["NAME"]):
            # NOTE(review): the table name is concatenated into the SQL
            # statement; it comes from the database's own object_types table,
            # but this is unsafe if the database file is untrusted.
            o_df = pd.read_sql_query("SELECT * FROM " + ot, db)
            o_df = o_df.rename(columns={x: "object_" + x for x in o_df.columns})
            o_df = o_df.dropna(subset=["object_id"])
            o_df["object_type"] = ot
            o_df = dataframe_utils.convert_timestamp_columns_in_df(o_df)
            obj_df_list.append(o_df)
        obj_df = pd.concat(obj_df_list)
        return df, obj_df
    return df
def execute_script():
    """Discover and view frequency/performance DFGs on an interval log via the correlation miner."""
    frame = pd.read_csv("../tests/input_data/interval_event_log.csv")
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    activity_counts = dict(frame["concept:name"].value_counts())
    params = {
        constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY: "start_timestamp",
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: "time:timestamp",
        "format": "svg",
    }
    # compute both endpoints first, then enrich the parameters
    start_acts = sa_get.get_start_activities(frame, parameters=params)
    end_acts = ea_get.get_end_activities(frame, parameters=params)
    params["start_activities"] = start_acts
    params["end_activities"] = end_acts
    sojourn = soj_time_get.apply(frame, parameters=params)
    freq_dfg, perf_dfg = correlation_miner.apply(
        frame, variant=correlation_miner.Variants.CLASSIC, parameters=params)
    freq_viz = dfg_vis.apply(freq_dfg, activities_count=activity_counts,
                             soj_time=sojourn, variant=dfg_vis.Variants.FREQUENCY,
                             parameters=params)
    dfg_vis.view(freq_viz)
    perf_viz = dfg_vis.apply(perf_dfg, activities_count=activity_counts,
                             soj_time=sojourn, variant=dfg_vis.Variants.PERFORMANCE,
                             parameters=params)
    dfg_vis.view(perf_viz)
def test_passedtime_prepost_df(self):
    """Compute pre/post passed time around 'decide' on a dataframe log."""
    frame = pd.read_csv(os.path.join("input_data", "running-example.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    prepost = df_passed_time.apply(frame, "decide",
                                   variant=df_passed_time.Variants.PREPOST)
    del prepost
def apply(net, initial_marking=None, final_marking=None, log=None,
          aggregated_statistics=None, parameters=None,
          variant=Variants.WO_DECORATION):
    """
    Visualizes a Petri net, optionally decorating it from an event log.

    Parameters
    ---------------
    net
        Petri net
    initial_marking
        Initial marking
    final_marking
        Final marking
    log
        (Optional) log used for the decoration; a pandas dataframe is
        converted to an EventLog first
    aggregated_statistics
        (Optional) pre-computed statistics for the decoration
    parameters
        Parameters of the visualization
    variant
        Variant of the visualization to use
    """
    if parameters is None:
        parameters = {}
    if log is not None:
        # a dataframe log first gets its timestamp columns converted
        # (only when pandas is installed)
        if pkgutil.find_loader("pandas"):
            import pandas
            from pm4py.objects.log.util import dataframe_utils
            if isinstance(log, pandas.core.frame.DataFrame):
                log = dataframe_utils.convert_timestamp_columns_in_df(log)
        log = log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG)
    chosen_variant = exec_utils.get_variant(variant)
    return chosen_variant.apply(net, initial_marking, final_marking, log=log,
                                aggregated_statistics=aggregated_statistics,
                                parameters=parameters)
def test_attrValueDifferentPersons_neg(self):
    """LTL check: 'check ticket' by different persons, negated filter."""
    frame = pd.read_csv(os.path.join("input_data", "running-example.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    attr_value_different_persons_neg = ltl_checker.attr_value_different_persons(
        frame, "check ticket",
        parameters={ltl_checker.Parameters.POSITIVE: False})
def test_filtering_traces_attribute_in_timeframe(self):
    """Keep traces whose 'Confirmation of receipt' falls inside a time window."""
    source_path = os.path.join(INPUT_DATA_DIR, "receipt.csv")
    frame = dataframe_utils.convert_timestamp_columns_in_df(pd.read_csv(source_path))
    filtered = timestamp_filter.filter_traces_attribute_in_timeframe(
        frame, "concept:name", "Confirmation of receipt",
        "2011-03-09 00:00:00", "2012-01-18 23:59:59")
def test_heuplusplus_perf_df(self):
    """Heuristics Miner++ with performance decoration on an interval dataframe."""
    frame = pd.read_csv(os.path.join(INPUT_DATA_DIR, "interval_event_log.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    heu_net = heuristics_miner.Variants.PLUSPLUS.value.apply_heu_pandas(
        frame, parameters={"heu_net_decoration": "performance"})
    gviz = hn_vis.apply(heu_net)
def test_performance_spectrum(self):
    """Performance spectrum on both an XES log and its CSV counterpart."""
    log = xes_importer.apply(os.path.join("input_data", "running-example.xes"))
    from pm4py.algo.discovery.performance_spectrum import algorithm as pspectrum
    activities = ["register request", "decide"]
    ps = pspectrum.apply(log, activities)
    frame = pd.read_csv(os.path.join("input_data", "running-example.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    ps = pspectrum.apply(frame, activities)
def test_res_profiles_df(self):
    """Exercise every resource-profile metric on a dataframe log."""
    from pm4py.algo.organizational_mining.resource_profiles import algorithm
    frame = pd.read_csv(
        os.path.join("..", "tests", "input_data", "running-example.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    # common observation window shared by most of the metrics below
    win_start = "2010-12-30 00:00:00"
    win_end = "2011-01-25 00:00:00"
    algorithm.distinct_activities(frame, win_start, win_end, "Sara")
    algorithm.activity_frequency(frame, win_start, win_end, "Sara", "decide")
    algorithm.activity_completions(frame, win_start, win_end, "Sara")
    algorithm.case_completions(frame, win_start, win_end, "Pete")
    algorithm.fraction_case_completions(frame, win_start, win_end, "Pete")
    # workload uses a shorter window
    algorithm.average_workload(frame, win_start, "2011-01-15 00:00:00", "Mike")
    algorithm.multitasking(frame, win_start, win_end, "Mike")
    algorithm.average_duration_activity(frame, win_start, win_end, "Sue",
                                        "examine thoroughly")
    algorithm.average_case_duration(frame, win_start, win_end, "Sue")
    algorithm.interaction_two_resources(frame, win_start, win_end, "Mike", "Pete")
    algorithm.social_position(frame, win_start, win_end, "Sue")
def test_performance_spectrum_df(self):
    """Performance spectrum between two receipt-confirmation activities."""
    frame = pd.read_csv(os.path.join("input_data", "receipt.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    activities = [
        "T02 Check confirmation of receipt",
        "T03 Adjust confirmation of receipt",
    ]
    pspectr = df_pspectrum.apply(frame, activities, 1000, {})
def test_AeventuallyBeventuallyC_neg(self):
    """LTL check: A eventually B eventually C, negated filter."""
    frame = pd.read_csv(os.path.join("input_data", "running-example.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    filt_A_ev_B_ev_C_neg = ltl_checker.A_eventually_B_eventually_C(
        frame, "check ticket", "decide", "pay compensation",
        parameters={ltl_checker.Parameters.POSITIVE: False})
def apply(net: PetriNet, initial_marking: Marking = None,
          final_marking: Marking = None,
          log: Union[EventLog, EventStream, pd.DataFrame] = None,
          aggregated_statistics=None,
          parameters: Optional[Dict[Any, Any]] = None,
          variant=Variants.WO_DECORATION) -> graphviz.Digraph:
    """
    Visualizes a Petri net, optionally decorating it from an event log.

    Parameters
    ---------------
    net
        Petri net
    initial_marking
        Initial marking
    final_marking
        Final marking
    log
        (Optional) log used for the decoration; a pandas dataframe is
        converted to an EventLog first
    aggregated_statistics
        (Optional) pre-computed statistics for the decoration
    parameters
        Parameters of the visualization
    variant
        Variant of the visualization to use

    Returns
    ---------------
    gviz
        Graphviz digraph object
    """
    if parameters is None:
        parameters = {}
    if log is not None:
        # a dataframe log first gets its timestamp columns converted
        # (only when pandas is installed)
        if pkgutil.find_loader("pandas"):
            import pandas
            from pm4py.objects.log.util import dataframe_utils
            if isinstance(log, pandas.core.frame.DataFrame):
                log = dataframe_utils.convert_timestamp_columns_in_df(log)
        log = log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG)
    chosen_variant = exec_utils.get_variant(variant)
    return chosen_variant.apply(net, initial_marking, final_marking, log=log,
                                aggregated_statistics=aggregated_statistics,
                                parameters=parameters)
def test_fourEeyesPrinciple_neg(self):
    """LTL check: four-eyes principle between two activities, negated filter."""
    frame = pd.read_csv(os.path.join("input_data", "running-example.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    filt_foureyes_neg = ltl_checker.four_eyes_principle(
        frame, "check ticket", "pay compensation",
        parameters={ltl_checker.Parameters.POSITIVE: False})
def execute_script():
    """Discover a temporal profile and print the cases with deviations."""
    frame = pd.read_csv("../tests/input_data/receipt.csv")
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    profile = temporal_profile_discovery.apply(frame)
    results = temporal_profile_conformance.apply(frame, profile,
                                                 parameters={"zeta": 6.0})
    for case_index, deviations in enumerate(results):
        if deviations:
            print(case_index, deviations)
def test_efg_pandas(self):
    """Eventually-follows graph on an interval dataframe with explicit start timestamps."""
    import pm4py
    import pandas as pd
    frame = pd.read_csv(os.path.join("input_data", "interval_event_log.csv"))
    from pm4py.objects.log.util import dataframe_utils
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    from pm4py.statistics.eventually_follows.pandas import get
    efg = get.apply(
        frame,
        parameters={get.Parameters.START_TIMESTAMP_KEY: "start_timestamp"})
def format_dataframe(df: pd.DataFrame, case_id: str = constants.CASE_CONCEPT_NAME,
                     activity_key: str = xes_constants.DEFAULT_NAME_KEY,
                     timestamp_key: str = xes_constants.DEFAULT_TIMESTAMP_KEY,
                     timest_format: Optional[str] = None) -> pd.DataFrame:
    """
    Give the appropriate format on the dataframe, for process mining purposes

    Parameters
    --------------
    df
        Dataframe
    case_id
        Case identifier column
    activity_key
        Activity column
    timestamp_key
        Timestamp column
    timest_format
        Timestamp format that is provided to Pandas

    Returns
    --------------
    df
        Dataframe
    """
    from pm4py.objects.log.util import dataframe_utils
    # every mandatory column must be present before renaming
    required = ((case_id, "case ID"),
                (activity_key, "activity"),
                (timestamp_key, "timestamp"))
    for column, description in required:
        if column not in df.columns:
            raise Exception(column + " column (" + description + ") is not in the dataframe!")
    # map the user-provided columns onto the standard pm4py names
    df = df.rename(columns={case_id: constants.CASE_CONCEPT_NAME,
                            activity_key: xes_constants.DEFAULT_NAME_KEY,
                            timestamp_key: xes_constants.DEFAULT_TIMESTAMP_KEY})
    df[constants.CASE_CONCEPT_NAME] = df[constants.CASE_CONCEPT_NAME].astype(str)
    # makes sure that the timestamp column is of timestamp type
    df = dataframe_utils.convert_timestamp_columns_in_df(
        df, timest_format=timest_format,
        timest_columns=[xes_constants.DEFAULT_TIMESTAMP_KEY])
    # set an index column
    df = pandas_utils.insert_index(df, INDEX_COLUMN)
    # sorts the dataframe
    df = df.sort_values([constants.CASE_CONCEPT_NAME,
                         xes_constants.DEFAULT_TIMESTAMP_KEY,
                         INDEX_COLUMN])
    return df
def test_AnextBnextC_pos(self):
    """LTL check: A next B next C, positive filter."""
    frame = pd.read_csv(os.path.join("input_data", "running-example.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    filt_A_next_B_next_C_pos = ltl_checker.A_next_B_next_C(
        frame, "check ticket", "decide", "pay compensation",
        parameters={ltl_checker.Parameters.POSITIVE: True})
def test_petrinet_receipt_df(self):
    """Mine a Petri net from receipt.csv and build its visualization."""
    # unittest requires instance methods; touch an attribute to avoid
    # static-method warnings
    self.dummy_variable = "dummy_value"
    frame = pd.read_csv(os.path.join(INPUT_DATA_DIR, "receipt.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    net, im, fm = heuristics_miner.apply(frame)
    gviz = pn_vis.apply(net, im, fm)
    del gviz
def test_autofiltering_dataframe(self):
    """Apply the automatic filter to a dataframe log."""
    # unittest requires instance methods; touch an attribute to avoid
    # static-method warnings
    self.dummy_variable = "dummy_value"
    source_path = os.path.join(INPUT_DATA_DIR, "running-example.csv")
    frame = pd.read_csv(source_path)
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    frame = auto_filter.apply_auto_filter(frame)
    del frame
def csv_loghandler(self):
    """
    Loads the CSV type log from the path

    :return: The log
    """
    log_csv = pd.read_csv(self.pathlog, sep=',')
    log_csv = dataframe_utils.convert_timestamp_columns_in_df(log_csv)
    # NOTE(review): '<timestamp_column>' looks like an unfilled template
    # placeholder — sort_values will raise a KeyError unless the CSV really
    # contains a column with this literal name. Confirm the intended
    # timestamp column before use.
    log_csv = log_csv.sort_values('<timestamp_column>')
    event_log = log_converter.apply(log_csv)
    return event_log
def test_attr_value_repetition(self):
    """Filter cases where the resource 'Sara' occurs repeatedly."""
    from pm4py.algo.filtering.pandas.attr_value_repetition import filter
    frame = pd.read_csv(os.path.join("input_data", "running-example.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    filtered_df = filter.apply(
        frame, "Sara",
        parameters={constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "org:resource"})
def test_pdimp_xesexp(self):
    """Import a CSV and export it as XES from log, stream, and dataframe forms."""
    base = pd.read_csv(os.path.join("input_data", "running-example.csv"))
    base = dataframe_utils.convert_timestamp_columns_in_df(base)
    as_log = log_conversion.apply(base, variant=log_conversion.TO_EVENT_LOG)
    as_stream = log_conversion.apply(base, variant=log_conversion.TO_EVENT_STREAM)
    as_frame = log_conversion.apply(base, variant=log_conversion.TO_DATA_FRAME)
    # each representation must be exportable to XES
    xes_exporter.apply(as_log, "ru.xes")
    xes_exporter.apply(as_stream, "ru.xes")
    xes_exporter.apply(as_frame, "ru.xes")
    os.remove('ru.xes')
def test_dfCasedurationPlotSemilogx(self):
    """KDE of case durations on the receipt dataframe (values and JSON forms)."""
    # unittest requires instance methods; touch an attribute to avoid
    # static-method warnings
    self.dummy_variable = "dummy_value"
    frame = pd.read_csv(os.path.join("input_data", "receipt.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    x, y = pd_case_statistics.get_kde_caseduration(frame)
    kde_json = pd_case_statistics.get_kde_caseduration_json(frame)
    del kde_json
def test_footprints_tree_df(self):
    """Footprint conformance of a dataframe against a discovered process tree."""
    frame = pd.read_csv(os.path.join("input_data", "running-example.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    from pm4py.algo.discovery.inductive import algorithm as inductive_miner
    tree = inductive_miner.apply_tree(converter.apply(frame))
    from pm4py.algo.discovery.footprints import algorithm as footprints_discovery
    fp_frame = footprints_discovery.apply(frame)
    fp_tree = footprints_discovery.apply(tree)
    from pm4py.algo.conformance.footprints import algorithm as footprints_conformance
    conf = footprints_conformance.apply(fp_frame, fp_tree)
def execute_script():
    """Mine and view frequency/performance DFGs on receipt.csv via the correlation miner."""
    frame = pd.read_csv("../tests/input_data/receipt.csv")
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    activity_counts = dict(frame["concept:name"].value_counts())
    freq_dfg, perf_dfg = correlation_miner.apply(
        frame, variant=correlation_miner.Variants.CLASSIC)
    freq_viz = dfg_vis.apply(freq_dfg, activities_count=activity_counts,
                             variant=dfg_vis.Variants.FREQUENCY,
                             parameters={"format": "svg"})
    dfg_vis.view(freq_viz)
    perf_viz = dfg_vis.apply(perf_dfg, activities_count=activity_counts,
                             variant=dfg_vis.Variants.PERFORMANCE,
                             parameters={"format": "svg"})
    dfg_vis.view(perf_viz)
def test_filtering_variants(self):
    """Filter the dataframe down to its most frequent variant."""
    # unittest requires instance methods; touch an attribute to avoid
    # static-method warnings
    self.dummy_variable = "dummy_value"
    source_path = os.path.join(INPUT_DATA_DIR, "running-example.csv")
    frame = pd.read_csv(source_path)
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    variants = case_statistics.get_variant_statistics(frame)
    # keep only the first (most frequent) variant
    chosen_variants = [variants[0]["variant"]]
    frame = variants_filter.apply(frame, chosen_variants)
    del frame
def test_inductiveminer_df(self):
    """Inductive miner on a dataframe plus conformance and quality metrics."""
    frame = pd.read_csv(os.path.join("input_data", "running-example.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    net, im, fm = inductive_miner.apply(frame)
    # conformance checking (token replay and alignments)
    aligned_traces_tr = tr_alg.apply(frame, net, im, fm)
    aligned_traces_alignments = align_alg.apply(frame, net, im, fm)
    # quality dimensions
    evaluation = eval_alg.apply(frame, net, im, fm)
    fitness = rp_fit.apply(frame, net, im, fm)
    precision = precision_evaluator.apply(frame, net, im, fm)
    gen = generalization.apply(frame, net, im, fm)
    sim = simplicity.apply(net)
def test_inductiveminer_stream(self):
    """Inductive miner on an event stream plus conformance and quality metrics."""
    frame = pd.read_csv(os.path.join("input_data", "running-example.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    stream = log_conversion.apply(frame, variant=log_conversion.TO_EVENT_STREAM)
    net, im, fm = inductive_miner.apply(stream)
    # conformance checking (token replay and alignments)
    aligned_traces_tr = tr_alg.apply(stream, net, im, fm)
    aligned_traces_alignments = align_alg.apply(stream, net, im, fm)
    # quality dimensions
    evaluation = eval_alg.apply(stream, net, im, fm)
    fitness = rp_fit.apply(stream, net, im, fm)
    precision = precision_evaluator.apply(stream, net, im, fm)
    gen = generalization.apply(stream, net, im, fm)
    sim = simplicity.apply(net)
def test_dfDateAttribute(self):
    """KDE of a date attribute on the road-traffic dataframe (values and JSON forms)."""
    # unittest requires instance methods; touch an attribute to avoid
    # static-method warnings
    self.dummy_variable = "dummy_value"
    frame = pd.read_csv(os.path.join("input_data", "roadtraffic100traces.csv"))
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    x, y = pd_attributes_filter.get_kde_date_attribute(frame)
    kde_json = pd_attributes_filter.get_kde_date_attribute_json(frame)
    del kde_json