Example #1
def test_performance_spectrum_df(self):
    # Imports assumed for pm4py 1.x (module paths not shown in the original snippet):
    import os
    from pm4py.objects.log.importer.csv import factory as csv_importer
    from pm4py.statistics.performance_spectrum.versions import dataframe as df_pspectrum
    log = csv_importer.import_dataframe_from_path(
        os.path.join("input_data", "receipt.csv"))
    # performance spectrum between the two activities; the third argument
    # is the sample size and the fourth the parameters of the algorithm
    pspectr = df_pspectrum.apply(log, [
        "T02 Check confirmation of receipt",
        "T03 Adjust confirmation of receipt"
    ], 1000, {})
Example #2
def test_importing_csv(self):
    import os
    from pm4py.objects.log.importer.csv import factory as csv_importer
    # import the CSV as a pandas DataFrame (with timestamp conversion)
    df = csv_importer.import_dataframe_from_path(
        os.path.join("input_data", "running-example.csv"))
    # import the CSV as a DataFrame without converting the timestamp columns
    df = csv_importer.import_dataframe_from_path_wo_timeconversion(
        os.path.join("input_data", "running-example.csv"))
    # import the CSV directly as an event stream
    stream = csv_importer.apply(
        os.path.join("input_data", "running-example.csv"))
    # import from an in-memory CSV string, both as a DataFrame and as a stream
    stru = "case:concept:name,concept:name,time:timestamp\nA1,A,1970-01-01 01:00:00\n"
    df = csv_importer.import_dataframe_from_csv_string(stru)
    stream = csv_importer.import_log_from_string(stru)
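
For algorithms that require an event log rather than a DataFrame, the imported DataFrame can be converted afterwards; a minimal sketch, assuming the pm4py 1.x log conversion factory (pm4py.objects.conversion.log):

import os
from pm4py.objects.log.importer.csv import factory as csv_importer
from pm4py.objects.conversion.log import factory as conversion_factory

# import the CSV as a DataFrame, then convert it to an event log object
df = csv_importer.import_dataframe_from_path(
    os.path.join("input_data", "running-example.csv"))
log = conversion_factory.apply(df)
print(len(log))  # number of cases in the converted log
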
Example #3
# Imports assumed for pm4py 1.x (not part of the original snippet):
import os
from pm4py.objects.log.importer.csv import factory as csv_importer
from pm4py.objects.log.exporter.parquet import factory as parquet_exporter
from pm4py.objects.log.util import xes
from pm4py.util.constants import CASE_CONCEPT_NAME


def transform_simple(source_path, target_path, activity_key=xes.DEFAULT_NAME_KEY, timestamp_key=xes.DEFAULT_TIMESTAMP_KEY, caseid_key=CASE_CONCEPT_NAME, parameters=None):
    """
    Transforms a set of CSV files into a corresponding set of Parquet files (creating a distributed Parquet dataset),
    under the assumption that all events of the same case ID belong to the same CSV file

    Parameters
    -------------
    source_path
        Source path (folder containing several CSV files)
    target_path
        Target path (distributed Parquet dataset)
    activity_key
        Column that is the activity
    timestamp_key
        Column that is the timestamp
    caseid_key
        Column that is the case ID
    parameters
        Possible parameters of the algorithm, including:
            - sep: the separator
            - quotechar: the quotechar
            - encoding: the encoding
            - timest_columns: the list of columns that contain timestamps
            - timest_format: the format of ALL the timest_columns
    """
    if parameters is None:
        parameters = {}

    # create the target folder if it does not already exist
    try:
        os.mkdir(target_path)
    except FileExistsError:
        pass

    # iterate over the files of the source folder one by one
    files = os.listdir(source_path)
    for index, file in enumerate(files):
        if file.lower().endswith("csv"):
            sp = os.path.join(source_path, file)
            df = csv_importer.import_dataframe_from_path(sp, parameters=parameters)
            # map the user-provided column names onto the standard XES names when needed
            if activity_key != xes.DEFAULT_NAME_KEY and xes.DEFAULT_NAME_KEY not in df.columns:
                df[xes.DEFAULT_NAME_KEY] = df[activity_key]
            if timestamp_key != xes.DEFAULT_TIMESTAMP_KEY and xes.DEFAULT_TIMESTAMP_KEY not in df.columns:
                df[xes.DEFAULT_TIMESTAMP_KEY] = df[timestamp_key]
            if caseid_key != CASE_CONCEPT_NAME and CASE_CONCEPT_NAME not in df.columns:
                df[CASE_CONCEPT_NAME] = df[caseid_key]
            # sort by case ID and timestamp, then export one Parquet file per input CSV
            tp = os.path.join(target_path, str(index) + ".parquet")
            df = df.sort_values([caseid_key, timestamp_key])
            parquet_exporter.apply(df, tp)
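
A minimal usage sketch of transform_simple; the folder names and the parameters dict below are hypothetical and only illustrate the options listed in the docstring:

# hypothetical folders and illustrative parameters (not from the original code)
transform_simple(
    "csv_logs", "parquet_logs",
    activity_key="concept:name",
    timestamp_key="time:timestamp",
    caseid_key="case:concept:name",
    parameters={"sep": ",", "quotechar": "\"", "encoding": "utf-8"})
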
Example #4
# Imports assumed for pm4py 1.x (not part of the original snippet):
import os
import pandas as pd
from pm4py.objects.log.importer.csv import factory as csv_importer
from pm4py.objects.log.importer.parquet import factory as parquet_importer
from pm4py.objects.log.exporter.parquet import factory as parquet_exporter
from pm4py.objects.log.util import xes
from pm4py.util.constants import CASE_CONCEPT_NAME


def transform_csv_dataset_to_parquet_distr_dataset(source_path, target_path, target_num_partitions, activity_key=xes.DEFAULT_NAME_KEY, timestamp_key=xes.DEFAULT_TIMESTAMP_KEY, caseid_key=CASE_CONCEPT_NAME, parameters=None):
    """
    Transforms a CSV dataset into a distributed Parquet dataset

    Parameters
    -------------
    source_path
        Source path (folder containing several CSV files)
    target_path
        Target path (distributed Parquet dataset)
    target_num_partitions
        Target number of partitions (number of divisions of the output)
    activity_key
        Column that is the activity
    timestamp_key
        Column that is the timestamp
    caseid_key
        Column that is the case ID
    parameters
        Possible parameters of the algorithm, including:
            - sep: the separator
            - quotechar: the quotechar
            - encoding: the encoding
            - timest_columns: the list of columns that contain timestamps
            - timest_format: the format of ALL the timest_columns

    Returns
    -------------
    None
    """
    if parameters is None:
        parameters = {}

    # create the target folder if it does not already exist
    try:
        os.mkdir(target_path)
    except FileExistsError:
        pass

    # create the partitions
    dataframe = pd.DataFrame({})
    for i in range(target_num_partitions):
        tp = os.path.join(target_path, str(i)+".parquet")
        parquet_exporter.apply(dataframe, tp)
    # iterate over the files of the source folder one by one
    files = os.listdir(source_path)
    for index, file in enumerate(files):
        if file.lower().endswith("csv"):
            sp = os.path.join(source_path, file)
            source_df = csv_importer.import_dataframe_from_path(sp, parameters=parameters)
            # map the user-provided column names onto the standard XES names when needed
            if activity_key != xes.DEFAULT_NAME_KEY and xes.DEFAULT_NAME_KEY not in source_df.columns:
                source_df[xes.DEFAULT_NAME_KEY] = source_df[activity_key]
            if timestamp_key != xes.DEFAULT_TIMESTAMP_KEY and xes.DEFAULT_TIMESTAMP_KEY not in source_df.columns:
                source_df[xes.DEFAULT_TIMESTAMP_KEY] = source_df[timestamp_key]
            if caseid_key != CASE_CONCEPT_NAME and CASE_CONCEPT_NAME not in source_df.columns:
                source_df[CASE_CONCEPT_NAME] = source_df[caseid_key]
            # assign each case to a partition by hashing its case ID, so that
            # all events of the same case end up in the same partition
            source_df["@@partition"] = source_df[caseid_key].apply(hash)
            source_df["@@partition"] = source_df["@@partition"] % target_num_partitions
            for i in range(target_num_partitions):
                tp = os.path.join(target_path, str(i) + ".parquet")
                df2 = source_df[source_df["@@partition"] == i]
                del df2["@@partition"]
                # append the new events to the existing content of the partition
                df1 = parquet_importer.apply(tp)
                df = pd.concat([df1, df2])
                if index == len(files) - 1:
                    # after the last input file, sort each partition by case and timestamp
                    df = df.sort_values([caseid_key, timestamp_key])
                print("input %d/%d output %d/%d len(df)=%d" % (
                    index + 1, len(files), i + 1, target_num_partitions, len(df)))
                parquet_exporter.apply(df, tp)
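
Hashing the case ID keeps all events of a case inside the same partition, so per-case computations can later run partition by partition. A minimal usage sketch with hypothetical folder names:

# hypothetical folders; split the CSV dataset into 4 Parquet partitions
transform_csv_dataset_to_parquet_distr_dataset(
    "csv_logs", "parquet_logs", 4,
    parameters={"sep": ","})
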
Example #5
def test_df_tree_variants_dfg_based_old(self):
    # Imports assumed for pm4py 1.x (not part of the original snippet):
    #   import os
    #   from pm4py.objects.log.importer.csv import factory as csv_importer
    #   from pm4py.algo.discovery.inductive import factory as inductive_miner
    # "pd_variants" is a DataFrame variants-statistics module whose exact
    # import path is not shown in the original snippet.
    df = csv_importer.import_dataframe_from_path(
        os.path.join("input_data", "running-example.csv"))
    # discover a process tree from the variants of the DataFrame
    tree = inductive_miner.apply_tree_variants(
        pd_variants.get_variants_set(df),
        variant=inductive_miner.DFG_BASED_OLD_VERSION)

def test_df_variants_dfg_based(self):
    df = csv_importer.import_dataframe_from_path(
        os.path.join("input_data", "running-example.csv"))
    # discover a Petri net (with initial/final marking) from the variants
    net, im, fm = inductive_miner.apply_variants(
        pd_variants.get_variants_set(df),
        variant=inductive_miner.DFG_BASED)

def test_dataframe_dfg_based_old(self):
    df = csv_importer.import_dataframe_from_path(
        os.path.join("input_data", "running-example.csv"))
    # discover a Petri net directly from the DataFrame
    net, im, fm = inductive_miner.apply(
        df, variant=inductive_miner.DFG_BASED_OLD_VERSION)
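
The net, initial marking, and final marking returned by the inductive miner can then be rendered; a minimal sketch, assuming the pm4py 1.x Petri net visualization factory (pm4py.visualization.petrinet):

import os
from pm4py.objects.log.importer.csv import factory as csv_importer
from pm4py.algo.discovery.inductive import factory as inductive_miner
from pm4py.visualization.petrinet import factory as pn_vis_factory

df = csv_importer.import_dataframe_from_path(
    os.path.join("input_data", "running-example.csv"))
net, im, fm = inductive_miner.apply(df)
gviz = pn_vis_factory.apply(net, im, fm)
pn_vis_factory.view(gviz)  # opens the rendered Petri net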