Example #1
def export_to_file(log, file_path, parameters=None):
    """
    Serializes a log object to a file

    Parameters
    --------------
    log
        Event log
    file_path
        File path (if None, then a temp file is targeted)
    parameters
        Possible parameters of the algorithm

    Returns
    --------------
    file_path
        File path
    """
    if parameters is None:
        parameters = {}

    if file_path is None:
        file_path = tempfile.NamedTemporaryFile(suffix=".parquet")
        file_path.close()
        file_path = file_path.name

    from pm4py.objects.log.exporter.parquet import factory as parquet_exporter
    parquet_exporter.apply(log, file_path, parameters=parameters)

    return file_path
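
A minimal usage sketch for export_to_file, assuming the function above is defined in the same script, that tempfile is imported at module level (the function needs it when file_path is None), and that the legacy pm4py factory modules used in these snippets are available; the input file name is illustrative:

import tempfile  # required by export_to_file when file_path is None

from pm4py.objects.log.importer.xes import factory as xes_importer

log = xes_importer.apply("receipt.xes")   # illustrative XES input
out_path = export_to_file(log, None)      # exports to a temporary .parquet file
print("Exported to", out_path)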
Example #2
def convert_xes_file_to_parquet(xes_input_path: str, parquet_output_path: str):
    """Converts a XES file at the given location to a parquet file (via pandas dataframes)

    Arguments:
        xes_input_path {str} -- The filepath the XES file should be read from
        parquet_output_path {str} -- The filepath the new parquet file should be written to
    """
    log = xes_importer.apply(xes_input_path)
    parquet_exporter.apply(log, parquet_output_path)
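
The function above relies on module-level names (xes_importer, parquet_exporter) that the snippet does not show. A plausible reconstruction, following the legacy pm4py factory layout seen in Example #1, plus an illustrative call:

# Assumed imports (legacy pm4py factory API; exact paths may differ by version)
from pm4py.objects.log.importer.xes import factory as xes_importer
from pm4py.objects.log.exporter.parquet import factory as parquet_exporter

convert_xes_file_to_parquet("receipt.xes", "receipt.parquet")  # illustrative paths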
Example #3
def transform_simple(source_path, target_path, activity_key=xes.DEFAULT_NAME_KEY, timestamp_key=xes.DEFAULT_TIMESTAMP_KEY, caseid_key=CASE_CONCEPT_NAME, parameters=None):
    """
    Transforms a set of CSV files into a set of corresponding Parquet files (to create a distributed Parquet dataset),
    under the assumption that all events with the same case ID belong to the same CSV file

    Parameters
    -------------
    source_path
        Source path (folder containing several CSV files)
    target_path
        Target path (distributed Parquet dataset)
    activity_key
        Column that is the activity
    timestamp_key
        Column that is the timestamp
    caseid_key
        Column that is the case ID
    parameters
        Possible parameters of the algorithm, including:
            - sep: the separator
            - quotechar: the quotechar
            - encoding: the encoding
            - timest_columns: the list of columns that contain timestamps
            - timest_format: the format of ALL the timest_columns
    """
    if parameters is None:
        parameters = {}

    # create the folder
    try:
        os.mkdir(target_path)
    except:
        pass

    # iterate one-by-one on the files of the source folder
    files = os.listdir(source_path)
    for index, file in enumerate(files):
        if file.lower().endswith("csv"):
            sp = os.path.join(source_path, file)
            df = csv_importer.import_dataframe_from_path(sp, parameters=parameters)
            if activity_key != xes.DEFAULT_NAME_KEY and xes.DEFAULT_NAME_KEY not in df.columns:
                df[xes.DEFAULT_NAME_KEY] = df[activity_key]
            if timestamp_key != xes.DEFAULT_TIMESTAMP_KEY and xes.DEFAULT_TIMESTAMP_KEY not in df.columns:
                df[xes.DEFAULT_TIMESTAMP_KEY] = df[timestamp_key]
            if caseid_key != CASE_CONCEPT_NAME and CASE_CONCEPT_NAME not in df.columns:
                df[CASE_CONCEPT_NAME] = df[caseid_key]
            tp = os.path.join(target_path, str(index)+".parquet")
            df = df.sort_values([caseid_key, timestamp_key])
            parquet_exporter.apply(df, tp)
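
csv_importer, parquet_exporter, xes and CASE_CONCEPT_NAME are again module-level names not shown in the snippet. A hedged usage sketch, assuming a folder of semicolon-separated CSV files that already use pm4py's default column names:

# Folder names and CSV options are illustrative
transform_simple("csv_logs", "parquet_logs",
                 parameters={"sep": ";", "encoding": "utf-8"})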

Example #4
if __name__ == "__main__":
    # Refresh the master, training, and slave folders
    refresh_directory("master")
    refresh_directory("testing")
    refresh_directory("slave1")
    refresh_directory("slave2")
    refresh_directory("Data/Ensembles")

    # Import event log and partition it
    log = xes_importer.apply("Data/Event Logs/receipt.xes")
    parquet_exporter.apply(log,
                           "master",
                           parameters={
                               "auto_partitioning":
                               True,
                               "num_partitions":
                               pm4pydistr.configuration.NUMBER_OF_PARTITIONS
                           })

    # Move the training log to the training folder
    for i in range(
            int(pm4pydistr.configuration.NUMBER_OF_PARTITIONS *
                pm4pydistr.configuration.TRAINING_PART),
            pm4pydistr.configuration.NUMBER_OF_PARTITIONS):
        shutil.move("master\\@@partitioning=" + str(i),
                    "testing\\@@partitioning=" + str(i))

    # Initialize master and slaves
    t1 = ExecutionThread(PYTHON_PATH +
                         " launch.py type master conf master port 5001")
Example #5
def transform_csv_dataset_to_parquet_distr_dataset(source_path, target_path, target_num_partitions, activity_key=xes.DEFAULT_NAME_KEY, timestamp_key=xes.DEFAULT_TIMESTAMP_KEY, caseid_key=CASE_CONCEPT_NAME, parameters=None):
    """
    Transforms a CSV dataset into a distributed Parquet dataset

    Parameters
    -------------
    source_path
        Source path (folder containing several CSV files)
    target_path
        Target path (distributed Parquet dataset)
    target_num_partitions
        Target number of partitions (number of divisions of the output)
    activity_key
        Column that is the activity
    timestamp_key
        Column that is the timestamp
    caseid_key
        Column that is the case ID
    parameters
        Possible parameters of the algorithm, including:
            - sep: the separator
            - quotechar: the quotechar
            - encoding: the encoding
            - timest_columns: the list of columns that contain timestamps
            - timest_format: the format of ALL the timest_columns

    Returns
    -------------
    None (the distributed Parquet dataset is written to target_path)
    """
    if parameters is None:
        parameters = {}

    # create the folder
    try:
        os.mkdir(target_path)
    except:
        pass

    # create the partitions
    dataframe = pd.DataFrame({})
    for i in range(target_num_partitions):
        tp = os.path.join(target_path, str(i)+".parquet")
        parquet_exporter.apply(dataframe, tp)
    files = os.listdir(source_path)
    for index, file in enumerate(files):
        if file.lower().endswith("csv"):
            sp = os.path.join(source_path, file)
            source_df = csv_importer.import_dataframe_from_path(sp, parameters=parameters)
            if activity_key != xes.DEFAULT_NAME_KEY and xes.DEFAULT_NAME_KEY not in source_df.columns:
                source_df[xes.DEFAULT_NAME_KEY] = source_df[activity_key]
            if timestamp_key != xes.DEFAULT_TIMESTAMP_KEY and xes.DEFAULT_TIMESTAMP_KEY not in source_df.columns:
                source_df[xes.DEFAULT_TIMESTAMP_KEY] = source_df[timestamp_key]
            if caseid_key != CASE_CONCEPT_NAME and CASE_CONCEPT_NAME not in source_df.columns:
                source_df[CASE_CONCEPT_NAME] = source_df[caseid_key]
            source_df["@@partition"] = source_df[caseid_key].apply(hash)
            source_df["@@partition"] = source_df["@@partition"] % target_num_partitions
            for i in range(target_num_partitions):
                tp = os.path.join(target_path, str(i)+".parquet")
                df2 = source_df[source_df["@@partition"] == i]
                del df2["@@partition"]
                #df2 = df2.reset_index()
                df1 = parquet_importer.apply(tp)
                df = pd.concat([df1, df2])
                if index == len(files)-1:
                    df = df.sort_values([caseid_key, timestamp_key])
                print("input %d/%d output %d/%d len(df)=" % (index+1,len(files),i+1,target_num_partitions),len(df))
                parquet_exporter.apply(df, tp)
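
The heart of the partitioning above is the hash(case_id) % target_num_partitions assignment, which sends every event of a case to the same partition file. A small self-contained illustration with plain pandas and made-up case IDs (note that Python salts string hashes per process, so the partition numbers change between runs but stay consistent within a single run):

import pandas as pd

df = pd.DataFrame({
    "case:concept:name": ["c1", "c1", "c2", "c3", "c3"],
    "concept:name":      ["A", "B", "A", "A", "C"],
})

num_partitions = 2
df["@@partition"] = df["case:concept:name"].apply(hash) % num_partitions

# All rows of the same case receive the same partition number
for i, part in df.groupby("@@partition"):
    print("partition", i, "cases:", sorted(part["case:concept:name"].unique()))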