Example no. 1
 def test_csv(self):
     df = pd.read_csv("input_data/running-example.csv")
     df = pm4py.format_dataframe(df, case_id="case:concept:name", activity_key="concept:name",
                                 timestamp_key="time:timestamp")
     log2 = pm4py.convert_to_event_log(df)
     stream1 = pm4py.convert_to_event_stream(log2)
     df2 = pm4py.convert_to_dataframe(log2)
     pm4py.write_xes(log2, "test_output_data/log.xes")
     os.remove("test_output_data/log.xes")
Example no. 2
def apply(log: Union[DataFrame, EventLog, EventStream],
          parameters: Optional[Dict[str, Any]] = None) -> Dict[str, int]:
    '''
    This algorithm computes the minimum self-distance for each activity observed in an event log.
    The self distance of a in <a> is infinity, of a in <a,a> is 0, in <a,b,a> is 1, etc.
    The minimum self distance is the minimal observed self distance value in the event log.
    The activity key needs to be specified in the parameters input object (if None, default value 'concept:name' is used).


    Parameters
    ----------
    log
        event log (either pandas.DataFrame, EventLog or EventStream)
    parameters
        parameters object;

    Returns
    -------
        dict mapping each activity to its minimum self-distance; activities for which no self-distance is observed are not part of the dict.
    '''
    log = pm4py.convert_to_event_log(log)
    act_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                         xes_constants.DEFAULT_NAME_KEY)
    alphabet = pm4py.get_attribute_values(log, act_key)
    log = list(map(lambda t: list(map(lambda e: e[act_key], t)), log))
    min_self_distances = dict()
    for a in alphabet:
        # positions at which activity a occurs in each trace; keep only the
        # traces in which a occurs more than once
        activity_indices = [[i for i, x in enumerate(t) if x == a] for t in log]
        activity_indices = [idx for idx in activity_indices if len(idx) > 1]
        if len(activity_indices) > 0:
            # number of events between two consecutive occurrences of a
            min_self_distances[a] = min(idx[i] - idx[i - 1] - 1
                                        for idx in activity_indices
                                        for i in range(1, len(idx)))
    return min_self_distances
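
To make the definition in the docstring concrete, here is a small self-contained sketch on a hand-made toy log (plain Python lists standing in for traces; the toy_min_self_distance helper and the data are illustrative only, not part of pm4py):

# Toy log with two traces: <a, b, a, c, a> and <b, c, b> (illustrative only).
toy_log = [["a", "b", "a", "c", "a"], ["b", "c", "b"]]

def toy_min_self_distance(traces):
    # minimum number of events observed between two occurrences of the same activity
    msd = {}
    for trace in traces:
        positions = {}
        for i, act in enumerate(trace):
            positions.setdefault(act, []).append(i)
        for act, idx in positions.items():
            for j in range(1, len(idx)):
                gap = idx[j] - idx[j - 1] - 1
                msd[act] = min(msd.get(act, gap), gap)
    return msd

print(toy_min_self_distance(toy_log))  # {'a': 1, 'b': 1}; 'c' never repeats, so it is absent
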
Example no. 3
    def convertLog(self,
                   centers,
                   y_pred,
                   name,
                   encoding,
                   alg,
                   datapath,
                   exportSes=False):
        start = time.time()
        frames = []
        log = pd.DataFrame()
        for i, s in enumerate(self.sessions):
            abstracted = s.convertSession(centers[y_pred[i]], y_pred[i],
                                          self.distinct, self.attrNames)
            frames.append(abstracted)
        log = pd.concat(frames, ignore_index=True)
        log = pm.format_dataframe(log,
                                  case_id='case',
                                  activity_key='concept:name',
                                  timestamp_key='time:timestamp')
        num = math.ceil(len(log) * 0.7)

        log1 = log[:num]
        log2 = log[num:]

        log = pm.convert_to_event_log(log)
        log1 = pm.convert_to_event_log(log1)
        log2 = pm.convert_to_event_log(log2)

        pm.write_xes(log1, os.path.join(datapath, name + "train.xes"))
        pm.write_xes(log2, os.path.join(datapath, name + "test.xes"))
        pm.write_xes(log, os.path.join(datapath, name + ".xes"))

        if exportSes:
            self.exportSubP(y_pred, centers, name, encoding, alg)
        print("Convertion Time:", time.time() - start)
Example no. 4
def derive_msd_witnesses(
        log: EventLog,
        msd: Optional[Dict[str, int]] = None,
        parameters: Optional[Dict[str, Any]] = None) -> Dict[str, Set[str]]:
    '''
    This function derives the minimum self distance witnesses.
    The self distance of a in <a> is infinity, of a in <a,a> is 0, in <a,b,a> is 1, etc.
    The minimum self distance is the minimal observed self distance value in the event log.
    A 'witness' is an activity that witnesses the minimum self distance.
    For example, if the minimum self distance of activity a in some log L is 2, then,
    if trace <a,b,c,a> is in log L, b and c are witnesses of a.

    Parameters
    ----------
    log
        Event Log to use
    msd
        Optional minimum self distance dictionary
    parameters
        Optional parameters dictionary

    Returns
    -------
    Dictionary mapping each activity to a set of witnesses.

    '''
    log = pm4py.convert_to_event_log(log)
    act_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                         xes_constants.DEFAULT_NAME_KEY)
    alphabet = pm4py.get_attribute_values(log, act_key)
    msd = msd if msd is not None else msd_algo.apply(log, parameters)
    log = list(map(lambda t: list(map(lambda e: e[act_key], t)), log))
    witnesses = dict()
    for a in alphabet:
        if a in msd and msd[a] > 0:
            witnesses[a] = set()
        else:
            continue
        for t in log:
            if len(list(filter(lambda e: e == a, t))) > 1:
                indices = [i for i, x in enumerate(t) if x == a]
                for i in range(len(indices) - 1):
                    if indices[i + 1] - indices[i] - 1 == msd[a]:
                        for b in t[indices[i] + 1:indices[i + 1]]:
                            witnesses[a].add(b)
    return witnesses
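
In the same spirit, here is a minimal self-contained sketch of the witness notion described in the docstring above, again on a hand-made toy log (names and values are illustrative only, not pm4py code):

# Toy log: <a, b, c, a> and <b, a>; the minimum self distance of 'a' in this log is 2.
toy_log = [["a", "b", "c", "a"], ["b", "a"]]
toy_msd = {"a": 2}

toy_witnesses = {"a": set()}
for trace in toy_log:
    idx = [i for i, act in enumerate(trace) if act == "a"]
    for j in range(len(idx) - 1):
        # activities strictly between two occurrences of 'a' that realize the minimum distance
        if idx[j + 1] - idx[j] - 1 == toy_msd["a"]:
            toy_witnesses["a"].update(trace[idx[j] + 1:idx[j + 1]])

print(toy_witnesses)  # {'a': {'b', 'c'}}
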
Example no. 5
 def exportSubP(self, y_pred, centers, name, encoding, alg):
     path = os.pardir + '/outputLog/' + encoding + '/' + alg
     try:
         os.makedirs(path, exist_ok=True)
     except OSError:
         print("Creation of the directory %s failed" % path)
     else:
         print("Successfully created the directory %s " % path)
         frames = [[] for i in range(max(y_pred) + 1)]
         for i, s in enumerate(self.sessions):
             frames[y_pred[i]].extend(s.export(self.attrNames, i))
         for i in range(max(y_pred) + 1):
             ind = list(
                 np.flip(np.argsort(centers[i][:len(self.distinct)])[-1:]))
             subP = concatName(self.distinct, ind)
             newFile = name + str(i) + subP + '.xes'
             log = pd.concat(frames[i], ignore_index=True)
             log = pm.format_dataframe(log,
                                       case_id='case',
                                       activity_key='concept:name',
                                       timestamp_key='time:timestamp')
             log = pm.convert_to_event_log(log)
             pm.write_xes(log, os.path.join(path, newFile))
         print("Sessions exported")
Example no. 6
def execute_script():
    ENABLE_VISUALIZATION = True

    # reads a XES into an event log
    log1 = pm4py.read_xes("../tests/input_data/running-example.xes")

    # reads a CSV into a dataframe
    df = pd.read_csv("../tests/input_data/running-example.csv")
    # formats the dataframe with the mandatory columns for process mining purposes
    df = pm4py.format_dataframe(df,
                                case_id="case:concept:name",
                                activity_key="concept:name",
                                timestamp_key="time:timestamp")
    # converts the dataframe to an event log
    log2 = pm4py.convert_to_event_log(df)

    # converts the log read from XES into a stream and dataframe respectively
    stream1 = pm4py.convert_to_event_stream(log1)
    df2 = pm4py.convert_to_dataframe(log1)

    # writes the log1 to a XES file
    pm4py.write_xes(log1, "ru1.xes")

    dfg, dfg_sa, dfg_ea = pm4py.discover_dfg(log1)
    petri_alpha, im_alpha, fm_alpha = pm4py.discover_petri_net_alpha(log1)
    petri_inductive, im_inductive, fm_inductive = pm4py.discover_petri_net_inductive(
        log1)
    petri_heuristics, im_heuristics, fm_heuristics = pm4py.discover_petri_net_heuristics(
        log1)
    tree_inductive = pm4py.discover_tree_inductive(log1)
    heu_net = pm4py.discover_heuristics_net(log1)

    pm4py.write_dfg(dfg, dfg_sa, dfg_ea, "ru_dfg.dfg")
    pm4py.write_petri_net(petri_alpha, im_alpha, fm_alpha, "ru_alpha.pnml")
    pm4py.write_petri_net(petri_inductive, im_inductive, fm_inductive,
                          "ru_inductive.pnml")
    pm4py.write_petri_net(petri_heuristics, im_heuristics, fm_heuristics,
                          "ru_heuristics.pnml")
    pm4py.write_process_tree(tree_inductive, "ru_inductive.ptml")

    dfg, dfg_sa, dfg_ea = pm4py.read_dfg("ru_dfg.dfg")
    petri_alpha, im_alpha, fm_alpha = pm4py.read_petri_net("ru_alpha.pnml")
    petri_inductive, im_inductive, fm_inductive = pm4py.read_petri_net(
        "ru_inductive.pnml")
    petri_heuristics, im_heuristics, fm_heuristics = pm4py.read_petri_net(
        "ru_heuristics.pnml")
    tree_inductive = pm4py.read_process_tree("ru_inductive.ptml")

    pm4py.save_vis_petri_net(petri_alpha, im_alpha, fm_alpha, "ru_alpha.png")
    pm4py.save_vis_petri_net(petri_inductive, im_inductive, fm_inductive,
                             "ru_inductive.png")
    pm4py.save_vis_petri_net(petri_heuristics, im_heuristics, fm_heuristics,
                             "ru_heuristics.png")
    pm4py.save_vis_process_tree(tree_inductive, "ru_inductive_tree.png")
    pm4py.save_vis_heuristics_net(heu_net, "ru_heunet.png")
    pm4py.save_vis_dfg(dfg, dfg_sa, dfg_ea, "ru_dfg.png")

    if ENABLE_VISUALIZATION:
        pm4py.view_petri_net(petri_alpha, im_alpha, fm_alpha, format="svg")
        pm4py.view_petri_net(petri_inductive,
                             im_inductive,
                             fm_inductive,
                             format="svg")
        pm4py.view_petri_net(petri_heuristics,
                             im_heuristics,
                             fm_heuristics,
                             format="svg")
        pm4py.view_process_tree(tree_inductive, format="svg")
        pm4py.view_heuristics_net(heu_net, format="svg")
        pm4py.view_dfg(dfg, dfg_sa, dfg_ea, format="svg")

    aligned_traces = pm4py.conformance_alignments(log1, petri_inductive,
                                                  im_inductive, fm_inductive)
    replayed_traces = pm4py.conformance_tbr(log1, petri_inductive,
                                            im_inductive, fm_inductive)

    fitness_tbr = pm4py.evaluate_fitness_tbr(log1, petri_inductive,
                                             im_inductive, fm_inductive)
    print("fitness_tbr", fitness_tbr)
    fitness_align = pm4py.evaluate_fitness_alignments(log1, petri_inductive,
                                                      im_inductive,
                                                      fm_inductive)
    print("fitness_align", fitness_align)
    precision_tbr = pm4py.evaluate_precision_tbr(log1, petri_inductive,
                                                 im_inductive, fm_inductive)
    print("precision_tbr", precision_tbr)
    precision_align = pm4py.evaluate_precision_alignments(
        log1, petri_inductive, im_inductive, fm_inductive)
    print("precision_align", precision_align)

    print("log start activities = ", pm4py.get_start_activities(log2))
    print("df start activities = ", pm4py.get_start_activities(df2))
    print("log end activities = ", pm4py.get_end_activities(log2))
    print("df end activities = ", pm4py.get_end_activities(df2))
    print("log attributes = ", pm4py.get_attributes(log2))
    print("df attributes = ", pm4py.get_attributes(df2))
    print("log org:resource values = ",
          pm4py.get_attribute_values(log2, "org:resource"))
    print("df org:resource values = ",
          pm4py.get_attribute_values(df2, "org:resource"))

    print("start_activities len(filt_log) = ",
          len(pm4py.filter_start_activities(log2, ["register request"])))
    print("start_activities len(filt_df) = ",
          len(pm4py.filter_start_activities(df2, ["register request"])))
    print("end_activities len(filt_log) = ",
          len(pm4py.filter_end_activities(log2, ["pay compensation"])))
    print("end_activities len(filt_df) = ",
          len(pm4py.filter_end_activities(df2, ["pay compensation"])))
    print(
        "attributes org:resource len(filt_log) (cases) cases = ",
        len(
            pm4py.filter_attribute_values(log2,
                                          "org:resource", ["Ellen"],
                                          level="case")))
    print(
        "attributes org:resource len(filt_log) (cases)  events = ",
        len(
            pm4py.filter_attribute_values(log2,
                                          "org:resource", ["Ellen"],
                                          level="event")))
    print(
        "attributes org:resource len(filt_df) (events) cases = ",
        len(
            pm4py.filter_attribute_values(df2,
                                          "org:resource", ["Ellen"],
                                          level="case")))
    print(
        "attributes org:resource len(filt_df) (events) events = ",
        len(
            pm4py.filter_attribute_values(df2,
                                          "org:resource", ["Ellen"],
                                          level="event")))
    print(
        "attributes org:resource len(filt_df) (events) events notpositive = ",
        len(
            pm4py.filter_attribute_values(df2,
                                          "org:resource", ["Ellen"],
                                          level="event",
                                          retain=False)))

    print("variants log = ", pm4py.get_variants(log2))
    print("variants df = ", pm4py.get_variants(df2))
    print(
        "variants filter log = ",
        len(
            pm4py.filter_variants(log2, [[
                "register request", "examine thoroughly", "check ticket",
                "decide", "reject request"
            ]])))
    print(
        "variants filter df = ",
        len(
            pm4py.filter_variants(df2, [[
                "register request", "examine thoroughly", "check ticket",
                "decide", "reject request"
            ]])))
    print("variants filter percentage = ",
          len(pm4py.filter_variants_percentage(log2, threshold=0.8)))

    print(
        "paths filter log len = ",
        len(
            pm4py.filter_directly_follows_relation(
                log2, [("register request", "examine casually")])))
    print(
        "paths filter dataframe len = ",
        len(
            pm4py.filter_directly_follows_relation(
                df2, [("register request", "examine casually")])))

    print(
        "timeframe filter log events len = ",
        len(
            pm4py.filter_time_range(log2,
                                    "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="events")))
    print(
        "timeframe filter log traces_contained len = ",
        len(
            pm4py.filter_time_range(log2,
                                    "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="traces_contained")))
    print(
        "timeframe filter log traces_intersecting len = ",
        len(
            pm4py.filter_time_range(log2,
                                    "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="traces_intersecting")))
    print(
        "timeframe filter df events len = ",
        len(
            pm4py.filter_time_range(df2,
                                    "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="events")))
    print(
        "timeframe filter df traces_contained len = ",
        len(
            pm4py.filter_time_range(df2,
                                    "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="traces_contained")))
    print(
        "timeframe filter df traces_intersecting len = ",
        len(
            pm4py.filter_time_range(df2,
                                    "2011-01-01 00:00:00",
                                    "2011-02-01 00:00:00",
                                    mode="traces_intersecting")))

    # remove the temporary files
    os.remove("ru1.xes")
    os.remove("ru_dfg.dfg")
    os.remove("ru_alpha.pnml")
    os.remove("ru_inductive.pnml")
    os.remove("ru_heuristics.pnml")
    os.remove("ru_inductive.ptml")
    os.remove("ru_alpha.png")
    os.remove("ru_inductive.png")
    os.remove("ru_heuristics.png")
    os.remove("ru_inductive_tree.png")
    os.remove("ru_heunet.png")
    os.remove("ru_dfg.png")
Example no. 7
def gerar_log_eventos(ramo_justica,
                      codtribunal,
                      atuacao,
                      cluster,
                      grau,
                      codorgaoj,
                      codnatureza,
                      codclasse,
                      dtinicio,
                      dtfim,
                      baixado=None,
                      sensibility='60'):

    eventLog = None

    cacheKey = "{0}-{1}-{2}-{3}-{4}-{5}-{6}-{7}-{8}-{9}-{10}".format(
        ramo_justica, codtribunal, atuacao, cluster, grau, codorgaoj,
        codnatureza, codclasse, dtinicio, dtfim, baixado)

    cachedEventLog = eventLogCache.get(cacheKey)
    if cachedEventLog is not None:
        eventLog = cachedEventLog

    else:
        conn = psycopg2.connect(host=db_host,
                                port=db_port,
                                database=db_name,
                                user=db_user,
                                password=db_pass)

        sufixo_ramo = ramos_justica.get(ramo_justica, 'default')

        tabela_fato = "inovacnj.fat_movimento_" + sufixo_ramo

        qry = "SELECT "
        qry += "  fat.npu as npu, "
        qry += "  CASE "
        qry += "  WHEN f.descricao IS NULL THEN fat.mov_cod ||  ' - ' || mov.descricao "
        qry += "  ELSE f.descricao || ': ' || fat.mov_cod ||  ' - ' || mov.descricao "
        qry += "  END AS atividade, "
        qry += "  fat.mov_dtmov as mov_dtmov "
        qry += "FROM " + tabela_fato + " fat "
        qry += "INNER JOIN inovacnj.acervo_processo_" + sufixo_ramo + " ap ON ap.npu = fat.npu "
        qry += "INNER JOIN inovacnj.orgao_julgador oj ON oj.cod::varchar = fat.oj_cod "
        qry += "INNER JOIN inovacnj.clusteroj_orgjulg cojoj ON cojoj.cod_orgao_julg = oj.cod "
        qry += "INNER JOIN inovacnj.movimentocnj mov ON mov.cod = fat.mov_cod "
        qry += "INNER JOIN inovacnj.natureza_classe nc ON nc.cod_classe = fat.codclasse "
        qry += "INNER JOIN inovacnj.natureza nat ON nat.cod = nc.cod_natureza "
        qry += "LEFT JOIN inovacnj.fase_movimento fm ON fm.cod_movimento = fat.mov_cod "
        qry += "LEFT JOIN inovacnj.fase f ON f.cod = fm.cod_fase "
        qry += "WHERE (1=1) "

        if baixado is not None:
            qry += "AND ap.baixado = '" + baixado + "' "
        if codtribunal is not None:
            qry += "AND fat.codtribunal = '" + codtribunal + "' "
        if atuacao is not None:
            qry += "AND oj.atuacao_vara = '" + atuacao + "' "
        if cluster is not None:
            qry += "AND cojoj.cod_cluster = " + cluster + " "
        if codorgaoj is not None:
            qry += "AND fat.oj_cod = '" + codorgaoj + "' "
        if grau is not None:
            qry += "AND fat.grau = '" + grau + "' "
        if codnatureza is not None:
            qry += "AND nat.cod = " + str(codnatureza) + " "
        if codclasse is not None:
            qry += "AND fat.codclasse = " + str(codclasse) + " "

        if dtinicio is not None and dtfim is not None:
            qry += "AND fat.mov_dtmov BETWEEN to_timestamp('" + dtinicio + "', 'yyyy-MM-dd') AND to_timestamp('" + dtfim + "', 'yyyy-MM-dd') "

        qry += "ORDER BY fat.npu, fat.mov_dtmov ASC "

        df_logeventos_pd = pd.read_sql_query(qry, conn)

        if not df_logeventos_pd.empty:
            df_event_log = pm4py.format_dataframe(df_logeventos_pd,
                                                  case_id='npu',
                                                  activity_key='atividade',
                                                  timestamp_key='mov_dtmov')
            eventLog = pm4py.convert_to_event_log(df_event_log)

            eventLogCache[cacheKey] = eventLog
            # clear this entry from the cache after 15 minutes
            timer3.apply_after(1000 * 60 * 15,
                               clear_eventlog_cache,
                               args=([cacheKey]),
                               priority=0)

    if eventLog is not None:
        if sensibility is not None:
            eventLog = pm4py.filter_variants_percentage(
                eventLog, percentage=float(sensibility) / 100)

    return eventLog
Example no. 8
def lstm_algorithm(index, len_of_points, resample_dataset, LSTM_CELLS):

    #print('Current path: ',os.getcwd())
    if not os.path.exists('results'):
        os.makedirs('results')

    COLOR_CYCLE = ["#4286f4", "#f44174"]

    split_percentage = 0.8
    """                        
    answer = input('Give me the length of window: ')
    if answer == 'max':
        len_of_points = 0
    else:
        len_of_points = int(answer)
       
    #len_of_points = 3
    """

    fileinfo = {
        0: {
            'filename': 'base_kasteren.csv',
            'separator': ' ',
            'columns':
            ['date', 'time', 'attr1', 'attr2', 'state', 'concept:name']
        },
        1: {
            'filename':
            'activity3.csv',
            'separator':
            ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        2: {
            'filename': 'activitylog_uci_detailed_labour.xes',
            'separator': '',
            'columns': []
        },
        3: {
            'filename': 'atmo1.csv',
            'separator': ' ',
            'columns': ['date', 'time', 'concept:name', 'state', 'activity']
        },
        4: {
            'filename':
            'activity1.csv',
            'separator':
            ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        5: {
            'filename':
            'activity2.csv',
            'separator':
            ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        6: {
            'filename': 'espa.xes',
            'separator': ';',
            'columns': []
        },
        7: {
            'filename':
            'activity3.csv',
            'separator':
            ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        8: {
            'filename':
            'activity4.csv',
            'separator':
            ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        9: {
            'filename':
            'activity5.csv',
            'separator':
            ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        10: {
            'filename':
            'activity6.csv',
            'separator':
            ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        11: {
            'filename':
            'activity7.csv',
            'separator':
            ',',
            'columns': [
                'id', 'case:concept:name', 'subjectID', 'attr_starttime',
                'time:timestamp', 'concept:name', 'label_subactivity'
            ]
        },
        12: {
            'filename': 'BPI_Challenge_2017.xes',
            'separator': '',
            'columns': []
        },
    }

    #choose file

    filename = fileinfo[index]['filename']
    filepath = '../datasets/' + fileinfo[index]['filename']

    dataframe = pd.DataFrame()
    if not os.path.exists('results/' + filename):
        os.makedirs('results/' + filename)
    if not os.path.exists('results/' + filename + '/' + str(len_of_points) +
                          '/'):
        os.makedirs('results/' + filename + '/' + str(len_of_points) + '/')

    #if it is a csv file
    if (filename.find('.csv') != -1):
        #load file to dataframe
        dataframe = pd.read_csv(filepath,
                                sep=fileinfo[index]['separator'],
                                names=fileinfo[index]['columns'],
                                low_memory=False)
        # for the Kasteren dataset, prepare the timestamp and case columns
        if index in [0, 3, 12]:
            dataframe[
                'time:timestamp'] = dataframe['date'] + ' ' + dataframe['time']
            dataframe['case:concept:name'] = dataframe['date']
            #dataframe = dataframe[dataframe['concept:name']!='None']

        #print ("file is csv ")
        #print(dataframe.head(20))

        #drop nan

        # convert the dataframe to an event log
        log = pm4py.convert_to_event_log(dataframe)

    else:
        #the file is xes
        #import log
        #xes_importer.iterparse.Parameters.MAX_TRACES = 10
        #parameters = {xes_importer.iterparse.Parameters.MAX_TRACES: 50}
        #log = xes_importer.apply('datasets/BPI Challenge 2018.xes.gz', parameters=parameters)
        log = pm4py.read_xes(filepath)
        print(log)
        #convert to dataframe
        dataframe = pm4py.convert_to_dataframe(log)
        print(dataframe)
        #print(dataframe['time:timestamp'][0].replace(tzinfo=timezone.utc).astimezone(tz=None))
        #dataframe['time:timestamp'] = dataframe['time:timestamp'].dt.tz_convert(None)

    if index in [2, 12]:
        #process time:timestamp remove zone information
        dataframe['time:timestamp'] = dataframe[
            'time:timestamp'].dt.tz_convert(None)

    #del log
    print('Dataframe print\n', dataframe)
    # keep only events with lifecycle:transition == 'complete'; if the column does not exist, create it
    if 'lifecycle:transition' in dataframe.columns:

        dataframe = dataframe[dataframe['lifecycle:transition'] == 'complete']

    else:
        dataframe['lifecycle:transition'] = 'complete'

    #remove Start and End events
    dataframe = dataframe[dataframe['concept:name'] != 'Start']
    dataframe = dataframe[dataframe['concept:name'] != 'End']

    #sort by time
    if 'time:timestamp' in dataframe.columns:
        dataframe = dataframe.sort_values('time:timestamp')
    else:
        print('Error: no column time:timestamp in event log')

    #print('Sorted dataframe\n',dataframe)

    #plot time vs activity
    #fig, axes = plt.subplots(1, 1, figsize=(100, 100))
    #fig = dataframe.plot(x='time:timestamp', y='concept:name', kind="scatter").get_figure()
    #fig.savefig('results/'+filename+'/'+str(len_of_points) +'/conceptname.png', bbox_inches='tight')

    #plot time vs trace id
    #df.plot(x='col_name_1', y='col_name_2', style='o')
    #fig = dataframe.plot(x='time:timestamp', y='case:concept:name', kind="scatter").get_figure()
    #fig.savefig('results/'+filename+'/'+str(len_of_points) +'/caseconceptname.png', bbox_inches='tight')

    #keep only mandatory columns
    dataframe = dataframe[[
        'case:concept:name', 'concept:name', 'time:timestamp'
    ]]
    #convert sorted dataframe to log
    log = pm4py.convert_to_event_log(dataframe)

    #initial_df = dataframe.copy()
    #print('Initial dataframe\n',initial_df)
    #-----------------------------------------------------------------
    ############################################################
    #-------------- Resample -----------------------------------
    ###########################################################
    if resample_dataset:
        #preprocess timestamp to be prepared for resample
        #make time:timestamp datetime
        dataframe.loc[:, 'time:timestamp'] = pd.to_datetime(
            dataframe['time:timestamp'])
        #set time:timestamp as index
        dataframe = dataframe.set_index(["time:timestamp"])
        #remove duplicates
        #print('Duplicated\n')
        #print(dataframe[dataframe.index.duplicated()])
        dataframe = dataframe[~dataframe.index.duplicated(keep='first')]

        # ------ Resample the dataframe every 5 minutes; backfill NaN values with the next observation ------
        dataframe = dataframe.resample("5T").fillna("backfill")
        print('Resample', dataframe)
        #print( dataframe.last())

    #save resampled dataframe to csv
    dataframe.to_csv('../datasets/resampled_sorted_df.csv')

    #dataframe is initial event log sorted by time (start event only)
    #convert sorted by time dataframe back to log (xes)
    #log = pm4py.convert_to_event_log(dataframe)

    #-----------------------------------save to csv-------------------------------------
    #uncomment only if you need it
    #dataframe.to_csv('datasets/activitylog_uci_detailed_labour.csv')

    #print('\nDataframe LOG\n',dataframe)

    #--------------- Concat activities of a trace in one row ---------------
    #concat events with same case:concept:name (space separated)
    print('dataframe\n', dataframe)
    df = dataframe.groupby('case:concept:name', sort=False).agg(
        {'concept:name': lambda x: ' '.join(x)})
    print('df\n', df)
    df = df.reset_index()
    if len_of_points:
        print('--------------------------------------')
        df['concept:name'] = df['concept:name'].apply(
            lambda x: list(x.split(' ')))
        df['concept:name'] = df['concept:name'].apply(
            lambda x:
            [x[i:i + len_of_points] for i in range(0, len(x), len_of_points)])
        df = df.set_index('case:concept:name')['concept:name'].apply(
            pd.Series).stack().reset_index(level=0).rename(
                columns={0: 'concept:name'})
        df['concept:name'] = df['concept:name'].apply(lambda x: ' '.join(x))
        print('\ndftest\n', df)

    df = df.reset_index()  #check here

    #del dataframe

    #print the activities of the log
    activities = pm4py.get_attribute_values(log, 'concept:name')
    print('\nActivities:\n', activities)

    #split data from event log - 80% for train and 20% for test

    #shuffle before split
    #df = shuffle(df)
    #print('df', df,'\n')

    #----------------------- Split Train and Test data ----------------------
    #split rows depending on percentage
    split_rows = int(df.shape[0] * split_percentage)
    print('Split Rows', split_rows, '\n')

    #train dataframe
    train_df = df[:split_rows]
    train_df.to_csv('train.csv')
    print('Train Rows', train_df, '\n')

    #test dataframe
    test_df = df[split_rows:]
    test_df.to_csv('test.csv')
    #print('Test Rows', test_df,'\n')

    # --------------------------------------------------------------

    #data = df['concept:name'].copy().to_list()
    data = train_df['concept:name'].copy().to_list()
    #Just for Eating/Drinking
    #data = data.replace('Eating/Drinking','EatDrink')
    #print('Data\n',data)

    tokenizer = Tokenizer()
    #reads the words in data and gives an index for every words based on frequency
    tokenizer.fit_on_texts([data])
    print('Word index: ')
    print(tokenizer.word_index)

    # replace every word in the text with its corresponding word index - returns a list of lists with one element, so use [0] to get the one and only first list
    encoded = tokenizer.texts_to_sequences([data])[0]
    #print('encoded: \n')
    #print(encoded)
    vocab_size = len(tokenizer.word_index) + 1
    print('Vocabulary Size: %d' % vocab_size)
    #print('list\n',[e for e in encoded])
    #print('Min ',min([len(e) for e in encoded]))

    # LSTM 3 timesteps - prepare data - encode 2 words -> 1 word
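    # n_input is assumed to be a module-level constant holding the window size
    # (the comment above suggests 2: two activities are used to predict the next one)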
    sequences = list()
    for i in range(n_input, len(encoded)):
        sequence = encoded[i - n_input:i + 1]
        sequences.append(sequence)
    print('Total Sequences: %d' % len(sequences))
    print('Sequences: \n')
    print(sequences)

    max_length = max([len(seq) for seq in sequences])  #max_length is 3
    # Pad sequence to be of the same length
    # length of sequence must be 3 (maximum)
    # 'pre' or 'post': pad either before or after each sequence
    sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
    print('Max Sequence Length: %d' % max_length)

    #convert list to array to get X,y train
    sequences = array(sequences)
    X, y = sequences[:, :-1], sequences[:, -1]
    print('X: \n')
    print(X)
    print('y: \n')
    print(y)

    #convert y to binary vectors
    y = to_categorical(y, num_classes=vocab_size)
    print('y: \n')
    print(y)

    #test data
    test_data = test_df['concept:name'].copy().to_list()

    test_encoded = tokenizer.texts_to_sequences([test_data])[0]

    test_sequences = list()

    for i in range(n_input, len(test_encoded)):
        test_sequence = test_encoded[i - n_input:i + 1]
        test_sequences.append(test_sequence)
    max_length = max([len(seq) for seq in test_sequences])
    test_sequences = pad_sequences(test_sequences,
                                   maxlen=max_length,
                                   padding='pre')

    test_sequences = array(test_sequences)
    test_X, test_y = test_sequences[:, :-1], test_sequences[:, -1]

    #convert y to binary vectors
    test_yl = to_categorical(test_y, num_classes=vocab_size)

    model = Sequential()
    #the first layer
    # - the largest integer (i.e. word index) in the input should be no larger than vocabulary size
    # - The Embedding layer is initialized with random weights and will learn an embedding for all of the words in the training dataset.
    # - output_dim (50): This is the size of the vector space in which words will be embedded (size of the embedding vectors). It defines the size of the output vectors from this layer for each word. For example, it could be 32 or 100 or even larger. Test different values for your problem.
    # - input_length: This is the length of input sequences (here is 2)
    # The Embedding layer has weights that are learned. If you save your model to file, this will include weights for the Embedding layer.
    # The output of the Embedding layer is a 2D vector with one embedding for each word in the input sequence of words (input document).
    # If you wish to connect a Dense layer directly to an Embedding layer, you must first flatten the 2D output matrix to a 1D vector using the Flatten layer.

    model.add(
        Embedding(vocab_size + 1, LSTM_CELLS, input_length=max_length - 1))

    model.add(LSTM(vocab_size))
    model.add(Dropout(0.1))
    model.add(Dense(vocab_size, activation='softmax'))
    opt = Adam(learning_rate=0.001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    history = model.fit(X,
                        y,
                        epochs=500,
                        verbose=0,
                        batch_size=20,
                        validation_data=(test_X, test_yl))

    print(model.summary())
    model.save('lstm_model.h5')  # creates a HDF5 file

    #del model  # deletes the existing model

    #predict sequence of n_words activities
    def generate_seq(model, tokenizer, max_length, seed_text, n_words):
        #get input activity
        in_text = seed_text
        #print('in_text',in_text,'\n')
        #for the number of activities on sequence you want to predict
        for _ in range(n_words):
            encoded = tokenizer.texts_to_sequences([in_text])[0]
            #pad if less than max text length
            encoded = pad_sequences([encoded],
                                    maxlen=max_length,
                                    padding='pre')
            #print('in text ',in_text)
            #predict one activity
            #yhat = model.predict_classes(encoded, verbose=0)
            yhat = np.argmax(model.predict(encoded), axis=-1)
            out_word = ''
            for word, index in tokenizer.word_index.items():
                #convert predicted activity to word
                if index == yhat:
                    #print('Word',word,'\n')
                    out_word = word
                    break
            #feed the next input with the sequence of activities
            in_text += ' ' + out_word

        return in_text

    #load trained model
    #model = load_model('lstm_model.h5')

    # Evaluate network
    print('LSTM Network Evaluation:\n')
    train_score = model.evaluate(X, y, verbose=0)
    print('Train Score\n', train_score)
    score = model.evaluate(test_X, test_yl, verbose=0)
    print('Test Score\n')
    print(score)

    print('History\n')
    print(history.history.keys())
    # plot loss during training
    fig = plt.figure()
    plt.subplot(211)
    plt.title('Loss')
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.legend()
    fig.savefig('results/' + filename + '/' + str(len_of_points) + '/Loss.png',
                bbox_inches='tight')
    # plot accuracy during training
    fig = plt.figure()
    plt.subplot(212)
    plt.title('Accuracy')
    plt.plot(history.history['accuracy'], label='train')
    plt.plot(history.history['val_accuracy'], label='test')
    plt.legend()
    plt.show()
    fig.savefig('results/' + filename + '/' + str(len_of_points) +
                '/Accuracy.png',
                bbox_inches='tight')

    print('LSTM Results: ')
    print('\n')
    #generated_text = ''
    #sequence prediction
    for i in tokenizer.word_index:
        #print(tokenizer.index_word)
        w = generate_seq(model, tokenizer, max_length - 1, i, n_input + 1)
        #generated_text = generated_text.join('\n'+w)
        print(w)

    print('LSTM Results: ')
    print('\n')
    #for i in tokenizer.word_index:
    #	print(generate_seq(model, tokenizer, max_length-1, i , 1))
    all_data = df['concept:name'].copy().to_list()

    all_encoded = tokenizer.texts_to_sequences([all_data])[0]

    all_sequences = list()

    for i in range(n_input, len(all_encoded)):
        all_sequence = all_encoded[i - n_input:i + 1]
        all_sequences.append(all_sequence)
    max_length = max([len(seq) for seq in all_sequences])
    all_sequences = pad_sequences(all_sequences,
                                  maxlen=max_length,
                                  padding='pre')

    all_sequences = array(all_sequences)
    all_X, all_y = all_sequences[:, :-1], all_sequences[:, -1]

    #convert y to binary vectors
    all_yl = to_categorical(all_y, num_classes=vocab_size)

    #load trained model
    #model = load_model('lstm_model.h5')

    #print('Tokenizer \n',tokenizer)
    print('Tokenizer word index\n', tokenizer.word_index)

    np.set_printoptions(suppress=True)
    cnt = 0
    for i in range(len(all_X)):
        #yhat = model.predict_classes(all_X[i].reshape(1,2,1), verbose=0)
        yhat = np.argmax(model.predict(all_X[i].reshape(1, n_input, 1)),
                         axis=-1)
        df.loc[i, 'X_input'] = str(all_X[i])
        df.loc[i, 'Expected'] = all_y[i]
        df.loc[i, 'predicted'] = yhat

        #print('Expected:', all_y[i] , 'Predicted', yhat)
        prob = model.predict(all_X[i].reshape(1, n_input, 1))[0]
        df.loc[i,
               'probabilities'] = ' '.join([str(elem) for elem in list(prob)])
        if (all_y[i] == yhat):
            df.loc[i, 'result'] = 'ok'
            cnt += 1
        else:
            df.loc[i, 'result'] = 'Error'

    #print(df['predicted'].replace(tokenizer.word_index))
    df.to_csv('results/' + filename + '/' + str(len_of_points) + '/resample_' +
              str(resample_dataset) + '_lstm.csv')
    print('Total successful: ', cnt, ' out of ', len(all_X), 'Percentage: ',
          cnt / len(all_X))

    # predict probabilities for test set
    yhat_probs = model.predict(test_X, verbose=0)
    # predict crisp classes for test set
    yhat_classes = np.argmax(model.predict(test_X, verbose=0), axis=-1)
    print('yhat_classes\n', yhat_classes)
    # reduce to 1d array
    #yhat_probs = yhat_probs[:, 0]
    #yhat_classes = yhat_classes[:, 0]

    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(test_y, yhat_classes)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(test_y, yhat_classes, average='weighted')
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(test_y, yhat_classes, average='weighted')
    print('Recall: %f' % recall)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(test_y, yhat_classes, average='weighted')
    print('F1 score: %f' % f1)

    # kappa
    kappa = cohen_kappa_score(test_y, yhat_classes)
    print('Cohens kappa: %f' % kappa)
    # ROC AUC
    #auc = roc_auc_score(test_y, yhat_probs,multi_class='ovr')
    #print('ROC AUC: %f' % auc)
    # confusion matrix
    matrix = confusion_matrix(test_y, yhat_classes)
    print(matrix)
    fig = plt.figure()
    sns.heatmap(matrix, center=True)
    plt.show()
    fig.savefig('results/' + filename + '/' + str(len_of_points) +
                '/ConfusionMatrix.png',
                bbox_inches='tight')

    #headers
    #filename - resample - len of points - train loss + Accuracy - test score
    #write results to csv
    fd = open("total_results.csv", "a+")
    row = filename + '\t' + str(resample_dataset) + '\t' + str(
        len_of_points
    ) + '\t' + str(train_score[0]) + '\t' + str(train_score[1]) + '\t' + str(
        score[0]) + '\t' + str(score[1]) + '\t' + str(accuracy) + '\t' + str(
            precision) + '\t' + str(recall) + '\t' + str(f1) + '\t' + str(
                kappa) + '\t' + '' + '\t' + json.dumps(
                    tokenizer.word_index) + '\n'
    fd.write(row)
    fd.close()