def _evaluate_logs(parms, log, sim_log, rep_num):
    """Reads the simulation results and measures similarity metrics
    against the original log.
    Args:
        parms (dict): execution parameters; parms['gl'] holds the metric names
        log (DataFrame): original event log
        sim_log (DataFrame): simulated event log
        rep_num (int): repetition number
    """
    sim_values = list()
    log = copy.deepcopy(log)
    # Drop the artificial Start/End events before comparing
    log = log[~log.task.isin(['Start', 'End'])]
    log['source'] = 'log'
    # The target column name is masked ('******') in this listing
    log.rename(columns={'user': '******'}, inplace=True)
    log['caseid'] = 'Case' + log['caseid'].astype(str)
    evaluator = sim.SimilarityEvaluator(log,
                                        sim_log,
                                        parms['gl'],
                                        max_cases=1000)
    metrics = [parms['gl']['sim_metric']]
    if 'add_metrics' in parms['gl']:
        metrics = list(set(list(parms['gl']['add_metrics']) + metrics))
    for metric in metrics:
        evaluator.measure_distance(metric)
        sim_values.append({'run_num': rep_num, **evaluator.similarity})
    return sim_values
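
These snippets omit their imports; a minimal preamble that covers them (the `sim`/`ev` aliases point at the project's similarity-evaluator module, whose exact import path is not shown in this listing):

    import copy
    import os
    import traceback
    from copy import deepcopy
    from math import ceil

    import numpy as np
    import pandas as pd

A hypothetical call, with made-up parameter values (the metric names 'dl' and 'mae' appear elsewhere in these examples):

    parms = {'gl': {'sim_metric': 'dl', 'add_metrics': ['mae']}}  # hypothetical
    results = _evaluate_logs(parms, log, sim_log, rep_num=0)
    # one dict per measured metric, each tagged with 'run_num': 0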
Example #2
def evaluate(settings, data, sim_log):
    """Reads the simulation results and measures the 'dl' distance
    against the original log.
    Args:
        settings (dict): evaluator settings
        data (DataFrame): original event log
        sim_log (DataFrame): simulated event log, carrying its run_num
    """
    rep = sim_log.iloc[0].run_num
    sim_values = list()
    evaluator = sim.SimilarityEvaluator(data,
                                        sim_log,
                                        settings,
                                        max_cases=1000)
    evaluator.measure_distance('dl')
    sim_values.append({'run_num': rep, **evaluator.similarity})
    return sim_values
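
Unlike `_evaluate_logs`, this variant reads the repetition number from the simulated log itself, so the simulated frame must already carry a `run_num` column; a hypothetical call:

    values = evaluate(settings, data, sim_log)
    # values[0]['run_num'] == sim_log.iloc[0].run_num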
Example #3
def timeseries_test():
    # Compare a real inter-arrival time series against a generated one
    parms = load_parms()
    serie1 = pd.read_csv(os.path.join('tests', 'fixtures', 'ia_valdn.csv'))
    serie2 = pd.read_csv(os.path.join('tests', 'fixtures', 'ia_valdn_gen.csv'))
    serie1 = serie1[['caseid', 'timestamp']]
    serie1['timestamp'] = pd.to_datetime(serie1['timestamp'],
                                         format="%Y-%m-%d %H:%M:%S.%f")
    serie2 = serie2[['caseid', 'timestamp']]
    serie2['timestamp'] = pd.to_datetime(serie2['timestamp'],
                                         format="%Y-%m-%d %H:%M:%S.%f")

    evaluation = sim.SimilarityEvaluator(serie1, serie2, parms, dtype='serie')
    evaluation.measure_distance('day_emd')
    print(evaluation.similarity)
    evaluation.measure_distance('day_hour_emd')
    print(evaluation.similarity)
    evaluation.measure_distance('cal_emd')
    print(evaluation.similarity)
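
The series variant (`dtype='serie'`) compares plain (caseid, timestamp) frames rather than full event logs; a sketch of the expected input shape, with made-up values:

    serie = pd.DataFrame({
        'caseid': ['Case1', 'Case2', 'Case3'],
        'timestamp': pd.to_datetime(['2021-01-04 09:00:00.000',
                                     '2021-01-04 09:12:30.500',
                                     '2021-01-04 10:01:05.250'])})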
Example #4
def _evaluate_predict_log(parms, log, sim_log, rep_num):
    """Reads the prediction results and measures a fixed set of distances
    against the original log.
    Args:
        parms (dict): execution parameters
        log (DataFrame): original event log
        sim_log (DataFrame): predicted/simulated event log
        rep_num (int): repetition number
    """
    sim_values = list()
    log = copy.deepcopy(log)
    # Drop artificial start/end events from both logs before comparing
    log = log[~log.task.isin(['Start', 'End', 'start', 'end'])]
    log['caseid'] = 'Case' + log['caseid'].astype(str)
    sim_log = sim_log[~sim_log.task.isin(['Start', 'End', 'start', 'end'])]
    evaluator = ev.SimilarityEvaluator(log, sim_log, parms)
    metrics = ['tsd', 'day_hour_emd', 'log_mae', 'dl', 'mae']
    for metric in metrics:
        evaluator.measure_distance(metric)
        sim_values.append({'run_num': rep_num, **evaluator.similarity})
    return sim_values
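
A common follow-up (not shown in the listing) is to flatten the per-repetition result lists into one frame; `sim_logs` is a hypothetical list of simulated logs, and the `sim_val` column follows the usage in the `create_model` example below:

    import itertools
    all_values = [_evaluate_predict_log(parms, log, s, i)
                  for i, s in enumerate(sim_logs)]  # sim_logs: hypothetical
    results = pd.DataFrame(itertools.chain.from_iterable(all_values))
    print(results.groupby('run_num').sim_val.mean())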
Example #5
def log_test_3():
    parms = load_parms()
    event_log = pd.read_csv(os.path.join('tests', 'fixtures', 'event_log.csv'))
    event_log['start_timestamp'] = pd.to_datetime(
        event_log['start_timestamp'], format="%Y-%m-%d %H:%M:%S.%f")
    event_log['end_timestamp'] = pd.to_datetime(event_log['end_timestamp'],
                                                format="%Y-%m-%d %H:%M:%S.%f")
    event_log = event_log[~event_log.task.isin(['Start', 'End'])]
    if pd.api.types.is_numeric_dtype(event_log['caseid']):
        event_log['caseid'] = event_log['caseid'] + 1
        event_log['caseid'] = event_log['caseid'].astype(str)
    event_log['caseid'] = 'Case' + event_log['caseid']
    # Compare the log against an identical copy of itself
    event_log_2 = deepcopy(event_log)
    evaluation = sim.SimilarityEvaluator(event_log,
                                         event_log_2,
                                         parms,
                                         max_cases=100)
    measure(evaluation)
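
Both log tests call a `measure` helper that the listing omits. A plausible stand-in, consistent with the metric names used elsewhere in these examples (hypothetical, not the original helper):

    def measure(evaluation):
        # Hypothetical stand-in for the omitted test helper
        for metric in ['tsd', 'dl', 'mae']:
            evaluation.measure_distance(metric)
            print(evaluation.similarity)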
Example #6
def log_test_2():
    parms = load_parms()
    event_log = pd.read_csv(
        os.path.join('tests', 'fixtures',
                     'BPI_Challenge_2012_W_Two_TS_test.csv'))
    event_log['start_timestamp'] = pd.to_datetime(
        event_log['start_timestamp'], format="%Y-%m-%d %H:%M:%S.%f")
    event_log['end_timestamp'] = pd.to_datetime(event_log['end_timestamp'],
                                                format="%Y-%m-%d %H:%M:%S.%f")
    event_log = event_log[~event_log.task.isin(['Start', 'End'])]
    event_log['caseid'] = event_log['caseid'] + 1
    # Double the log: append a copy whose case ids are shifted
    # past the current maximum
    max_c = event_log.caseid.max()
    event_log_c = deepcopy(event_log)
    event_log_c['caseid'] = event_log_c['caseid'] + max_c
    event_log = pd.concat([event_log, event_log_c], axis=0, ignore_index=True)
    event_log['caseid'] = 'Case' + event_log['caseid'].astype(str)
    # Compare the doubled log against an identical copy of itself
    event_log_2 = deepcopy(event_log)
    evaluation = sim.SimilarityEvaluator(event_log, event_log_2, parms)
    measure(evaluation)
Example #7

def create_model(window, ia_times, ia_valdn, parms):
    try:
        hist_range = [0, int(window * 3600)]
        ia_times['hour'] = ia_times.apply(lambda x: x['timestamp'].hour, axis=1)
        ia_times['date'] = ia_times.apply(lambda x: x['timestamp'].date(), axis=1)
        # The grouping below also needs a 'weekday' column; the listing does
        # not create it, so it is assumed to exist upstream. A guarded fix:
        if 'weekday' not in ia_times.columns:
            ia_times['weekday'] = ia_times['timestamp'].dt.dayofweek
        # Create time windows: map each hour of the day to a window index
        i = 0
        daily_windows = dict()
        for x in range(24):
            if x % window == 0:
                i += 1
            daily_windows[x] = i
        ia_times = ia_times.merge(
            pd.DataFrame.from_dict(daily_windows,
                                   orient='index').rename_axis('hour'),
            on='hour',
            how='left').rename(columns={0: 'window'})
        # Compute inter-arrival times per (window, date, weekday) group
        inter_arrival = list()
        for key, group in ia_times.groupby(['window', 'date', 'weekday']):
            w_df = group.copy().reset_index()
            prev_time = w_df.timestamp.min().floor(freq='H')
            for i, item in w_df.iterrows():
                inter_arrival.append({
                    'window': key[0],
                    'weekday': item.weekday,
                    'intertime': (item.timestamp - prev_time).total_seconds(),
                    'date': item.date})
                prev_time = item.timestamp
        # Fit a distribution per (window, weekday), trimming outliers
        distribs = dict()
        for key, group in pd.DataFrame(inter_arrival).groupby(
                ['window', 'weekday']):
            intertime = group.intertime
            if len(intertime) > 2:
                intertime = intertime[intertime.between(
                    intertime.quantile(.15), intertime.quantile(.85))]
            distrib = dist_best(intertime, hist_range)
            # TODO: figure out why this works with half of the cases???
            number = group.groupby('date').intertime.count()
            if len(number) > 2:
                number = number[number.between(number.quantile(.15),
                                               number.quantile(.85))]
            distrib['num'] = ceil(number.median() / 2)
            if distrib['dist'] == 'lognorm':
                distrib['mean'] = np.mean(group.intertime)
                distrib['var'] = np.var(group.intertime)
            # Index distributions by window, then weekday (a plain assignment
            # here would keep only the last weekday per window)
            distribs.setdefault(str(key[0]), {})[str(key[1])] = distrib
        model = {'window': window,
                 'daily_windows': {str(k): v
                                   for k, v in daily_windows.items()},
                 'distribs': distribs}

        # Validation: generate traces with the model and compare them
        # against the hold-out series using an hourly EMD
        num_inst = len(ia_valdn.caseid.unique())
        # Get the minimum date as the simulation start time
        start_time = ia_valdn.timestamp.min().strftime(
            "%Y-%m-%dT%H:%M:%S.%f+00:00")
        times = generate_traces(model, num_inst, start_time)
        evaluation = sim.SimilarityEvaluator(ia_valdn,
                                             times,
                                             parms,
                                             0,
                                             dtype='serie')
        evaluation.measure_distance('hour_emd')
        return {'model': model,
                'loss': evaluation.similarity['sim_val']}
    except Exception:
        traceback.print_exc()
        return {'model': [], 'loss': 1}
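
For intuition, the `daily_windows` mapping built in `create_model` with `window=8` assigns hours 0-7 to window 1, 8-15 to window 2, and 16-23 to window 3:

    daily_windows, i = {}, 0
    for x in range(24):
        if x % 8 == 0:
            i += 1
        daily_windows[x] = i
    assert daily_windows[0] == 1 and daily_windows[8] == 2
    assert daily_windows[23] == 3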