def _evaluate_logs(parms, log, sim_log, rep_num):
    """Compares an event log against one simulated replication log.

    Args:
        parms (dict): execution parameters; ``parms['gl']`` carries the
            evaluator settings, the main ``sim_metric`` and an optional
            ``add_metrics`` list.
        log (pandas.DataFrame): original event log (left untouched; a deep
            copy is modified internally).
        sim_log (pandas.DataFrame): simulated log of one replication.
        rep_num (int): replication number recorded in each result row.

    Returns:
        list: one dict per measured metric, each tagged with ``run_num``.
    """
    sim_values = list()
    # Work on a copy: the filtering/renaming below must not leak to the caller
    log = copy.deepcopy(log)
    log = log[~log.task.isin(['Start', 'End'])]
    log['source'] = 'log'
    # NOTE(review): '******' looks like a redacted/mangled column name —
    # confirm the intended target (e.g. 'resource') against the evaluator's
    # expected schema before relying on this rename.
    log.rename(columns={'user': '******'}, inplace=True)
    # Normalize case ids to the simulator's 'Case<N>' string format
    log['caseid'] = 'Case' + log['caseid'].astype(str)
    evaluator = sim.SimilarityEvaluator(log, sim_log, parms['gl'], max_cases=1000)
    metrics = [parms['gl']['sim_metric']]
    if 'add_metrics' in parms['gl']:
        # Merge the extra metrics with the main one, deduplicating
        metrics = list(set(list(parms['gl']['add_metrics']) + metrics))
    for metric in metrics:
        evaluator.measure_distance(metric)
        sim_values.append({**{'run_num': rep_num}, **evaluator.similarity})
    return sim_values
def evaluate(settings, data, sim_log):
    """Measures the 'dl' similarity between source data and one simulated log.

    Args:
        settings (dict): evaluator settings forwarded to SimilarityEvaluator.
        data (pandas.DataFrame): original event log.
        sim_log (pandas.DataFrame): simulated log; its ``run_num`` column
            identifies the replication.

    Returns:
        list: a single dict with ``run_num`` plus the similarity results.
    """
    # All rows of sim_log belong to one replication, so read run_num once
    rep = sim_log.iloc[0].run_num
    evaluator = sim.SimilarityEvaluator(data, sim_log, settings, max_cases=1000)
    evaluator.measure_distance('dl')
    return [{'run_num': rep, **evaluator.similarity}]
def timeseries_test():
    """Smoke-tests the EMD time-series metrics on the inter-arrival fixtures."""
    parms = load_parms()
    fixtures_dir = os.path.join('tests', 'fixtures')
    ts_format = "%Y-%m-%d %H:%M:%S.%f"

    def read_serie(filename):
        # Keep only the columns the evaluator needs, with parsed timestamps
        serie = pd.read_csv(os.path.join(fixtures_dir, filename))
        serie = serie[['caseid', 'timestamp']]
        serie['timestamp'] = pd.to_datetime(serie['timestamp'], format=ts_format)
        return serie

    serie1 = read_serie('ia_valdn.csv')
    serie2 = read_serie('ia_valdn_gen.csv')
    evaluation = sim.SimilarityEvaluator(serie1, serie2, parms, dtype='serie')
    for metric in ('day_emd', 'day_hour_emd', 'cal_emd'):
        evaluation.measure_distance(metric)
        print(evaluation.similarity)
def _evaluate_predict_log(parms, log, sim_log, rep_num):
    """Compares an event log against one predicted log over a fixed set of
    metrics ('tsd', 'day_hour_emd', 'log_mae', 'dl', 'mae').

    Args:
        parms (dict): evaluator settings forwarded to SimilarityEvaluator.
        log (pandas.DataFrame): original event log (left untouched; a deep
            copy is modified internally).
        sim_log (pandas.DataFrame): predicted log of one replication.
        rep_num (int): replication number recorded in each result row.

    Returns:
        list: one dict per metric, each tagged with ``run_num``.
    """
    # Synthetic boundary events are excluded from both logs before comparison
    dummy_tasks = ['Start', 'End', 'start', 'end']
    sim_values = list()
    # Work on a copy: the filtering below must not leak to the caller
    log = copy.deepcopy(log)
    log = log[~log.task.isin(dummy_tasks)]
    # Normalize case ids to the 'Case<N>' string format
    log['caseid'] = 'Case' + log['caseid'].astype(str)
    sim_log = sim_log[~sim_log.task.isin(dummy_tasks)]
    evaluator = ev.SimilarityEvaluator(log, sim_log, parms)
    metrics = ['tsd', 'day_hour_emd', 'log_mae', 'dl', 'mae']
    for metric in metrics:
        evaluator.measure_distance(metric)
        sim_values.append({**{'run_num': rep_num}, **evaluator.similarity})
    return sim_values
def log_test_3():
    """Evaluates the event_log fixture against an identical copy of itself."""
    parms = load_parms()
    ts_format = "%Y-%m-%d %H:%M:%S.%f"
    fixture = os.path.join('tests', 'fixtures', 'event_log.csv')
    event_log = pd.read_csv(fixture)
    for col in ('start_timestamp', 'end_timestamp'):
        event_log[col] = pd.to_datetime(event_log[col], format=ts_format)
    event_log = event_log[~event_log.task.isin(['Start', 'End'])]
    if pd.api.types.is_numeric_dtype(event_log['caseid']):
        # Numeric ids are shifted by one before formatting (original behavior)
        event_log['caseid'] = event_log['caseid'] + 1
    event_log['caseid'] = 'Case' + event_log['caseid'].astype(str)
    # Compare the log against a deep copy of itself
    event_log_2 = deepcopy(event_log)
    evaluation = sim.SimilarityEvaluator(event_log, event_log_2, parms,
                                         max_cases=100)
    measure(evaluation)
def log_test_2():
    """Evaluates the BPI 2012 fixture against itself after doubling its cases."""
    parms = load_parms()
    ts_format = "%Y-%m-%d %H:%M:%S.%f"
    fixture = os.path.join('tests', 'fixtures',
                           'BPI_Challenge_2012_W_Two_TS_test.csv')
    event_log = pd.read_csv(fixture)
    for col in ('start_timestamp', 'end_timestamp'):
        event_log[col] = pd.to_datetime(event_log[col], format=ts_format)
    event_log = event_log[~event_log.task.isin(['Start', 'End'])]
    event_log['caseid'] = event_log['caseid'] + 1
    # Double the log: append a copy of every case with shifted ids
    max_c = event_log.caseid.max()
    event_log_c = deepcopy(event_log)
    event_log_c['caseid'] = event_log_c['caseid'] + max_c
    event_log = pd.concat([event_log, event_log_c], axis=0, ignore_index=True)
    event_log['caseid'] = 'Case' + event_log['caseid'].astype(str)
    # Compare the doubled log against a deep copy of itself
    event_log_2 = deepcopy(event_log)
    evaluation = sim.SimilarityEvaluator(event_log, event_log_2, parms)
    measure(evaluation)
def create_model(window, ia_times, ia_valdn, parms):
    """Fits an inter-arrival time model over *window*-hour slots of the day
    and scores it against a validation series.

    Args:
        window (int): size in hours of each daily time window.
        ia_times (pandas.DataFrame): inter-arrival observations; reads
            'timestamp' and groups on a 'weekday' column — assumed to be
            provided by the caller, TODO confirm.
        ia_valdn (pandas.DataFrame): validation series with 'caseid' and
            'timestamp', used to score the fitted model.
        parms (dict): evaluator settings for SimilarityEvaluator.

    Returns:
        dict: {'model': fitted model dict, 'loss': similarity value}. On any
        failure the traceback is printed and {'model': [], 'loss': 1} is
        returned (best-effort: callers treat failures as maximum loss).
    """
    try:
        # Histogram support for the distribution fit: [0, window in seconds]
        hist_range = [0, int((window * 3600))]
        day_hour = lambda x: x['timestamp'].hour
        ia_times['hour'] = ia_times.apply(day_hour, axis=1)
        date = lambda x: x['timestamp'].date()
        ia_times['date'] = ia_times.apply(date, axis=1)
        # create time windows: map every hour 0..23 to its 1-based window id
        i = 0
        daily_windows = dict()
        for x in range(24):
            if x % window == 0:
                i += 1
            daily_windows[x] = i
        # Attach the window id of each event via a merge on its hour
        ia_times = ia_times.merge(
            pd.DataFrame.from_dict(daily_windows,
                                   orient='index').rename_axis('hour'),
            on='hour',
            how='left').rename(columns={0: 'window'})
        # Build per-event inter-arrival times within each (window, date,
        # weekday) group, measured from the hour-floored first timestamp
        inter_arrival = list()
        for key, group in ia_times.groupby(
                ['window', 'date', 'weekday']):
            w_df = group.copy()
            w_df = w_df.reset_index()
            prev_time = w_df.timestamp.min().floor(freq='H')
            # NOTE(review): this loop shadows the window counter `i` above;
            # harmless here since daily_windows is already built
            for i, item in w_df.iterrows():
                inter_arrival.append({
                    'window': key[0],
                    'weekday': item.weekday,
                    'intertime': (item.timestamp - prev_time).total_seconds(),
                    'date': item.date
                })
                prev_time = item.timestamp
        # Fit one distribution per (window, weekday) pair
        distribs = dict()
        for key, group in pd.DataFrame(inter_arrival).groupby(
                ['window', 'weekday']):
            intertime = group.intertime
            if len(intertime) > 2:
                # Trim outliers: keep the 15th-85th percentile band
                intertime = intertime[intertime.between(
                    intertime.quantile(.15), intertime.quantile(.85))]
            distrib = dist_best(intertime, hist_range)
            # TODO: figure out why it works with half of the cases???
            number = group.groupby('date').intertime.count()
            if len(number) > 2:
                # Same percentile trimming on the per-day event counts
                number = number[number.between(number.quantile(.15),
                                               number.quantile(.85))]
            # distrib['num'] = int(number.median()/2)
            distrib['num'] = ceil(number.median() / 2)
            # distrib['num'] = int(number.median())
            if distrib['dist'] == 'lognorm':
                # lognorm parameterization needs the raw sample moments
                distrib['mean'] = np.mean(group.intertime)
                distrib['var'] = np.var(group.intertime)
            # NOTE(review): assigning a fresh dict here overwrites earlier
            # weekday entries for the same window — only the last weekday per
            # window survives; confirm whether an update/merge was intended
            distribs[str(key[0])] = {str(key[1]): distrib}
        model = {
            'window': window,
            'daily_windows': {str(k): v for k, v in daily_windows.items()},
            'distribs': distribs
        }
        # validation: generate as many traces as the validation set has cases
        num_inst = len(ia_valdn.caseid.unique())
        # get minimum date as the generation start time
        start_time = (ia_valdn.timestamp.min().strftime(
            "%Y-%m-%dT%H:%M:%S.%f+00:00"))
        times = generate_traces(model, num_inst, start_time)
        # ia_valdn = ia_valdn[['caseid', 'timestamp']]
        # times = times[['caseid', 'timestamp']]
        evaluation = sim.SimilarityEvaluator(ia_valdn, times, parms, 0,
                                             dtype='serie')
        evaluation.measure_distance('hour_emd')
        return {
            'model': model,
            'loss': evaluation.similarity['sim_val']
        }
    except Exception:
        # Best-effort: the surrounding search treats any failure as max loss
        traceback.print_exc()
        return {'model': [], 'loss': 1}