def log_to_activity_steps(log, cid_list, activity_list, stepsz):
    """Convert an event log into a per-case sequence of integer step ids.

    A "step" is a tuple of consecutive activities (size ``stepsz``) produced
    by ``pmutils.mk_possible_steps``.  Each possible step gets an integer id,
    and each case in ``log`` is re-encoded as the array of step ids of its
    consecutive activity pairs (with an artificial start activity prepended).

    Parameters
    ----------
    log : pandas.DataFrame with at least 'caseId' and 'activity' columns.
    cid_list : iterable of case ids to encode.
    activity_list : list of all activity labels.
    stepsz : step size forwarded to ``pmutils.mk_possible_steps``.

    Returns
    -------
    tuple ``(step_log, step_mapping, rev_step_mapping, next_step_mapping)``:
        step_log          -- dict mapping case id -> np.ndarray of step ids
        step_mapping      -- dict mapping step tuple -> step id
        rev_step_mapping  -- dict mapping step id -> step tuple
        next_step_mapping -- dict mapping step id -> list of step ids whose
                             first activity equals this step's last activity
    """
    step_list = pmutils.mk_possible_steps(activity_list, stepsz=stepsz)
    print('Number of steps: {}'.format(len(step_list)))
    # bidirectional mapping between a step (tuple) and its integer id
    step_mapping = {step_list[i]: i for i in range(len(step_list))}
    rev_step_mapping = {val: key for key, val in step_mapping.items()}
    step_list = np.asarray(list(map(lambda step: step_mapping[tuple(step)],
                                    step_list)))
    # create next step mapping for each step: a step can follow another
    # when its first activity equals the other's last activity
    get_next_step = lambda step: list(filter(
        lambda step1: rev_step_mapping[step1][0] == rev_step_mapping[step][-1],
        step_list))
    next_step_mapping = {step: get_next_step(step) for step in step_list}
    # convert log to step log dict
    step_log = dict()
    for cid in cid_list:
        case = log[(log['caseId'] == cid)]['activity'].tolist()
        case = np.append([datrep.ARTIFICIAL_START, ], case)
        step_case = np.asarray([(case[i - 1], case[i])
                                for i in range(1, len(case))])
        step_case = map(lambda step: step_mapping[tuple(step)], step_case)
        step_case = np.asarray(list(step_case))
        step_log[cid] = step_case
    # BUG FIX: the original built all of the above and returned None,
    # discarding every result.  Return the computed structures instead
    # (backward compatible -- callers that ignored the return still work).
    return step_log, step_mapping, rev_step_mapping, next_step_mapping
def run(param, data_dict=None):
    """Run the full FM experiment with lifecycle features and a
    train/valid/test split: build sparse matrices, select the number of
    iterations on the validation set, retrain, predict, and export results.

    Parameters
    ----------
    param : dict of experiment settings ('result_dir', 'log_dir', 'stepsz',
        'feature', 'train_perc', 'valid_perc', model hyper-parameters, ...).
    data_dict : optional dict of precomputed artifacts from a previous run;
        when given, the expensive setup/matrix-building steps are skipped
        and the cached values are reused.

    Returns
    -------
    data_dict populated with the log, item lists, cid splits, matrices and
    sparsity statistics (returned early after setup when
    ``param['feature'] == 'only_setup'``).
    """
    # BUG FIX: 'data_dict == None' -> identity check per PEP 8
    if data_dict is None:
        has_data = False
        data_dict = dict()
    else:
        has_data = True
    # write params
    param_df = pd.DataFrame(param, index=[0])
    param_df = param_df[sorted(param_df.columns.values)]
    param_df.to_csv(param['result_dir'] + os.sep + 'params.csv', index=False)
    # read log
    print('Reading log...')
    if not has_data:
        log = pd.read_csv(param['log_dir'], index_col=False,
                          dtype=param['column_dtypes'])
        # filter by minimum case lengths
        excl_cids = log[['caseId', 'activity']]
        excl_cids = excl_cids.groupby('caseId', as_index=False).count()
        min_len = param['min_case_len']
        excl_cids = excl_cids[(excl_cids['activity'] <= min_len)]
        excl_cids = excl_cids.loc[:, ['caseId']]
        # BUG FIX: isin(DataFrame) iterates column *names*, so the original
        # compared case ids against the string 'caseId' and excluded nothing;
        # filter against the 'caseId' column instead.
        log = log[~(log['caseId'].isin(excl_cids['caseId']))]
        data_dict['log'] = log
        data_dict['excl_cids'] = excl_cids
    else:
        log = data_dict['log']
        excl_cids = data_dict['excl_cids']
    excl_cids.to_csv(param['result_dir'] + os.sep + 'excluded.csv',
                     index=False)
    # creating all the itemlists
    print('Creating itemlists...')
    if not has_data:
        # activity list
        activity_list = sorted(log['activity'].unique().tolist())
        # steps from activity list, one list per step size
        step_dict = dict()
        for sz in range(2, param['stepsz'] + 1):
            step_dict[sz] = rutils.mk_possible_steps(activity_list, sz)
        # caseid list
        cid_list = log['caseId'].unique()
        cid_list.sort()
        # next step mapping: steps of size 2 whose first activity is `act`
        next_step_mapping = dict()
        for act in activity_list:
            possible = filter(lambda s: s.split('->')[0] == act,
                              step_dict[2])
            next_step_mapping[act] = np.asarray(list(possible))
        # activity lifecycle list ('activity+lifecycle' labels)
        activity_lc_list = log[(log['activity'] != 'ARTIFICIAL_START')]
        activity_lc_list = activity_lc_list[['activity', 'lifecycle']]
        activity_lc_list = activity_lc_list.drop_duplicates().values
        activity_lc_list = list(map(lambda c: '+'.join(c), activity_lc_list))
        # add back ARTIFICIAL_START to the mix
        activity_lc_list = np.append(activity_lc_list, ['ARTIFICIAL_START'])
        activity_lc_list.sort()
        # step dict with lifecycle
        step_dict_lc = dict()
        for sz in range(2, param['stepsz'] + 2):
            step_dict_lc[sz] = rutils.mk_possible_steps(activity_lc_list, sz)
        data_dict['activity_list'] = activity_list
        data_dict['cid_list'] = cid_list
        data_dict['step_dict'] = step_dict
        data_dict['next_step_mapping'] = next_step_mapping
        data_dict['activity_lc_list'] = activity_lc_list
        data_dict['step_dict_lc'] = step_dict_lc
    else:
        activity_list = data_dict['activity_list']
        cid_list = data_dict['cid_list']
        step_dict = data_dict['step_dict']
        next_step_mapping = data_dict['next_step_mapping']
        activity_lc_list = data_dict['activity_lc_list']
        step_dict_lc = data_dict['step_dict_lc']
    if param['feature'] == 'only_setup':
        return data_dict
    print('Splitting train and test set...')
    if not has_data:
        train_cid_list, valid_cid_list, test_cid_list = \
            fmexp.train_valid_test_split(cid_list, param['train_perc'],
                                         param['valid_perc'])
        data_dict['train_cid_list'] = train_cid_list
        data_dict['valid_cid_list'] = valid_cid_list
        data_dict['test_cid_list'] = test_cid_list
    else:
        train_cid_list = data_dict['train_cid_list']
        valid_cid_list = data_dict['valid_cid_list']
        test_cid_list = data_dict['test_cid_list']
    print('Building train matrix...')
    time_dict = dict()
    time_dict['build_train'] = time.time()
    if not has_data:
        train_log = log[(log['caseId'].isin(train_cid_list))]
        (train_X_datalist, train_X_row_inds, train_X_col_inds,
         train_X_shape, train_y_datalist, id_list) = \
            datrep.step_log_to_fm_format_all(param['feature'],
                                             train_log, param['stepsz'],
                                             cid_list, step_dict,
                                             step_dict_lc,
                                             next_step_mapping, 3,
                                             param['negative_samples'],
                                             param['seed'],
                                             param['normalize'])
        print('train_X shape: {}'.format(train_X_shape))
        train_X = csc_matrix((train_X_datalist,
                              (train_X_row_inds, train_X_col_inds)),
                             train_X_shape)
        train_y = train_y_datalist
        data_dict['train_X'] = train_X
        data_dict['train_y'] = train_y
        sparsity_dict = dict()
        sparsity_dict['train'] = \
            len(train_X_datalist) * 1. / train_X_shape.prod()
        data_dict['sparsity'] = sparsity_dict
    else:
        train_X = data_dict['train_X']
        train_y = data_dict['train_y']
        sparsity_dict = data_dict['sparsity']
    time_dict['build_train'] = time.time() - time_dict['build_train']
    print('Took {} seconds to build train matrix.'
          .format(time_dict['build_train']))
    print('Train matrix shape: {}'.format(train_X.shape))
    print('Building valid matrix...')
    time_dict['build_valid'] = time.time()
    if not has_data:
        valid_log = log[(log['caseId'].isin(valid_cid_list))]
        (valid_X_datalist, valid_X_row_inds, valid_X_col_inds,
         valid_X_shape, valid_y_datalist, _) = \
            datrep.step_log_to_fm_format_all(param['feature'],
                                             valid_log, param['stepsz'],
                                             cid_list, step_dict,
                                             step_dict_lc,
                                             next_step_mapping,
                                             param['minpartialsz'],
                                             negative_samples=-1,
                                             seed=param['seed'],
                                             normalize=param['normalize'])
        valid_X = csc_matrix((valid_X_datalist,
                              (valid_X_row_inds, valid_X_col_inds)),
                             valid_X_shape)
        valid_y = valid_y_datalist
        data_dict['valid_X'] = valid_X
        data_dict['valid_y'] = valid_y
        sparsity_dict['valid'] = \
            len(valid_X_datalist) * 1. / valid_X_shape.prod()
    else:
        valid_X = data_dict['valid_X']
        valid_y = data_dict['valid_y']
    time_dict['build_valid'] = time.time() - time_dict['build_valid']
    print('Took {} seconds to build valid matrix.'
          .format(time_dict['build_valid']))
    print('Valid matrix shape: {}'.format(valid_X.shape))
    print('Building test matrix...')
    time_dict['build_test'] = time.time()
    if not has_data:
        test_log = log[(log['caseId'].isin(test_cid_list))]
        (test_X_datalist, test_X_row_inds, test_X_col_inds,
         test_X_shape, test_y_datalist, pred_id_list) = \
            datrep.step_log_to_fm_format_all(param['feature'],
                                             test_log, param['stepsz'],
                                             cid_list, step_dict,
                                             step_dict_lc,
                                             next_step_mapping,
                                             param['minpartialsz'],
                                             negative_samples=-1,
                                             seed=param['seed'],
                                             normalize=param['normalize'])
        test_X = csc_matrix((test_X_datalist,
                             (test_X_row_inds, test_X_col_inds)),
                            test_X_shape)
        test_y = test_y_datalist
        data_dict['test_X'] = test_X
        data_dict['test_y'] = test_y
        data_dict['pred_id_list'] = pred_id_list
        sparsity_dict['test'] = \
            len(test_X_datalist) * 1. / test_X_shape.prod()
    else:
        test_X = data_dict['test_X']
        test_y = data_dict['test_y']
        pred_id_list = data_dict['pred_id_list']
    time_dict['build_test'] = time.time() - time_dict['build_test']
    print('Took {} seconds to build test matrix.'
          .format(time_dict['build_test']))
    print('Test matrix shape: {}'.format(test_X.shape))
    # build fm model
    fm_model = fmexp.build_model(param['model_type'], param)
    # model selection on number of iterations
    best_it, time_df, rmse_df = fmexp.model_select(train_X, train_y,
                                                   valid_X, valid_y,
                                                   fm_model,
                                                   param['max_iter'],
                                                   param['step_size_als'],
                                                   param['stop_delta'])
    print('Best n_iter: {}'.format(best_it))
    # retrain a fresh fm model to the optimal num of iterations
    print('Retraining model for prediction...')
    time_dict['retrain'] = time.time()
    fm_model = fmexp.build_model(param['model_type'], param)
    fm_model.fit(train_X, train_y)
    fm_model.fit(train_X, train_y, n_more_iter=best_it)
    time_dict['retrain'] = time.time() - time_dict['retrain']
    print('Took {} seconds to retrain model.'
          .format(time_dict['retrain']))
    print('Making predictions...')
    time_dict['test'] = time.time()
    y_pred = fm_model.predict(test_X)
    time_dict['test'] = time.time() - time_dict['test']
    print('Finished predictions!')
    for key, item in time_dict.items():
        time_df[key] = item
    print('Ranking to get the top predictions...')
    time_dict['picking_preds'] = time.time()
    top_pred_df = fmeval.rank_top_prediction_dense(test_y, pred_id_list,
                                                   y_pred)
    time_dict['picking_preds'] = time.time() - time_dict['picking_preds']
    # BUG FIX: this timing was computed after time_dict had already been
    # copied into time_df, so it was never exported; record it explicitly.
    time_df['picking_preds'] = time_dict['picking_preds']
    print('Took {} seconds to pick top predictions.'
          .format(time_dict['picking_preds']))
    start_export = time.time()
    # BUG FIX: 'result_dir' was an undefined name here (NameError at
    # runtime); the rest of the function reads it from param.
    result_dir = param['result_dir']
    top_pred_df.to_csv(result_dir + os.sep + 'topYPred.csv', index=False)
    time_df = time_df[sorted(time_df.columns)]
    time_df.to_csv(result_dir + os.sep + 'time.csv', index=False)
    rmse_df.to_csv(result_dir + os.sep + 'learnRate.csv', index=False)
    sparsity_df = pd.DataFrame(sparsity_dict, index=[0])
    sparsity_df = sparsity_df[sorted(sparsity_df.columns)]
    sparsity_df.to_csv(result_dir + os.sep + 'sparsity.csv', index=False)
    # export train, valid, and test cid list
    train_cid_df = pd.DataFrame({'caseId': train_cid_list})
    valid_cid_df = pd.DataFrame({'caseId': valid_cid_list})
    test_cid_df = pd.DataFrame({'caseId': test_cid_list})
    train_cid_df.to_csv(result_dir + os.sep + 'trainCidList.csv', index=False)
    valid_cid_df.to_csv(result_dir + os.sep + 'validCidList.csv', index=False)
    test_cid_df.to_csv(result_dir + os.sep + 'testCidList.csv', index=False)
    diff_export = time.time() - start_export
    print('Took {} seconds to export results.'.format(diff_export))
    return data_dict
def run(param, data_dict=None):
    """Run the FM experiment (no lifecycle features, train/test split only):
    build sparse train/test matrices, fit/predict via
    ``fmexp.run_experiment``, rank the top predictions, and export results.

    Parameters
    ----------
    param : dict of experiment settings ('result_dir', 'log_dir', 'stepsz',
        'feature', 'train_perc', model hyper-parameters, ...).
    data_dict : optional dict of precomputed artifacts; when given, the
        setup and matrix-building steps are skipped and cached values reused.

    Returns
    -------
    data_dict populated with the log, item lists, cid splits, matrices and
    sparsity statistics (returned early after setup when
    ``param['feature'] == 'only_setup'``).
    """
    # BUG FIX: 'data_dict == None' -> identity check per PEP 8
    if data_dict is None:
        has_data = False
        data_dict = dict()
    else:
        has_data = True
    # write params
    param_df = pd.DataFrame(param, index=[0])
    param_df = param_df[sorted(param_df.columns.values)]
    param_df.to_csv(param['result_dir'] + os.sep + 'params.csv', index=False)
    # read log
    print('Reading log...')
    if not has_data:
        log = pd.read_csv(param['log_dir'], index_col=False,
                          dtype=param['column_dtypes'])
        # filter by minimum case lengths
        excl_cids = log[['caseId', 'activity']]
        excl_cids = excl_cids.groupby('caseId', as_index=False).count()
        min_len = param['min_case_len']
        excl_cids = excl_cids[(excl_cids['activity'] <= min_len)]
        excl_cids = excl_cids.loc[:, ['caseId']]
        # BUG FIX: isin(DataFrame) iterates column *names*, so the original
        # compared case ids against the string 'caseId' and excluded nothing;
        # filter against the 'caseId' column instead.
        log = log[~(log['caseId'].isin(excl_cids['caseId']))]
        data_dict['log'] = log
        data_dict['excl_cids'] = excl_cids
    else:
        log = data_dict['log']
        excl_cids = data_dict['excl_cids']
    excl_cids.to_csv(param['result_dir'] + os.sep + 'excluded.csv',
                     index=False)
    # creating all the itemlists
    print('Creating itemlists...')
    if not has_data:
        # activity list
        activity_list = sorted(log['activity'].unique().tolist())
        # steps from activity list, one list per step size
        step_dict = dict()
        for sz in range(2, param['stepsz'] + 1):
            step_dict[sz] = rutils.mk_possible_steps(activity_list, sz)
        # caseid list
        cid_list = log['caseId'].unique()
        cid_list.sort()
        # next step mapping: steps of size 2 whose first activity is `act`
        next_step_mapping = dict()
        for act in activity_list:
            possible = filter(lambda s: s.split('->')[0] == act,
                              step_dict[2])
            next_step_mapping[act] = np.asarray(list(possible))
        data_dict['activity_list'] = activity_list
        data_dict['cid_list'] = cid_list
        data_dict['step_dict'] = step_dict
        data_dict['next_step_mapping'] = next_step_mapping
    else:
        activity_list = data_dict['activity_list']
        cid_list = data_dict['cid_list']
        step_dict = data_dict['step_dict']
        next_step_mapping = data_dict['next_step_mapping']
    if param['feature'] == 'only_setup':
        return data_dict
    print('Splitting train and test set...')
    if not has_data:
        train_cid_list, test_cid_list = \
            fmexp.train_test_split(cid_list, param['train_perc'])
        data_dict['train_cid_list'] = train_cid_list
        data_dict['test_cid_list'] = test_cid_list
    else:
        train_cid_list = data_dict['train_cid_list']
        test_cid_list = data_dict['test_cid_list']
    print('Building train matrix...')
    time_dict = dict()
    time_dict['build_train'] = time.time()
    if not has_data:
        train_log = log[(log['caseId'].isin(train_cid_list))]
        # BUG FIX: the original passed the full `log` here even though
        # `train_log` had just been computed, so the train matrix leaked
        # test-set cases into training.
        (train_X_datalist, train_X_row_inds, train_X_col_inds,
         train_X_shape, train_y_datalist, id_list) = \
            datrep.step_log_to_fm_format_all(param['feature'],
                                             train_log, param['stepsz'],
                                             cid_list, step_dict,
                                             next_step_mapping, 3,
                                             param['negative_samples'],
                                             param['seed'],
                                             param['normalize'])
        print('train_X shape: {}'.format(train_X_shape))
        train_X = csc_matrix((train_X_datalist,
                              (train_X_row_inds, train_X_col_inds)),
                             train_X_shape)
        train_y = train_y_datalist
        data_dict['train_X'] = train_X
        data_dict['train_y'] = train_y
        sparsity_dict = dict()
        sparsity_dict['train'] = \
            len(train_X_datalist) * 1. / train_X_shape.prod()
        data_dict['sparsity'] = sparsity_dict
    else:
        train_X = data_dict['train_X']
        train_y = data_dict['train_y']
        sparsity_dict = data_dict['sparsity']
    time_dict['build_train'] = time.time() - time_dict['build_train']
    print('Took {} seconds to build train matrix.'
          .format(time_dict['build_train']))
    print('Train matrix shape: {}'.format(train_X.shape))
    print('Building test matrix...')
    time_dict['build_test'] = time.time()
    if not has_data:
        test_log = log[(log['caseId'].isin(test_cid_list))]
        (test_X_datalist, test_X_row_inds, test_X_col_inds,
         test_X_shape, test_y_datalist, pred_id_list) = \
            datrep.step_log_to_fm_format_all(param['feature'],
                                             test_log, param['stepsz'],
                                             cid_list, step_dict,
                                             next_step_mapping,
                                             param['minpartialsz'],
                                             negative_samples=-1,
                                             seed=param['seed'],
                                             normalize=param['normalize'])
        test_X = csc_matrix((test_X_datalist,
                             (test_X_row_inds, test_X_col_inds)),
                            test_X_shape)
        test_y = test_y_datalist
        data_dict['test_X'] = test_X
        data_dict['test_y'] = test_y
        data_dict['pred_id_list'] = pred_id_list
        sparsity_dict['test'] = \
            len(test_X_datalist) * 1. / test_X_shape.prod()
    else:
        test_X = data_dict['test_X']
        test_y = data_dict['test_y']
        pred_id_list = data_dict['pred_id_list']
    time_dict['build_test'] = time.time() - time_dict['build_test']
    print('Took {} seconds to build test matrix.'
          .format(time_dict['build_test']))
    print('Test matrix shape: {}'.format(test_X.shape))
    # build fm model
    fm_model = fmexp.build_model(param['model_type'], param)
    print('Making predictions...')
    y_pred, time_df = fmexp.run_experiment(train_X, train_y, test_X, fm_model)
    print('Finished predictions!')
    for key, item in time_dict.items():
        time_df[key] = item
    print('Ranking to get the top predictions...')
    time_dict['picking_preds'] = time.time()
    top_pred_df = fmeval.rank_top_prediction_dense(test_y, pred_id_list,
                                                   y_pred)
    time_dict['picking_preds'] = time.time() - time_dict['picking_preds']
    # BUG FIX: this timing was computed after time_dict had already been
    # copied into time_df, so it was never exported; record it explicitly.
    time_df['picking_preds'] = time_dict['picking_preds']
    print('Took {} seconds to pick top predictions.'
          .format(time_dict['picking_preds']))
    start_export = time.time()
    # BUG FIX: 'result_dir' was an undefined name here (NameError at
    # runtime); the rest of the function reads it from param.
    result_dir = param['result_dir']
    top_pred_df.to_csv(result_dir + os.sep + 'topYPred.csv', index=False)
    time_df = time_df[sorted(time_df.columns)]
    time_df.to_csv(result_dir + os.sep + 'time.csv', index=False)
    sparsity_df = pd.DataFrame(sparsity_dict, index=[0])
    sparsity_df = sparsity_df[sorted(sparsity_df.columns)]
    sparsity_df.to_csv(result_dir + os.sep + 'sparsity.csv', index=False)
    # measure the export duration after all files are written
    diff_export = time.time() - start_export
    print('Took {} seconds to export results.'.format(diff_export))
    return data_dict
'step_impact':rstepimpact.log_to_fm_format} return func_mapping[func](*args, **kwargs) if __name__ == '__main__': # load the small bpic2012 dataset log_small = './dataset/bpic2012/bpic2012FirstHundred.csv' log_small_df = pd.read_csv(log_small, index_col=False) print('Number of cases: {}, number of events: {}'\ .format(log_small_df['caseId'].unique().shape[0], \ log_small_df['activity'].shape[0])) # caseid list and activity list caseid_list = log_small_df['caseId'].unique().tolist() activity_list = log_small_df['activity'].unique().tolist() activity_list = np.append([ARTIFICIAL_START,], activity_list) activity_list = sorted(activity_list) step_list = rutils.mk_possible_steps(activity_list, stepsz=2) print('num_of_acts: {} \n{}'.format(len(activity_list), activity_list)) print('num_of_steps: {} \n{}'.format(len(step_list), step_list)) print('type step: {} in steplist of type: {}'.format(type(step_list[0]), \ type(step_list))) # create mapping between stepid and step step_mapping = {step_list[i]:i for i in range(len(step_list))} rev_step_mapping = {val:key for key, val in step_mapping.items()} # map everything step_list = np.asarray(list(map(lambda step: step_mapping[tuple(step)],\ step_list))) # create next step mapping for each step get_next_step = \