def event_type_map_eval_ml_metrics(config, logger, result):
    """Evaluate event-type predictions with MAP and MAP@k via ml_metrics.

    Logs the overall MAP over all chains, then MAP@k for every distinct
    ground-truth chain length k observed in the result.

    Args:
        config: experiment configuration dict (unused here; kept for the
            common evaluation-function signature).
        logger: logger passed through to print_and_log.
        result: dict holding parallel lists 'chain_name',
            'gt_all_event_id', 'pred_all_event_id', 'gt_all_event_type'.

    Returns:
        The (unmodified) result dict, for call chaining.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type MAP evaluation:")
    print_and_log(
        logger, "event type MAP: {}".format(
            ml_metrics.mapk(result['gt_all_event_id'],
                            result['pred_all_event_id'])))
    # Distinct ground-truth chain lengths define the k values to report.
    k_list = sorted({
        len(result['gt_all_event_type'][i])
        for i in range(len(result['chain_name']))
    })
    # BUG FIX: the original called k_list.remove(0) unconditionally, which
    # raises ValueError when no chain has an empty ground truth; k=0 is
    # meaningless for MAP@k, so drop it only if present.
    if 0 in k_list:
        k_list.remove(0)
    print_and_log(logger, "all possible k: {}".format(k_list))
    for k in k_list:
        map_at_k = ml_metrics.mapk(result['gt_all_event_id'],
                                   result['pred_all_event_id'], k)
        print_and_log(logger, "event type MAP@{}: {}".format(int(k), map_at_k))
    return result
def event_type_map_eval_given_gt(config, logger, result):
    """Score event-type predictions when ground truth lengths are given.

    For each chain, computes macro-averaged precision between the
    ground-truth and predicted event-id sequences (stored per chain in
    result['et_ap']), then records and logs their mean as
    result['et_map'].

    Returns the mutated result dict.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type MAP evaluation:")
    result['et_ap'] = [
        precision_score(gt_ids, pred_ids, average='macro')
        for gt_ids, pred_ids in zip(result['gt_all_event_id'],
                                    result['pred_all_event_id'])
    ]
    map_score = np.mean(result['et_ap'])
    result['et_map'] = map_score
    print_and_log(logger,
                  "event type MAP: {}".format(round(map_score, 4)))
    return result
def event_type_map_eval(config, logger, result):
    """Compute per-chain average precision (compute_AP) over event ids.

    Stores the per-chain AP list in result['et_ap'] and their mean in
    result['et_map'], logging the rounded MAP. Returns the mutated
    result dict.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type MAP evaluation:")
    result['et_ap'] = [
        compute_AP(gt_ids, pred_ids)
        for gt_ids, pred_ids in zip(result['gt_all_event_id'],
                                    result['pred_all_event_id'])
    ]
    map_score = np.mean(result['et_ap'])
    result['et_map'] = map_score
    print_and_log(logger,
                  "event type MAP: {}".format(round(map_score, 4)))
    return result
def user_cluster_map_eval(config, logger, result):
    """Compute per-chain average precision (compute_AP) over user clusters.

    Stores the per-chain AP list in result['uc_ap'] and their mean in
    result['uc_map'], logging the rounded MAP. Returns the mutated
    result dict.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "user cluster MAP evaluation:")
    result['uc_ap'] = [
        compute_AP(gt_seq, pred_seq)
        for gt_seq, pred_seq in zip(result['gt_all_user_cluster'],
                                    result['pred_all_user_cluster'])
    ]
    map_score = np.mean(result['uc_ap'])
    result['uc_map'] = map_score
    print_and_log(logger,
                  "user cluster MAP: {}".format(round(map_score, 4)))
    return result
def event_type_categorical_accuracy_eval_given_gt(config, logger, result):
    """Flatten all chains and compute event-type categorical accuracy.

    Concatenates ground-truth and predicted event-id sequences across
    every chain, stores accuracy_score over the flat lists in
    result['et_cate'], and logs it rounded to 4 places. Returns the
    mutated result dict.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type categorical accuracy evaluation:")
    flat_true = []
    flat_pred = []
    for gt_ids, pred_ids in zip(result['gt_all_event_id'],
                                result['pred_all_event_id']):
        flat_true.extend(gt_ids)
        flat_pred.extend(pred_ids)
    result['et_cate'] = accuracy_score(flat_true, flat_pred)
    print_and_log(
        logger, "event type categorical accuracy: {}".format(
            round(result['et_cate'], 4)))
    return result
def user_cluster_categorical_accuracy_eval_given_gt(config, logger, result):
    """Flatten all chains and compute user-cluster categorical accuracy.

    Concatenates ground-truth and predicted user-cluster sequences across
    every chain, stores accuracy_score over the flat lists in
    result['uc_cate'], and logs it rounded to 4 places. Returns the
    mutated result dict.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "user cluster categorical accuracy evaluation:")
    flat_true = []
    flat_pred = []
    for gt_seq, pred_seq in zip(result['gt_all_user_cluster'],
                                result['pred_all_user_cluster']):
        flat_true.extend(gt_seq)
        flat_pred.extend(pred_seq)
    result['uc_cate'] = accuracy_score(flat_true, flat_pred)
    print_and_log(
        logger, "user cluster categorical accuracy: {}".format(
            round(result['uc_cate'], 4)))
    return result
def time_delay_overall_evaluation(config, logger, result, result_save_path,
                                  plot_ts=True, chain_length_eval=True):
    """Evaluate predicted time-delay sequences against ground truth.

    Computes per-chain DTW distance (always) and MSE (only when pred and
    gt lengths match), logs overall delay statistics, optionally plots
    each chain's time series, and optionally reports chain-length
    statistics (MAE and bucketed length-difference counts).

    Args:
        config: configuration dict; reads 'given_gt' and
            'sim_period' ('start'/'end' ISO strings, date part used for
            plot axis bounds).
        logger: logger passed through to print_and_log.
        result: dict with parallel lists 'chain_name',
            'pred_all_time_delay', 'gt_all_time_delay'; may also contain
            'chains_applied_keep_pred'. Mutated in place: 'td_DTW' and
            'td_MSE' per-chain lists are added ('td_MSE' entries are the
            string 'null' where lengths differ).
        result_save_path: directory under which 'time_delay_plot' is
            created when plot_ts is True.
        plot_ts: when True, saves a per-chain time-delay plot.
        chain_length_eval: when True, logs chain-length statistics.

    Returns:
        The mutated result dict.

    NOTE(review): if every chain's pred (or gt) delay list is empty,
    pred_all/gt_all stay empty and max()/min() below would raise — verify
    upstream guarantees at least one non-empty chain. Similarly, avg_mse
    may be empty when no lengths match, making np.mean(avg_mse) NaN.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "time delay evaluation:")
    # statistics
    pred_all = []
    gt_all = []
    avg_dtw = []
    avg_mse = []
    result["td_DTW"] = list()
    result["td_MSE"] = list()
    for i in range(len(result['chain_name'])):
        pred_time_delay = result['pred_all_time_delay'][i]
        gt_time_delay = result['gt_all_time_delay'][i]
        # Empty sequences are replaced by [-1] so fastdtw always has
        # something to align against.
        if len(pred_time_delay) == 0:
            pred_time_delay = [-1]
        if len(gt_time_delay) == 0:
            gt_time_delay = [-1]
        avg_dtw.append(fastdtw(gt_time_delay, pred_time_delay)[0])
        result["td_DTW"].append(avg_dtw[-1])
        # MSE is only defined for equal-length sequences.
        if len(gt_time_delay) == len(pred_time_delay):
            avg_mse.append(mean_squared_error(gt_time_delay, pred_time_delay))
            result["td_MSE"].append(avg_mse[-1])
        else:
            result["td_MSE"].append('null')
        # Aggregate only genuinely non-empty sequences (the [-1] filler
        # above must not pollute the global statistics).
        if len(result['pred_all_time_delay'][i]) != 0:
            pred_all += pred_time_delay
        if len(result['gt_all_time_delay'][i]) != 0:
            gt_all += gt_time_delay
    print_and_log(logger, "Average DTW: {}".format(round(np.mean(avg_dtw), 4)))
    if config['given_gt']:
        print_and_log(logger, "Average MSE: {}".format(np.mean(avg_mse)))
    print_and_log(
        logger,
        "MAX predicted: {}, ground truth: {}".format(round(max(pred_all), 4),
                                                     round(max(gt_all), 4)))
    print_and_log(
        logger,
        "MIN predicted: {}, ground truth: {}".format(round(min(pred_all), 4),
                                                     round(min(gt_all), 4)))
    print_and_log(
        logger, "MEAN predicted: {}, ground truth: {}".format(
            round(np.mean(pred_all), 4), round(np.mean(gt_all), 4)))
    print_and_log(
        logger, "STD predicted: {}, ground truth: {}".format(
            round(np.std(pred_all), 4), round(np.std(gt_all), 4)))
    # chain length evaluation: bucketed counters for |len(pred) - len(gt)|
    if chain_length_eval:
        length_mae = []
        length_stat = dict()
        length_stat["gt_chain_0"] = 0
        length_stat["gt_chain_1"] = 0
        length_stat["Same_as_gt"] = 0
        length_stat["diff_1_to_10"] = 0
        length_stat["diff_10_to_100"] = 0
        length_stat["diff_100+"] = 0
        if 'chains_applied_keep_pred' in result:
            length_stat["applied_threshold"] = len(
                result["chains_applied_keep_pred"])
    # Date-only strings used for plot axis labels/bounds.
    sim_start = config['sim_period']['start'].split('T')[0]
    sim_end = config['sim_period']['end'].split('T')[0]
    if plot_ts:
        time_delay_plot_save_path = os.path.join(result_save_path,
                                                 "time_delay_plot")
        if not os.path.exists(time_delay_plot_save_path):
            os.makedirs(time_delay_plot_save_path)
    if chain_length_eval or plot_ts:
        for i in range(len(result['chain_name'])):
            chain = result['chain_name'][i]
            pred_time_delay = result['pred_all_time_delay'][i]
            gt_time_delay = result['gt_all_time_delay'][i]
            if plot_ts:
                plot_time_delay_ts_for_one_chain(chain,
                                                 time_delay_plot_save_path,
                                                 pred_time_delay,
                                                 gt_time_delay, sim_start,
                                                 sim_end)
            if chain_length_eval:
                length_mae.append(
                    abs(len(pred_time_delay) - len(gt_time_delay)))
                if len(gt_time_delay) == 0:
                    length_stat["gt_chain_0"] += 1
                if len(gt_time_delay) == 1:
                    length_stat["gt_chain_1"] += 1
                if len(pred_time_delay) == len(gt_time_delay):
                    length_stat["Same_as_gt"] += 1
                if abs(len(pred_time_delay) - len(gt_time_delay)) < 10 and (
                        abs(len(pred_time_delay) - len(gt_time_delay)) >= 1):
                    length_stat["diff_1_to_10"] += 1
                if abs(len(pred_time_delay) - len(gt_time_delay)) < 100 and (
                        abs(len(pred_time_delay) - len(gt_time_delay)) >= 10):
                    length_stat["diff_10_to_100"] += 1
                if abs(len(pred_time_delay) - len(gt_time_delay)) >= 100:
                    length_stat["diff_100+"] += 1
    if chain_length_eval:
        length_mae = np.mean(length_mae)
    if chain_length_eval:
        print_and_log(logger, "====================================")
        print_and_log(logger, "chain length evaluation:")
        print_and_log(logger, "MAE: {}".format(round(length_mae, 4)))
        print_and_log(
            logger, "Count of number of simulated "
            "chains: {}".format(len(result['chain_name'])))
        print_and_log(
            logger, "Count of number of chains whose "
            "ground truth length is 0: {}".format(length_stat["gt_chain_0"]))
        print_and_log(
            logger, "Count of number of chains whose "
            "ground truth length is 1: {}".format(length_stat["gt_chain_1"]))
        if 'chains_applied_keep_pred' in result:
            print_and_log(
                logger, "Count of number of predicted chains that "
                "length needed threshold to be applied: {}, "
                "percentage: {} ".format(
                    length_stat["applied_threshold"],
                    round(
                        length_stat["applied_threshold"] /
                        len(result['chain_name']), 4)))
        print_and_log(
            logger, "Count of number of predicted "
            "chains that has "
            "same length as ground truth"
            ": {}, percentage: {}".format(
                length_stat["Same_as_gt"],
                round(length_stat["Same_as_gt"] / len(result['chain_name']),
                      4)))
        print_and_log(
            logger, "Count of number of predicted chains that "
            "length difference is 1 to 10: {},"
            "percentage: {}".format(
                length_stat["diff_1_to_10"],
                round(length_stat["diff_1_to_10"] / len(result['chain_name']),
                      4)))
        print_and_log(
            logger, "Count of number of predicted chains that "
            "length difference is 10 to 100: {}, "
            "percentage: {}".format(
                length_stat["diff_10_to_100"],
                round(
                    length_stat["diff_10_to_100"] /
                    len(result['chain_name']), 4)))
        print_and_log(
            logger, "Count of number of predicted chains that "
            "length difference is 100 and above: {}, "
            "percentage: {}".format(
                length_stat["diff_100+"],
                round(length_stat["diff_100+"] / len(result['chain_name']),
                      4)))
    return result
def user_cluster_nlg_eval(config, logger, result):
    """Score predicted user-cluster sequences with BLEU-1..4 (nlg-eval).

    Each chain's cluster sequence is rendered as a space-joined token
    string (empty chains become the sentinel token
    'no_event_in_simperiod') and scored with nlgeval's
    compute_individual_metrics. Per-chain scores are stored in
    result['uc_bleu1'..'uc_bleu4']; Bleu_n for n >= 2 is only recorded
    when the ground-truth chain has at least n tokens, otherwise the
    per-chain entry is the string 'null'. Averages of the recorded
    scores are logged per metric.

    Requires config['nlgeval_repo_dir'] to point at an installed
    nlg-eval checkout (appended to sys.path before import).

    Returns the mutated result dict.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "user cluster average bleu scores:")
    sys.path.append(config['nlgeval_repo_dir'])
    from nlgeval import compute_individual_metrics

    # BUG FIX: avg_bleu was initialized twice in the original; the
    # repeated per-metric boilerplate is also collapsed into loops.
    avg_bleu = {'Bleu_1': [], 'Bleu_2': [], 'Bleu_3': [], 'Bleu_4': []}
    for n in (1, 2, 3, 4):
        result['uc_bleu{}'.format(n)] = list()
    for i in range(len(result['chain_name'])):
        gt_seq = result['gt_all_user_cluster'][i]
        pred_seq = result['pred_all_user_cluster'][i]
        if len(gt_seq) == 0:
            gt_chain = 'no_event_in_simperiod'
        else:
            gt_chain = " ".join([str(ele) for ele in gt_seq])
        if len(pred_seq) == 0:
            hy_chain = 'no_event_in_simperiod'
        else:
            hy_chain = " ".join([str(ele) for ele in pred_seq])
        metrics_dict = compute_individual_metrics(gt_chain, hy_chain,
                                                  no_overlap=(False, True),
                                                  no_skipthoughts=True,
                                                  no_glove=True)
        # Bleu_1 is always meaningful; higher-order n-grams need at
        # least n ground-truth tokens.
        result['uc_bleu1'].append(metrics_dict['Bleu_1'])
        avg_bleu['Bleu_1'].append(metrics_dict['Bleu_1'])
        for n in (2, 3, 4):
            key = 'Bleu_{}'.format(n)
            if len(gt_seq) >= n:
                result['uc_bleu{}'.format(n)].append(metrics_dict[key])
                avg_bleu[key].append(metrics_dict[key])
            else:
                result['uc_bleu{}'.format(n)].append('null')
    for metric in avg_bleu:
        print_and_log(
            logger,
            "{}: {}".format(metric, round(np.average(avg_bleu[metric]), 4)))
    return result
def result_evaluation_given_gt(config, result_save_path, only_has_event=False):
    """Run the configured given-ground-truth evaluations on a result pickle.

    Loads <result_save_path>/result.pickle, optionally filters out chains
    whose ground-truth event list is empty, creates a timestamped log
    file, runs each evaluation enabled in config, and writes the updated
    result to file via write_result_to_file.

    Args:
        config: configuration dict of evaluation switches
            ('event_type_nlg_eval', 'event_type_map_eval', ...) plus
            'plot_ts'.
        result_save_path: directory containing result.pickle; logs are
            written there too.
        only_has_event: when True, evaluate only chains with at least one
            ground-truth event (and skip time-delay plotting).

    Returns:
        None.
    """
    # The parallel per-chain lists that make up a result set; used both
    # for filtering and for rebuilding the filtered dict. (Refactored
    # from nine duplicated init/append pairs in the original.)
    chain_keys = [
        'chain_name',
        'pred_all_event_id', 'pred_all_event_type',
        'pred_all_time_delay', 'pred_all_user_cluster',
        'gt_all_event_id', 'gt_all_event_type',
        'gt_all_time_delay', 'gt_all_user_cluster',
    ]
    with open(os.path.join(result_save_path, 'result.pickle'),
              'rb') as handle:
        result = pickle.load(handle)
    print('result.pickle loaded!')
    print("result.keys: {}\n\n".format(result.keys()))
    if only_has_event:
        # Keep only chains that have at least one ground-truth event.
        # NOTE: any extra keys in the loaded result are dropped here,
        # matching the original behavior.
        result_new = {key: list() for key in chain_keys}
        for i in range(len(result['chain_name'])):
            if len(result['gt_all_event_id'][i]) != 0:
                for key in chain_keys:
                    result_new[key].append(result[key][i])
        result = result_new
    # logger (single construction; only the filename prefix differs)
    log_prefix = ('evaluate_only_has_event_given_gt_' if only_has_event
                  else 'evaluate_all_given_gt_')
    logger = set_logger(
        os.path.join(
            result_save_path,
            log_prefix + dt.now().strftime("%Y-%m-%dT%H-%M-%SZ") + '.log'))
    print_and_log(
        logger, "Evaluation over {} simulated chains...".format(
            len(result['chain_name'])))
    # evaluation processes
    if config['event_type_nlg_eval']:
        result = event_type_nlg_eval(config, logger, result)
    if config['event_type_map_eval']:
        result = event_type_map_eval_given_gt(config, logger, result)
        result = event_type_categorical_accuracy_eval_given_gt(
            config, logger, result)
    if config['event_type_percentage_eval']:
        result = event_type_percentage_eval(config, logger, result)
    if config['user_cluster_nlg_eval']:
        result = user_cluster_nlg_eval(config, logger, result)
    if config['user_cluster_map_eval']:
        result = user_cluster_map_eval_given_gt(config, logger, result)
        result = user_cluster_categorical_accuracy_eval_given_gt(
            config, logger, result)
    if config['user_cluster_percentage_eval']:
        result = user_cluster_percentage_eval(config, logger, result)
    if config['time_delay_overall_evaluation']:
        # Plotting only makes sense over the full chain set.
        plot_flag = config['plot_ts'] if not only_has_event else False
        result = time_delay_overall_evaluation(config, logger, result,
                                               result_save_path,
                                               plot_ts=plot_flag,
                                               chain_length_eval=False)
    write_result_to_file(config, result, logger)
    del logger
    return
def user_cluster_percentage_eval(config, logger, result):
    """Log ground-truth vs predicted user-cluster label distributions.

    Flattens cluster labels across all chains, counts occurrences of
    cluster ids 0..100, converts counts to distributions via
    cal_distribution, and logs both distributions per cluster id.

    Returns the (unmodified) result dict.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "user cluster distribution evaluation:")
    gt_labels = []
    pred_labels = []
    for i in range(len(result['chain_name'])):
        gt_labels.extend(result['gt_all_user_cluster'][i])
        pred_labels.extend(result['pred_all_user_cluster'][i])
    gt_counter = Counter(gt_labels)
    pred_counter = Counter(pred_labels)
    num_clusters = 100 + 1  # cluster ids 0..100 inclusive
    gt_distribution = cal_distribution(
        [gt_counter[c] for c in range(num_clusters)])
    pred_distribution = cal_distribution(
        [pred_counter[c] for c in range(num_clusters)])
    print_and_log(logger, "!!!! ground truth distribution: ")
    for c in range(num_clusters):
        print_and_log(logger,
                      "{}: {}".format(c, round(gt_distribution[c], 4)))
    print_and_log(logger, "!!!! prediction distribution: ")
    for c in range(num_clusters):
        print_and_log(logger,
                      "{}: {}".format(c, round(pred_distribution[c], 4)))
    return result
def event_type_percentage_eval(config, logger, result):
    """Log ground-truth vs predicted event-type distributions.

    Flattens event-type labels across all chains, counts them, maps
    zero-based ids back to event-type names via config['eventtype_2_id']
    (which is 1-based), converts counts to distributions via
    cal_distribution, and logs both distributions per event type.

    Returns the (unmodified) result dict.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type distribution evaluation:")
    gt_types = []
    pred_types = []
    for i in range(len(result['chain_name'])):
        gt_types.extend(result['gt_all_event_type'][i])
        pred_types.extend(result['pred_all_event_type'][i])
    gt_counter = Counter(gt_types)
    pred_counter = Counter(pred_types)
    # config ids are 1-based; shift down so they index from 0, then
    # invert the mapping: zero-based id -> event type name.
    id_2_eventtype = {
        idx - 1: name for name, idx in config['eventtype_2_id'].items()
    }
    n_types = len(id_2_eventtype)
    gt_distribution = cal_distribution(
        [gt_counter[id_2_eventtype[i]] for i in range(n_types)])
    pred_distribution = cal_distribution(
        [pred_counter[id_2_eventtype[i]] for i in range(n_types)])
    print_and_log(logger, "!!!! ground truth distribution: ")
    for i in range(n_types):
        print_and_log(
            logger, "{}: {}".format(id_2_eventtype[i],
                                    round(gt_distribution[i], 4)))
    print_and_log(logger, "!!!! prediction distribution: ")
    for i in range(n_types):
        print_and_log(
            logger, "{}: {}".format(id_2_eventtype[i],
                                    round(pred_distribution[i], 4)))
    return result
def testset_creation_given_gt(config, GT_avail=True):
    """Build the given-ground-truth test set and pickle it to disk.

    For every repo cascade seen in training/validation, loads the event
    chain over [train_start, sim_end], inserts no-event placeholders,
    computes per-event time delays, and for each event falling inside
    the simulation period assembles a windowed input feature vector
    (X_test) plus three targets: event type (Y_test_et), time delay
    (Y_test_td) and user cluster (Y_test_uc). The testset dict is
    pickled to <exp_save_dir>/dataset/testset_given_gt.pickle and
    returned.

    Args:
        config: experiment configuration dict — paths, 'window_size',
            'x_dim', 'eventtype_2_id', and the many 'use_*' feature
            switches with their 'dim_*' sizes.
        GT_avail: unused in this function; presumably kept for interface
            parity with testset_creation() — TODO confirm.

    Returns:
        testset dict with keys 'X_test', 'Y_test_et', 'Y_test_td',
        'Y_test_uc', 'repo_list', 'repo2sampleid' (note: a leftover
        empty 'Y_test' list also remains in the dict).

    NOTE(review): empty_no_event_user_cluster / user_clusters are only
    defined when a user-cluster feature switch is on, yet they are used
    unconditionally in the sample loop — verify the config always
    enables one of them.
    """
    create_dataset_start = time.time()
    logger = set_logger(
        os.path.join(
            config['exp_save_dir'], 'testset_creation_given_gt_' +
            dt.now().strftime("%Y-%m-%dT%H-%M-%SZ") + '.log'))
    print_and_log(logger, '{}'.format('testset creation given gt...'))
    testset = dict()
    testset['X_test'] = list()
    testset['Y_test'] = list()  # never filled; replaced by Y_test_* below
    testset['repo_list'] = list()
    testset['repo2sampleid'] = list()
    eventtype_2_id = config['eventtype_2_id']
    # id_2_eventtype = dict(zip(eventtype_2_id.values(),
    #                           eventtype_2_id.keys()))
    train_start = utc_timestamp(config['train_period']['start'])
    sim_start = utc_timestamp(config['sim_period']['start'])
    sim_end = utc_timestamp(config['sim_period']['end'])
    # Repos that appeared in train/validation define the test universe.
    with open(
            os.path.join(config['exp_save_dir'], "dataset",
                         'unique_repo_train_vali.json'), 'r') as f:
        unique_repo_train_vali = json.load(f)
    repo_list = list(unique_repo_train_vali.keys())
    testset['repo_list'] = repo_list
    with open(config['github_event_repos_path'], 'rb') as f:
        github_event_repos = pickle.load(f)
    # Normalize repo ids to the '<owner>-<name>' form used as filenames.
    github_event_repos_set = set(
        [repo[:22] + '-' + repo[23:] for repo in github_event_repos])
    x_dim = config['x_dim']
    # --- optional feature tables, each behind its config switch ---
    if config['use_repo_embedding']:
        with open(config['embed_vec_path'], 'rb') as f:
            embed_vec_all = pickle.load(f)
    if config['use_repo_profile_features']:
        with open(config['repo_profile_feat_path'], 'r') as f:
            repo_profile_raw = json.load(f)
        # Flatten each repo's profile into one numeric vector:
        # log-normalized language count followed by user-type features.
        repo_profile_feat = dict()
        for repo in repo_profile_raw:
            this_repo_profile_feat = []
            this_repo_profile_feat.append(
                time_delay_normalization(
                    repo_profile_raw[repo]['language'][0], '10log10_xplus1'))
            this_repo_profile_feat += repo_profile_raw[repo]['user_type']
            repo_profile_feat[repo] = this_repo_profile_feat
    if config['use_repo_idx_features']:
        with open(config['load_repo_path'], 'rb') as f:
            load_repo_train_vali = pickle.load(f)
    if config['use_repo_activity_features']:
        # Flatten per-(repo, cluster) activity features and track the
        # vector with the largest sum (logged for sanity checking).
        max_sum_act_feat = 0
        max_sum_act_feat_vec = None
        with open(config['repo_activity_feat_path'], 'rb') as f:
            repo_act_feat = pickle.load(f)
        for repo in repo_act_feat:
            for cluster in repo_act_feat[repo]:
                this_repo_cluster_feat = []
                for feat_name in repo_act_feat[repo][cluster]:
                    this_repo_cluster_feat += repo_act_feat[repo][cluster][
                        feat_name]
                repo_act_feat[repo][cluster] = this_repo_cluster_feat
                if sum(this_repo_cluster_feat) > max_sum_act_feat:
                    max_sum_act_feat = sum(this_repo_cluster_feat)
                    max_sum_act_feat_vec = this_repo_cluster_feat
        print_and_log(logger,
                      "max_sum_act_feat: {}".format(max_sum_act_feat))
        print_and_log(
            logger, "max_sum_act_feat_vec: {}".format(max_sum_act_feat_vec))
    if config['use_user_profile_features']:
        with open(config['user_profile_feat_path'], 'r') as f:
            user_profile_feat = json.load(f)
    if config['use_user_activity_features']:
        with open(config['user_activity_feat_path'], 'r') as f:
            user_act_feat = json.load(f)
    if config['use_user_cluster_one_hot']:
        with open(config['user_cluster_path'], 'r') as f:
            user_clusters = json.load(f)
        # Extra cluster id shared by the padding user and no-event user.
        empty_no_event_user_cluster = max(user_clusters.values()) + 1
        user_clusters[config['empty_user']] = empty_no_event_user_cluster
        user_clusters['no_event_user'] = empty_no_event_user_cluster
    if config['use_user_cluster_minmax']:
        with open(config['user_cluster_path'], 'r') as f:
            user_clusters = json.load(f)
        max_cluster_id = max(user_clusters.values())
        min_cluster_id = min(user_clusters.values())
        empty_no_event_user_cluster = max_cluster_id + 1
        max_cluster_id = empty_no_event_user_cluster
        max_minus_min = max_cluster_id - min_cluster_id
        # min-max scaling below assumes ids start at 0.
        if min_cluster_id != 0:
            print("min cluster id is not 0! Need to examine code!")
            pdb.set_trace()
        user_clusters[config['empty_user']] = empty_no_event_user_cluster
        user_clusters['no_event_user'] = empty_no_event_user_cluster
    if config['use_cluster_profile_features']:
        max_sum_profile_feat = 0
        max_sum_profile_feat_vec = None
        with open(config['cluster_profile_feat_path'], 'rb') as f:
            cluster_profile_feat = pickle.load(f)
        # Flatten each cluster's profile dict into one numeric vector.
        for cluster in cluster_profile_feat:
            this_cluster_feat = []
            for feat_name in cluster_profile_feat[cluster]:
                # if feat_name == "geolocation" or feat_name == "user_type":
                #     continue
                this_cluster_feat += cluster_profile_feat[cluster][feat_name]
            cluster_profile_feat[cluster] = this_cluster_feat
            if sum(this_cluster_feat) > max_sum_profile_feat:
                max_sum_profile_feat = sum(this_cluster_feat)
                max_sum_profile_feat_vec = this_cluster_feat
        # Zero vector for the synthetic empty/no-event cluster.
        cluster_profile_feat[empty_no_event_user_cluster] = [
            0
        ] * config['dim_cluster_profile_features']
        print_and_log(
            logger, "max_sum_profile_feat: {}".format(max_sum_profile_feat))
        print_and_log(
            logger,
            "max_sum_profile_feat_vec: {}".format(max_sum_profile_feat_vec))
    if config['use_cluster_activity_features']:
        max_sum_act_feat = 0
        max_sum_act_feat_vec = None
        with open(config['cluster_activity_feat_path'], 'rb') as f:
            cluster_act_feat = pickle.load(f)
        # Flatten per-(cluster, repo) activity features, as above.
        for cluster in cluster_act_feat:
            for repo in cluster_act_feat[cluster]:
                this_cluster_repo_feat = []
                for feat_name in cluster_act_feat[cluster][repo]:
                    this_cluster_repo_feat += cluster_act_feat[cluster][repo][
                        feat_name]
                cluster_act_feat[cluster][repo] = this_cluster_repo_feat
                if sum(this_cluster_repo_feat) > max_sum_act_feat:
                    max_sum_act_feat = sum(this_cluster_repo_feat)
                    max_sum_act_feat_vec = this_cluster_repo_feat
        print_and_log(logger,
                      "max_sum_act_feat: {}".format(max_sum_act_feat))
        print_and_log(
            logger, "max_sum_act_feat_vec: {}".format(max_sum_act_feat_vec))
    print_and_log(logger, "x_dim: {}".format(x_dim))
    user_has_no_cluster = set()
    sample_id = 0
    repo_y_et = []  # output event types, one [value] per sample
    repo_y_td = []  # output time delays, one [value] per sample
    repo_y_uc = []  # output user clusters, one [value] per sample
    print_and_log(logger, "gather testing data...")
    for repo in repo_list:
        # for each cascade chain
        if (repo in config['repos_to_ignore']) or (
                repo in github_event_repos_set):
            # it is a event repo or repo should be ignore
            continue
        repo_X = []
        repo2sampleid = []
        one_chain_pd = load_jsongz_2_dataframe(
            os.path.join(config['cascade_dir'], repo + '.json.gz'))
        # restrict to events inside [train_start, sim_end]
        one_chain_pd = one_chain_pd.loc[
            (one_chain_pd['nodeTime'] >= train_start)
            & (one_chain_pd['nodeTime'] <= sim_end)]
        one_chain_pd = one_chain_pd.sort_values(by=['nodeTime'])
        one_chain_event = []
        one_chain_time = []
        one_chain_user = []
        # padding event
        for i in range(config['window_size']):
            one_chain_event.append(config['empty_event_type'])
            one_chain_time.append(config['empty_time_delay'])
            one_chain_user.append(config['empty_user'])
        # <soc>
        one_chain_event.append(eventtype_2_id['<soc>'])
        one_chain_time.append(config['empty_time_delay'])
        one_chain_user.append(config['empty_user'])
        # event sequence
        one_chain_event += [
            eventtype_2_id[event] for event in one_chain_pd['actionType']
        ]
        one_chain_time += [time for time in one_chain_pd['nodeTime']]
        one_chain_user += [user for user in one_chain_pd['nodeUserID']]
        # splice synthetic no-event markers into quiet stretches
        (one_chain_event_new, one_chain_time_new, one_chain_user_new) = \
            insert_no_event_for_a_chain_new(config, one_chain_event,
                                            one_chain_time, one_chain_user,
                                            sim_end + 1)
        # if one_chain_event_new != one_chain_event:
        #     pdb.set_trace()
        one_chain_event = one_chain_event_new
        one_chain_time = one_chain_time_new
        one_chain_user = one_chain_user_new
        # calculate time delay sequence
        one_chain_time_delay = []
        for i in range(len(one_chain_time)):
            # padding / <soc> slots (and the event right after <soc>)
            # get the sentinel empty delay; real events get the
            # (optionally normalized) delay since the previous event.
            if (one_chain_event[i] == config['empty_event_type']
                    or one_chain_event[i] == eventtype_2_id['<soc>']):
                one_chain_time_delay.append(config['empty_time_delay'])
            elif one_chain_event[i - 1] == eventtype_2_id['<soc>']:
                one_chain_time_delay.append(config['empty_time_delay'])
            else:
                time_delay = get_time_delay(one_chain_time[i - 1],
                                            one_chain_time[i], 'float')[1]
                if config['time_delay_normalization_func'] is not None:
                    time_delay = time_delay_normalization(
                        time_delay, config['time_delay_normalization_func'])
                one_chain_time_delay.append(time_delay)
        # for each event sample in simulation period
        for i in range(config['window_size'], len(one_chain_event)):
            time_sample_outputevent = one_chain_time[i]
            event_sample_outputevent = one_chain_event[i]
            # if time_sample_outputevent in simulation period:
            # add this sample to testset
            if event_sample_outputevent == config['empty_event_type'] or (
                    event_sample_outputevent == eventtype_2_id['<soc>']):
                continue
            if one_chain_event[i - 1] == eventtype_2_id['<soc>']:
                continue
            if not ((time_sample_outputevent >= sim_start) and
                    (time_sample_outputevent <= sim_end)):
                continue
            # window of the preceding window_size events is the input
            input_event_type = \
                one_chain_event[i - config['window_size']:i]
            # input_time = one_chain_time[i-config['window_size']:i]
            input_time_delay = \
                one_chain_time_delay[i - config['window_size']:i]
            input_user = one_chain_user[i - config['window_size']:i]
            input_cluster = []
            for user in input_user:
                try:
                    input_cluster.append(user_clusters[user])
                except KeyError:
                    # unseen user: fall back to the no-event cluster
                    user_has_no_cluster.add(user)
                    input_cluster.append(user_clusters['no_event_user'])
            output_event_type = one_chain_event[i]
            output_time_delay = one_chain_time_delay[i]
            output_user = one_chain_user[i]
            try:
                output_cluster = user_clusters[output_user]
            except KeyError:
                user_has_no_cluster.add(output_user)
                output_cluster = user_clusters['no_event_user']
            # initialize input vector, and output vector for this sample
            x_vec = []
            # load repo embeding vector
            if config['use_repo_embedding']:
                try:
                    embed_vec = np.array(embed_vec_all[repo[:22] + '/' +
                                                       repo[23:]])
                except KeyError:
                    print_and_log(
                        logger, "Could not find "
                        "embedding vector for {}!".format(repo[:22] + '/' +
                                                          repo[23:]))
                    pdb.set_trace()
            # input feature vector
            for j in range(config['window_size']):
                # for each event node: concatenate every enabled feature
                # group in a fixed order; missing entries become zeros of
                # the group's configured dimension.
                x_j = []
                if config['use_repo_embedding']:
                    x_j += list(embed_vec)
                if config['use_repo_profile_features']:
                    try:
                        x_j += repo_profile_feat[repo]
                    except KeyError:
                        x_j += [0] * config['dim_repo_profile_features']
                if config['use_repo_idx_features']:
                    try:
                        x_j += load_repo_train_vali[repo]
                    except KeyError:
                        x_j += [0]
                if config['use_repo_activity_features']:
                    if input_cluster[j] == empty_no_event_user_cluster:
                        x_j += [0] * config['dim_repo_activity_features']
                    else:
                        try:
                            repo_thiscluster_act_feat = repo_act_feat[repo][
                                input_cluster[j]]
                            repo_allcluster_act_feat = repo_act_feat[repo][
                                'all_cluster']
                            x_j += repo_thiscluster_act_feat
                            x_j += repo_allcluster_act_feat
                        except KeyError:
                            x_j += [0] * config['dim_repo_activity_features']
                if config['use_user_profile_features']:
                    if input_user[j] == config['empty_user'] or (
                            input_user[j] == 'no_event_user'):
                        x_j += [0] * config['dim_user_profile_features']
                    else:
                        try:
                            x_j += user_profile_feat[input_user[j]]
                        except KeyError:
                            x_j += [0] * config['dim_user_profile_features']
                if config['use_user_activity_features']:
                    if input_user[j] == config['empty_user'] or (
                            input_user[j] == 'no_event_user'):
                        x_j += [0] * config['dim_user_activity_features']
                    else:
                        try:
                            thisrepo_feat = \
                                user_act_feat[input_user[j]][repo]
                        except KeyError:
                            # this user-repo no event in training period
                            thisrepo_feat = \
                                [0] * int(config[
                                    'dim_user_activity_features'] / 2)
                        allrepo_feat = \
                            user_act_feat[input_user[j]]['all']
                        x_j += thisrepo_feat + allrepo_feat
                if config['use_event_type_one_hot']:
                    # ids are 1-based, hence the -1 offset into one-hot
                    event_type_one_hot = \
                        [0] * len(config['eventtype_2_id'])
                    if input_event_type[j] != config['empty_event_type']:
                        event_type_one_hot[input_event_type[j] - 1] = 1
                    x_j += event_type_one_hot
                if config['use_time_delay_features']:
                    x_j += [input_time_delay[j]]
                if config['use_user_cluster_one_hot']:
                    user_cluster_one_hot = \
                        [0] * config['dim_user_cluster_one_hot']
                    user_cluster_one_hot[input_cluster[j]] = 1
                    x_j += user_cluster_one_hot
                if config['use_user_cluster_minmax']:
                    use_user_cluster_minmax = (input_cluster[j] /
                                               max_minus_min)
                    x_j += [use_user_cluster_minmax]
                if config['use_cluster_profile_features']:
                    this_cluster_profile_feat = cluster_profile_feat[
                        input_cluster[j]]
                    x_j += this_cluster_profile_feat
                if config['use_cluster_activity_features']:
                    if input_cluster[j] == empty_no_event_user_cluster:
                        x_j += [0] * config['dim_cluster_activity_features']
                    else:
                        try:
                            cluster_thisrepo_act_feat = cluster_act_feat[
                                input_cluster[j]][repo]
                            cluster_allrepo_act_feat = cluster_act_feat[
                                input_cluster[j]]['all_repo']
                            x_j += cluster_thisrepo_act_feat
                            x_j += cluster_allrepo_act_feat
                        except KeyError:
                            x_j += [0] * config[
                                'dim_cluster_activity_features']
                # sanity check: assembled vector must match configured dim
                if len(x_j) != x_dim:
                    print("len(x_j) != x_dim")
                    pdb.set_trace()
                x_vec.append(x_j)
            if len(x_vec) != config['window_size']:
                print("len(x_vec) != config['window_size']")
                pdb.set_trace()
            repo_X.append(x_vec)
            # output vec
            repo_y_et.append([output_event_type])
            repo_y_td.append([output_time_delay])
            repo_y_uc.append([output_cluster])
            repo2sampleid.append(sample_id)
            sample_id += 1
        # finish gathering test data for this repo
        testset['X_test'] += repo_X
        testset['repo2sampleid'].append(repo2sampleid)
    testset['X_test'] = np.array(testset['X_test'])
    testset['Y_test_et'] = np.array(repo_y_et)
    testset['Y_test_td'] = np.array(repo_y_td)
    testset['Y_test_uc'] = np.array(repo_y_uc)
    print_and_log(logger, "X_test.shape: {}".format(testset['X_test'].shape))
    print_and_log(logger,
                  "Y_test_et.shape: {}".format(testset['Y_test_et'].shape))
    print_and_log(logger,
                  "Y_test_td.shape: {}".format(testset['Y_test_td'].shape))
    print_and_log(logger,
                  "Y_test_uc.shape: {}".format(testset['Y_test_uc'].shape))
    print_and_log(logger,
                  "number of chains for testing: {}".format(len(repo_list)))
    print_and_log(
        logger, "could not find cluster for {} users.".format(
            len(user_has_no_cluster)))
    if len(user_has_no_cluster) > 0:
        with open(
                os.path.join(config['exp_save_dir'], "dataset",
                             'user_has_no_cluster_given_gt.pickle'),
                'wb') as f:
            pickle.dump(user_has_no_cluster, f)
    print_and_log(
        logger,
        "the number of testing samples: {}".format(len(testset['X_test'])))
    with open(
            os.path.join(config['exp_save_dir'], "dataset",
                         'testset_given_gt.pickle'), 'wb') as f:
        pickle.dump(testset, f)
    print_and_log(
        logger, "{} took {} min".format("testset creation given gt",
                                        (time.time() - create_dataset_start) /
                                        60))
    # pdb.set_trace()
    return testset
def testset_creation(config, GT_avail=True):
    """Build and pickle the simulation test set for the cascade chains.

    For every repo chain listed in ``config['unique_repo_train_vali_path']``
    (minus ignored repos and GitHub-event repos), gathers the event / time /
    user history before ``config['sim_period']['start']`` as model input and,
    when ``GT_avail`` is True, the events inside the simulation window as
    ground-truth output.  Everything is pickled under
    ``config['exp_save_dir']/dataset/``.

    Args:
        config: experiment configuration dict (paths, ``window_size``,
            ``eventtype_2_id``, ``sim_period``, normalization settings, ...).
        GT_avail: when True, collect ground-truth events of the simulation
            period into the ``gt_*`` fields; otherwise they remain None.

    Returns:
        dict with keys ``X_test``, ``repo``, ``input_last_event_time`` and
        ``gt_event_type`` / ``gt_event_id`` / ``gt_time_delay`` / ``gt_user``
        / ``gt_user_cluster`` (the latter five are None unless ``GT_avail``).
    """
    create_dataset_start = time.time()
    logger = set_logger(
        os.path.join(
            config['exp_save_dir'], 'testset_creation_' +
            dt.now().strftime("%Y-%m-%dT%H-%M-%SZ") + '.log'))
    print_and_log(logger, '{}'.format('testset creation...'))

    if not os.path.exists(os.path.join(config['exp_save_dir'], "dataset")):
        os.makedirs(os.path.join(config['exp_save_dir'], "dataset"))

    # Result container; gt_* entries stay None when GT_avail is False.
    testset = dict()
    testset['X_test'] = []
    testset['repo'] = []
    testset['input_last_event_time'] = []
    testset['gt_event_type'] = None
    testset['gt_event_id'] = None
    testset['gt_time_delay'] = None
    testset['gt_user'] = None
    testset['gt_user_cluster'] = None
    X_test = []

    eventtype_2_id = config['eventtype_2_id']
    id_2_eventtype = dict(zip(eventtype_2_id.values(), eventtype_2_id.keys()))
    sim_start = utc_timestamp(config['sim_period']['start'])
    sim_end = utc_timestamp(config['sim_period']['end'])

    with open(config['unique_repo_train_vali_path'], 'r') as f:
        unique_repo_train_vali = json.load(f)
    repo_list = list(unique_repo_train_vali.keys())

    with open(config['github_event_repos_path'], 'rb') as f:
        github_event_repos = pickle.load(f)
    # Normalize repo ids: force '-' at position 22 (id separator convention).
    github_event_repos_set = set(
        [repo[:22] + '-' + repo[23:] for repo in github_event_repos])

    with open(config['user_cluster_path'], 'r') as f:
        user_clusters = json.load(f)
    # One extra cluster id shared by the padding user and the no-event user.
    empty_no_event_user_cluster = max(user_clusters.values()) + 1
    user_clusters[config['empty_user']] = empty_no_event_user_cluster
    user_clusters['no_event_user'] = empty_no_event_user_cluster

    unique_repo_test = dict()
    unique_user_test = dict()
    user_has_no_cluster = set()

    # ---- gather simulation sample input -----------------------------------
    print_and_log(logger, "gather simulation sample input...")
    for repo in repo_list:
        # for each cascade chain
        if (repo in config['repos_to_ignore']) or (
                repo in github_event_repos_set):
            # it is an event repo or a repo that should be ignored
            continue
        one_chain_pd = load_jsongz_2_dataframe(
            os.path.join(config['cascade_dir'], repo + '.json.gz'))
        # keep only events strictly before the simulation period
        one_chain_pd = one_chain_pd.loc[one_chain_pd['nodeTime'] < sim_start]
        one_chain_pd = one_chain_pd.sort_values(by=['nodeTime'])

        one_chain_event = []
        one_chain_time = []
        one_chain_user = []
        # padding events so every chain is at least window_size long
        for i in range(config['window_size']):
            one_chain_event.append(config['empty_event_type'])
            one_chain_time.append(config['empty_time_delay'])
            one_chain_user.append(config['empty_user'])
        # <soc> start-of-chain marker
        one_chain_event.append(eventtype_2_id['<soc>'])
        one_chain_time.append(config['empty_time_delay'])
        one_chain_user.append(config['empty_user'])
        # real event sequence
        one_chain_event += [
            eventtype_2_id[event] for event in one_chain_pd['actionType']
        ]
        one_chain_time += [time for time in one_chain_pd['nodeTime']]
        one_chain_user += [user for user in one_chain_pd['nodeUserID']]

        (one_chain_event_new, one_chain_time_new, one_chain_user_new) = \
            insert_no_event_for_a_chain_new(config, one_chain_event,
                                            one_chain_time, one_chain_user,
                                            sim_start)
        one_chain_event = one_chain_event_new
        one_chain_time = one_chain_time_new
        one_chain_user = one_chain_user_new

        # calculate time delay sequence (padding/<soc> positions get the
        # empty value; the first real event has no predecessor to diff)
        one_chain_time_delay = []
        for i in range(len(one_chain_time)):
            if (one_chain_event[i] == config['empty_event_type']
                    or one_chain_event[i] == eventtype_2_id['<soc>']):
                one_chain_time_delay.append(config['empty_time_delay'])
            elif one_chain_event[i - 1] == eventtype_2_id['<soc>']:
                one_chain_time_delay.append(config['empty_time_delay'])
            else:
                time_delay = get_time_delay(one_chain_time[i - 1],
                                            one_chain_time[i], 'float')[1]
                if config['time_delay_normalization_func'] is not None:
                    time_delay = time_delay_normalization(
                        time_delay, config['time_delay_normalization_func'])
                one_chain_time_delay.append(time_delay)

        # input_last_event_time: timestamp of the last pre-sim event
        testset['input_last_event_time'].append(one_chain_time[-1])
        testset['repo'].append(repo)

        input_event_type = one_chain_event[-config['window_size']:]
        input_time_delay = one_chain_time_delay[-config['window_size']:]
        # BUGFIX: previously `one_chain_user[i - config['window_size']:i]`,
        # which reused the stale loop variable `i` from the time-delay loop
        # above and produced a window shifted one element left of the
        # event/time-delay slices (dropping the last user).  Use the same
        # tail slice as the sibling inputs.
        input_user = one_chain_user[-config['window_size']:]
        input_cluster = []
        for user in input_user:
            try:
                input_cluster.append(user_clusters[user])
            except KeyError:
                user_has_no_cluster.add(user)
                input_cluster.append(user_clusters['no_event_user'])

        # input feature vector sanity check
        if len(input_event_type) < config['window_size']:
            print("len(input_event_type) < config['window_size']")
            pdb.set_trace()

        # initialize input for this sample with the "no event for a month"
        # seed: 720 hours is one month of no activity
        hightd_normalized = time_delay_normalization(
            720, config['time_delay_normalization_func'])
        lowtd_normalized = time_delay_normalization(
            0.001, config['time_delay_normalization_func'])
        td_normalized2 = time_delay_normalization(
            0.1, config['time_delay_normalization_func'])
        last_et = config['eventtype_2_id']['no_event_for_1month']
        last_td = hightd_normalized
        last_uc = user_clusters['no_event_user']
        # NOTE(review): x_vec carries only the no-event seed triple;
        # input_event_type / input_time_delay / input_cluster are computed
        # above but never packed into X_test (lowtd_normalized and
        # td_normalized2 are also unused) — confirm this is intended.
        x_vec = [last_et, last_td, last_uc]
        X_test.append(x_vec)

    testset['X_test'] = X_test
    print_and_log(logger, "X_test length: {}".format(len(testset['X_test'])))

    # ---- gather simulation sample output ----------------------------------
    if GT_avail:
        print_and_log(
            logger, "ground truth available. \n"
            "gather simulation sample output...")
        testset['gt_event_type'] = []
        testset['gt_time_delay'] = []
        testset['gt_event_id'] = []
        testset['gt_user'] = []
        testset['gt_user_cluster'] = []
        # for each gathered simulation sample input
        for i in range(len(testset['repo'])):
            repo_this_sample = testset['repo'][i]
            one_chain_pd = load_jsongz_2_dataframe(
                os.path.join(config['cascade_dir'],
                             repo_this_sample + '.json.gz'))
            # keep only events inside the simulation window
            one_chain_pd = one_chain_pd.loc[
                (one_chain_pd['nodeTime'] >= sim_start)
                & (one_chain_pd['nodeTime'] <= sim_end)]
            one_chain_pd = one_chain_pd.sort_values(by=['nodeTime'])

            one_chain_event_id = []
            one_chain_time = []
            one_chain_user = []
            unique_repo_test[repo_this_sample] = []
            if len(one_chain_pd) == 0:
                # this repo has no events in the testing period:
                # insert no-event placeholders until the end of testing
                input_last_event_time_this_chain = testset[
                    'input_last_event_time'][i]
                (one_chain_event_new, one_chain_time_new, one_chain_user_new
                 ) = insert_no_event_for_a_GTchain_who_has_no_event_at_all(
                     config, one_chain_event_id, one_chain_time,
                     one_chain_user, input_last_event_time_this_chain,
                     sim_end)
            else:
                # this sample chain has events in the testing period
                for user in one_chain_pd['nodeUserID']:
                    unique_user_test[user] = []
                one_chain_event_id += [
                    eventtype_2_id[event]
                    for event in one_chain_pd['actionType']
                ]
                one_chain_time += [time for time in one_chain_pd['nodeTime']]
                one_chain_user += [
                    user for user in one_chain_pd['nodeUserID']
                ]
                input_last_event_time_this_chain = testset[
                    'input_last_event_time'][i]
                if min(one_chain_time) < sim_start:
                    print("min(one_chain_time) < sim_start")
                    pdb.set_trace()
                (one_chain_event_new, one_chain_time_new,
                 one_chain_user_new) = \
                    insert_no_event_for_a_sim_GTchain(
                        config, one_chain_event_id, one_chain_time,
                        one_chain_user, input_last_event_time_this_chain,
                        sim_end)
            one_chain_event_id = one_chain_event_new
            one_chain_time = one_chain_time_new
            one_chain_user = one_chain_user_new

            one_chain_event_type = [
                id_2_eventtype[event] for event in one_chain_event_id
            ]
            # no matter one-hot or normalized input, GT needs true clusters
            one_chain_user_cluster = []
            for user in one_chain_user:
                try:
                    one_chain_user_cluster.append(user_clusters[user])
                except KeyError:
                    user_has_no_cluster.add(user)
                    one_chain_user_cluster.append(
                        user_clusters['no_event_user'])

            # calculate GT time delay sequence; deliberately NOT normalized
            # (ground truth stays in raw hours)
            one_chain_time_delay = []
            if len(one_chain_time) > 0:
                time_delay = get_time_delay(input_last_event_time_this_chain,
                                            one_chain_time[0], 'float')[1]
                one_chain_time_delay.append(time_delay)
                for j in range(1, len(one_chain_time)):
                    time_delay = get_time_delay(one_chain_time[j - 1],
                                                one_chain_time[j],
                                                'float')[1]
                    one_chain_time_delay.append(time_delay)

            testset['gt_event_id'].append(one_chain_event_id)
            testset['gt_event_type'].append(one_chain_event_type)
            testset['gt_time_delay'].append(one_chain_time_delay)
            testset['gt_user'].append(one_chain_user)
            testset['gt_user_cluster'].append(one_chain_user_cluster)

        print_and_log(
            logger, "could not find cluster for {} users.".format(
                len(user_has_no_cluster)))

        # check the share of 0.0-hour delays in the ground truth output
        gt_zero_hours_count = 0
        gt_hours_count = 0
        for i in range(len(testset['gt_time_delay'])):
            for j in range(len(testset['gt_time_delay'][i])):
                time_delay_hour = testset['gt_time_delay'][i][j]
                gt_hours_count += 1
                if time_delay_hour == 0:
                    gt_zero_hours_count += 1
        # BUGFIX: guard against ZeroDivisionError when no GT delays were
        # gathered (e.g. empty repo list)
        if gt_hours_count > 0:
            print_and_log(
                logger,
                "Out of {} ground truth time delay values that the model "
                "needs to predict, {} of them are 0.0 time delay hour.".format(
                    gt_hours_count,
                    round(gt_zero_hours_count / gt_hours_count, 2)))

    # ---- save testset ------------------------------------------------------
    print_and_log(logger, "save testset ...")
    testset_save_path = os.path.join(config['exp_save_dir'], "dataset",
                                     'testset.pickle')
    with open(testset_save_path, 'wb') as handle:
        pickle.dump(testset, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print_and_log(
        logger, "testset.pickle save in {}".format(
            os.path.join(config['exp_save_dir'], "dataset")))
    with open(
            os.path.join(config['exp_save_dir'], "dataset",
                         'unique_repo_test.json'), 'w') as f:
        json.dump(unique_repo_test, f)
    with open(
            os.path.join(config['exp_save_dir'], "dataset",
                         'unique_user_test.json'), 'w') as f:
        json.dump(unique_user_test, f)
    print_and_log(
        logger, "the number of unique repos in "
        "testing samples: {}".format(len(unique_repo_test)))
    print_and_log(
        logger, "the number of unique users in "
        "testing samples: {}".format(len(unique_user_test)))
    print_and_log(
        logger, "{} took {}".format("testset creation",
                                    time.time() - create_dataset_start))
    return testset