def event_type_map_eval_ml_metrics(config, logger, result):
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type MAP evaluation:")
    print_and_log(
        logger, "event type MAP: {}".format(
            ml_metrics.mapk(result['gt_all_event_id'],
                            result['pred_all_event_id'])))
    k_list = []
    for i in range(len(result['chain_name'])):
        k_list.append(len(result['gt_all_event_type'][i]))
    k_list = sorted(list(set(k_list)))
    if 0 in k_list:  # guard: 0 is only present when a chain has no events
        k_list.remove(0)
    print_and_log(logger, "all possible k: {}".format(k_list))
    for k in k_list:
        map_at_k = ml_metrics.mapk(result['gt_all_event_id'],
                                   result['pred_all_event_id'], k)
        print_and_log(logger,
                      "event type MAP@{}: {}".format(int(k), map_at_k))
    return result
def event_type_map_eval(config, logger, result):
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type MAP evaluation:")
    result['et_ap'] = list()
    for a, p in zip(result['gt_all_event_id'], result['pred_all_event_id']):
        AP = compute_AP(a, p)
        result['et_ap'].append(AP)
    map_re = np.mean(result['et_ap'])
    result['et_map'] = map_re
    print_and_log(logger, "event type MAP: {}".format(round(map_re, 4)))
    return result
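# The helper compute_AP used above is defined elsewhere in this repo. The
# sketch below only illustrates the assumed behaviour (average precision of a
# ranked prediction list against the ground-truth list); the name
# _compute_ap_sketch is hypothetical and not part of the original code.
def _compute_ap_sketch(actual, predicted):
    """Average precision of the ranked list `predicted` w.r.t. `actual`."""
    hits, score = 0, 0.0
    for rank, p in enumerate(predicted, start=1):
        # count a hit only the first time a relevant item appears
        if p in actual and p not in predicted[:rank - 1]:
            hits += 1
            score += hits / rank
    return score / max(len(actual), 1)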
def event_type_map_eval_given_gt(config, logger, result):
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type MAP evaluation:")
    result['et_ap'] = list()
    for a, p in zip(result['gt_all_event_id'], result['pred_all_event_id']):
        AP = precision_score(a, p, average='macro')
        result['et_ap'].append(AP)
    map_re = np.mean(result['et_ap'])
    result['et_map'] = map_re
    print_and_log(logger, "event type MAP: {}".format(round(map_re, 4)))
    return result
def user_cluster_map_eval(config, logger, result):
    print_and_log(logger, "====================================")
    print_and_log(logger, "user cluster MAP evaluation:")
    result['uc_ap'] = list()
    for a, p in zip(result['gt_all_user_cluster'],
                    result['pred_all_user_cluster']):
        AP = compute_AP(a, p)
        result['uc_ap'].append(AP)
    map_re = np.mean(result['uc_ap'])
    result['uc_map'] = map_re
    print_and_log(logger, "user cluster MAP: {}".format(round(map_re, 4)))
    return result
def event_type_categorical_accuracy_eval_given_gt(config, logger, result):
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type categorical accuracy evaluation:")
    y_true = []
    y_pred = []
    for a, p in zip(result['gt_all_event_id'], result['pred_all_event_id']):
        y_true += a
        y_pred += p
    result['et_cate'] = accuracy_score(y_true, y_pred)
    print_and_log(
        logger, "event type categorical accuracy: {}".format(
            round(result['et_cate'], 4)))
    return result
def user_cluster_categorical_accuracy_eval_given_gt(config, logger, result):
    print_and_log(logger, "====================================")
    print_and_log(logger, "user cluster categorical accuracy evaluation:")
    y_true = []
    y_pred = []
    for a, p in zip(result['gt_all_user_cluster'],
                    result['pred_all_user_cluster']):
        y_true += a
        y_pred += p
    result['uc_cate'] = accuracy_score(y_true, y_pred)
    print_and_log(
        logger, "user cluster categorical accuracy: {}".format(
            round(result['uc_cate'], 4)))
    return result
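# Minimal usage sketch for the *_given_gt evaluators above. The `result`
# layout (parallel per-chain lists of ground-truth / predicted ids) follows
# the fields accessed in this file; the literal values and the names
# _demo_config, _demo_logger, _demo_result are made up for illustration.
# _demo_result = {
#     'chain_name': ['repoA', 'repoB'],
#     'gt_all_event_id': [[1, 3, 2], [4]],
#     'pred_all_event_id': [[1, 2, 2], [4]],
#     'gt_all_user_cluster': [[0, 5, 5], [7]],
#     'pred_all_user_cluster': [[0, 5, 9], [7]],
# }
# _demo_result = event_type_categorical_accuracy_eval_given_gt(
#     _demo_config, _demo_logger, _demo_result)
# _demo_result = user_cluster_categorical_accuracy_eval_given_gt(
#     _demo_config, _demo_logger, _demo_result)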
def train_models(config):
    # obtain x_dim
    x_dim = config['x_dim']

    if not os.path.exists(os.path.join(
            config['exp_save_dir'], "models", "vis")):
        os.makedirs(os.path.join(
            config['exp_save_dir'], "models", "vis"))

    logger = set_logger(os.path.join(
        config['exp_save_dir'], "models",
        'train_model_architecture_' +
        dt.now().strftime("%Y-%m-%dT%H-%M-%SZ") + '.log'))
    print_and_log(logger, "x_dim: {}".format(x_dim))

    # LSTM model architecture
    input_sequences = Input(shape=(config['window_size'], x_dim),
                            name='input_sequences')
    lstm_1 = LSTM(250, return_sequences=True, name="lstm_1")(input_sequences)
    lstm_2 = LSTM(150, name="lstm_2")(lstm_1)

    # branch: event type
    et_1 = Dense(128, activation='relu', name="et_1")(lstm_2)
    et_2 = Dropout(0.5, name="et_2")(et_1)
    et_3 = Dense(64, activation='relu', name="et_3")(et_2)
    et_4 = Dropout(0.5, name="et_4")(et_3)
    event_type_output = Dense(len(config['eventtype_2_id']),
                              activation='softmax',
                              name="event_type_output")(et_4)

    # branch: time delay
    td_1 = Dense(128, activation='relu', name="td_1")(lstm_2)
    td_2 = Dropout(0.5, name="td_2")(td_1)
    td_3 = Dense(64, activation='relu', name="td_3")(td_2)
    td_4 = Dropout(0.5, name="td_4")(td_3)
    time_delay_output = Dense(1, activation='linear',
                              name="time_delay_output")(td_4)

    # branch: user cluster
    uc_1 = Dense(128, activation='relu', name="uc_1")(lstm_2)
    uc_2 = Dropout(0.5, name="uc_2")(uc_1)
    uc_3 = Dense(64, activation='relu', name="uc_3")(uc_2)
    uc_4 = Dropout(0.5, name="uc_4")(uc_3)
    user_cluster_output = Dense(config['dim_user_cluster_one_hot'],
                                activation='softmax',
                                name="user_cluster_output")(uc_4)

    # model
    model = Model(inputs=input_sequences,
                  outputs=[event_type_output,
                           time_delay_output,
                           user_cluster_output])
    model.summary(print_fn=logger.info)
    model.summary()
    model = multi_gpu_model(model, gpus=config['num_gpu'])
    model.compile(optimizer='adam',
                  loss=[binary_crossentropy, logcosh, binary_crossentropy],
                  loss_weights=config['loss_weights'],
                  metrics={'event_type_output': metrics.categorical_accuracy,
                           'time_delay_output': losses.mean_squared_error,
                           'user_cluster_output':
                               metrics.categorical_accuracy})

    # get partition and labels
    with open(os.path.join(config['exp_save_dir'], "dataset",
                           'partition.json'), 'r') as f:
        partition = json.load(f)
    with open(os.path.join(config['exp_save_dir'], "dataset",
                           'labels.json'), 'r') as f:
        labels = json.load(f)

    # train and validation generators
    training_generator = DataGenerator_ET_TD_UC_one_hot(
        os.path.join(config['exp_save_dir'], "dataset"),
        partition['train'], labels,
        batch_size=config['batch_size'],
        dim=x_dim,
        window_size=config['window_size'],
        et_classes=len(config['eventtype_2_id']),
        uc_classes=config['dim_user_cluster_one_hot'],
        shuffle=config['generator_shuffle'])
    validation_generator = DataGenerator_ET_TD_UC_one_hot(
        os.path.join(config['exp_save_dir'], "dataset"),
        partition['validation'], labels,
        batch_size=config['batch_size'],
        dim=x_dim,
        window_size=config['window_size'],
        et_classes=len(config['eventtype_2_id']),
        uc_classes=config['dim_user_cluster_one_hot'],
        shuffle=config['generator_shuffle'])

    # callbacks
    if not os.path.exists(os.path.join(
            config['exp_save_dir'], "models", "vis")):
        os.makedirs(os.path.join(
            config['exp_save_dir'], "models", "vis"))
    TensorBoard_callback = TensorBoard(
        log_dir=os.path.join(config['exp_save_dir'], "models", 'vis'),
        histogram_freq=0,
        write_graph=True,
        write_images=True,
        write_grads=False)
    """
    Keras TensorBoard reference: https://keras.io/callbacks/#tensorboard
    launch: tensorboard --logdir=/full_path_to_your_logs
    """
    callbacks = [EarlyStopping(monitor='val_loss',
                               patience=config['patience']),
                 ModelCheckpoint(
                     filepath=os.path.join(config['exp_save_dir'], "models",
                                           'model-{epoch:02d}.hdf5'),
                     monitor='val_loss', verbose=2),
                 TensorBoard_callback]

    # save train config in case testing needs it
    with open(os.path.join(config['exp_save_dir'], "models",
                           'train_config.pickle'), 'wb') as f:
        pickle.dump(config, f)
    print("{} saved!".format('train_config.pickle'))

    trainable_count = int(
        np.sum([K.count_params(p) for p in set(model.trainable_weights)]))
    non_trainable_count = int(
        np.sum([K.count_params(p) for p in set(model.non_trainable_weights)]))
    print('Total params: {:,}'.format(trainable_count + non_trainable_count))
    print('Trainable params: {:,}'.format(trainable_count))
    print('Non-trainable params: {:,}'.format(non_trainable_count))

    history = model.fit_generator(
        generator=training_generator,
        epochs=config['num_epochs'],
        callbacks=callbacks,
        validation_data=validation_generator,
        use_multiprocessing=True,
        workers=config['multiprocessing_cpu'],
        shuffle=True)
    """
    `shuffle` controls whether the order of the batches is shuffled at the
    beginning of each epoch. It is only used with instances of Sequence
    (keras.utils.Sequence) and has no effect when steps_per_epoch is not
    None, so it basically has no effect here.
    https://stackoverflow.com/questions/49027174/
    what-does-shuffle-do-in-fit-generator-in-keras
    """

    model.save(os.path.join(config['exp_save_dir'], "models",
                            'model_final.hdf5'))
    print("model_final.hdf5 saved!")

    with open(os.path.join(config['exp_save_dir'], "models",
                           'history_final.pickle'), 'wb') as f:
        pickle.dump(history.history, f)
    print("history_final.pickle saved!")
    return
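# Sketch of how the artifacts written by train_models might be reloaded for
# inference, assuming the same Keras version used for training. The paths
# mirror the save calls above; the dummy input and variable names are
# illustrative only.
# from keras.models import load_model
# with open(os.path.join(config['exp_save_dir'], "models",
#                        'train_config.pickle'), 'rb') as f:
#     train_config = pickle.load(f)
# final_model = load_model(os.path.join(config['exp_save_dir'], "models",
#                                       'model_final.hdf5'))
# et_prob, td_pred, uc_prob = final_model.predict(
#     np.zeros((1, train_config['window_size'], train_config['x_dim'])))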
def trainset_valiset_creation(config):
    # preparation
    create_dataset_start = time.time()

    if not os.path.exists(os.path.join(config['exp_save_dir'], "dataset")):
        os.makedirs(os.path.join(config['exp_save_dir'], "dataset"))

    logger = set_logger(os.path.join(
        config['exp_save_dir'],
        'trainset_valiset_creation_' +
        dt.now().strftime("%Y-%m-%dT%H-%M-%SZ") + '.log'))
    print_and_log(logger, '{}'.format('trainset and valiset creation...'))

    eventtype_2_id = config['eventtype_2_id']
    # id_2_eventtype = dict(zip(eventtype_2_id.values(),
    #                           eventtype_2_id.keys()))

    with open(config['github_event_repos_path'], 'rb') as f:
        github_event_repos = pickle.load(f)
    github_event_repos_set = set(
        [repo[:22] + '-' + repo[23:] for repo in github_event_repos])

    train_start = utc_timestamp(config['train_period']['start'])
    train_end = utc_timestamp(config['train_period']['end'])
    vali_start = utc_timestamp(config['vali_period']['start'])
    vali_end = utc_timestamp(config['vali_period']['end'])

    # read cascade files to get the whole dataset
    print_and_log(logger, "read cascade files...")

    x_dim = config['x_dim']

    if config['use_repo_embedding']:
        with open(config['embed_vec_path'], 'rb') as f:
            embed_vec_all = pickle.load(f)
    if config['use_repo_profile_features']:
        with open(config['repo_profile_feat_path'], 'r') as f:
            repo_profile_raw = json.load(f)
        repo_profile_feat = dict()
        for repo in repo_profile_raw:
            this_repo_profile_feat = []
            this_repo_profile_feat.append(
                time_delay_normalization(
                    repo_profile_raw[repo]['language'][0],
                    config['time_delay_normalization_func']))
            this_repo_profile_feat += repo_profile_raw[repo]['user_type']
            repo_profile_feat[repo] = this_repo_profile_feat
    if config['use_repo_idx_features']:
        if config['load_repo_path'] == 'null':
            with open(os.path.join(config['root_dir'], 'data',
                                   'unique_repo_train_vali.json'), 'r') as f:
                unique_repo_train_vali = json.load(f)
            unique_repo_list = list(unique_repo_train_vali.keys())
            load_repo_train_vali = dict()
            for i in range(len(unique_repo_list)):
                repo = unique_repo_list[i]
                load_repo_train_vali[repo] = [
                    time_delay_normalization(
                        i, config['time_delay_normalization_func'])]
            with open(os.path.join(config['root_dir'], 'data',
                                   'load_repo_train_vali.pickle'),
                      'wb') as f:
                pickle.dump(load_repo_train_vali, f)
        else:
            with open(config['load_repo_path'], 'rb') as f:
                load_repo_train_vali = pickle.load(f)
    if config['use_repo_activity_features']:
        max_sum_act_feat = 0
        max_sum_act_feat_vec = None
        with open(config['repo_activity_feat_path'], 'rb') as f:
            repo_act_feat = pickle.load(f)
        for repo in repo_act_feat:
            for cluster in repo_act_feat[repo]:
                this_repo_cluster_feat = []
                for feat_name in repo_act_feat[repo][cluster]:
                    this_repo_cluster_feat += repo_act_feat[
                        repo][cluster][feat_name]
                repo_act_feat[repo][cluster] = this_repo_cluster_feat
                if sum(this_repo_cluster_feat) > max_sum_act_feat:
                    max_sum_act_feat = sum(this_repo_cluster_feat)
                    max_sum_act_feat_vec = this_repo_cluster_feat
        print_and_log(logger, "max_sum_act_feat: {}".format(
            max_sum_act_feat))
        print_and_log(logger, "max_sum_act_feat_vec: {}".format(
            max_sum_act_feat_vec))
    if config['use_user_profile_features']:
        with open(config['user_profile_feat_path'], 'r') as f:
            user_profile_feat = json.load(f)
    if config['use_user_activity_features']:
        with open(config['user_activity_feat_path'], 'r') as f:
            user_act_feat = json.load(f)
    if config['use_user_cluster_one_hot']:
        with open(config['user_cluster_path'], 'r') as f:
            user_clusters = json.load(f)
        empty_no_event_user_cluster = max(user_clusters.values()) + 1
        user_clusters[config['empty_user']] = empty_no_event_user_cluster
        user_clusters['no_event_user'] = empty_no_event_user_cluster
    if config['use_user_cluster_minmax']:
        with open(config['user_cluster_path'], 'r') as f:
            user_clusters = json.load(f)
        max_cluster_id = max(user_clusters.values())
        min_cluster_id = min(user_clusters.values())
        empty_no_event_user_cluster = max_cluster_id + 1
        max_cluster_id = empty_no_event_user_cluster
        max_minus_min = max_cluster_id - min_cluster_id
        if min_cluster_id != 0:
            print("min cluster id is not 0! Need to examine code!")
            pdb.set_trace()
        user_clusters[config['empty_user']] = empty_no_event_user_cluster
        user_clusters['no_event_user'] = empty_no_event_user_cluster
    if config['use_cluster_profile_features']:
        max_sum_profile_feat = 0
        max_sum_profile_feat_vec = None
        with open(config['cluster_profile_feat_path'], 'rb') as f:
            cluster_profile_feat = pickle.load(f)
        for cluster in cluster_profile_feat:
            this_cluster_feat = []
            for feat_name in cluster_profile_feat[cluster]:
                # if feat_name == "geolocation" or feat_name == "user_type":
                #     continue
                this_cluster_feat += cluster_profile_feat[cluster][feat_name]
            cluster_profile_feat[cluster] = this_cluster_feat
            if sum(this_cluster_feat) > max_sum_profile_feat:
                max_sum_profile_feat = sum(this_cluster_feat)
                max_sum_profile_feat_vec = this_cluster_feat
        cluster_profile_feat[empty_no_event_user_cluster] = [0] * config[
            'dim_cluster_profile_features']
        print_and_log(logger, "max_sum_profile_feat: {}".format(
            max_sum_profile_feat))
        print_and_log(logger, "max_sum_profile_feat_vec: {}".format(
            max_sum_profile_feat_vec))
    if config['use_cluster_activity_features']:
        max_sum_act_feat = 0
        max_sum_act_feat_vec = None
        with open(config['cluster_activity_feat_path'], 'rb') as f:
            cluster_act_feat = pickle.load(f)
        for cluster in cluster_act_feat:
            for repo in cluster_act_feat[cluster]:
                this_cluster_repo_feat = []
                for feat_name in cluster_act_feat[cluster][repo]:
                    this_cluster_repo_feat += cluster_act_feat[
                        cluster][repo][feat_name]
                cluster_act_feat[cluster][repo] = this_cluster_repo_feat
                if sum(this_cluster_repo_feat) > max_sum_act_feat:
                    max_sum_act_feat = sum(this_cluster_repo_feat)
                    max_sum_act_feat_vec = this_cluster_repo_feat
        print_and_log(logger, "max_sum_act_feat: {}".format(
            max_sum_act_feat))
        print_and_log(logger, "max_sum_act_feat_vec: {}".format(
            max_sum_act_feat_vec))

    print_and_log(logger, "x_dim: {}".format(x_dim))
    # pdb.set_trace()

    partition = dict()
    partition['train'] = []
    partition['validation'] = []
    labels = dict()
    labels['event_type'] = dict()
    labels['time_delay'] = dict()
    labels['user_cluster'] = dict()
    repo2sample = dict()

    repo_list = [f[:-8] for f in os.listdir(
        config['cascade_dir']) if '.json.gz' in f]

    sample_id = 0
    user_has_no_cluster = set()
    unique_repo_train_vali = {}
    unique_user_train_vali = {}
    # each_repo_chain_length = dict()
    # each_user_total_event_count = dict()
    # each_cluster_total_event_count = dict()
    # each_event_type_total_count = dict()
    for repo_idx in range(len(repo_list)):
        repo = repo_list[repo_idx]
        print('processing {}, {}/{}'.format(
            round(repo_idx / len(repo_list), 2),
            repo_idx, len(repo_list)), end='\r')

        if (repo in config['repos_to_ignore']) or (
                repo in github_event_repos_set):
            # it is an event repo or a repo that should be ignored
            continue

        repo2sample[repo] = dict()
        repo2sample[repo]['train'] = list()
        repo2sample[repo]['validation'] = list()

        one_chain_pd = load_jsongz_2_dataframe(
            os.path.join(config['cascade_dir'], repo + '.json.gz'))
        if len(one_chain_pd.loc[
                (one_chain_pd['nodeTime'] >= train_start) &
                (one_chain_pd['nodeTime'] <= train_end)]) == 0:
            # this repo has no events in the training period
            continue
        # if repo == 'Pg6sypDT02F199RR24XAGw-FNnptRvRMDUIs9HwCqbS6A':
        #     sim_start = utc_timestamp(config['sim_period']['start'])
        #     sim_end = utc_timestamp(config['sim_period']['end'])
        #
        #     tem_a = one_chain_pd.loc[
        #         (one_chain_pd['nodeTime'] >= train_start) &
        #         (one_chain_pd['nodeTime'] <= vali_end)]
        #     tem_a_time = []
        #     tem_a_time += [time for time in tem_a['nodeTime']]
        #
        #     tem_b = one_chain_pd.loc[
        #         (one_chain_pd['nodeTime'] >= sim_start) &
        #         (one_chain_pd['nodeTime'] <= sim_end)]
        #     tem_b_time = []
        #     tem_b_time += [time for time in tem_b['nodeTime']]
        #     pdb.set_trace()

        #############################
        # one_chain_event = []
        # one_chain_user = []
        # # event sequence
        # one_chain_event += [event for event in one_chain_pd['actionType']]
        # one_chain_user += [user for user in one_chain_pd['nodeUserID']]
        # each_repo_chain_length[repo] = len(one_chain_event)
        # for user in one_chain_user:
        #     try:
        #         each_user_total_event_count[user] += 1
        #     except KeyError:
        #         each_user_total_event_count[user] = 1
        # for event in one_chain_event:
        #     try:
        #         each_event_type_total_count[event] += 1
        #     except KeyError:
        #         each_event_type_total_count[event] = 1
        # continue
        #############################

        one_chain_pd = one_chain_pd.loc[
            (one_chain_pd['nodeTime'] >= train_start) &
            (one_chain_pd['nodeTime'] <= vali_end)]
        one_chain_pd = one_chain_pd.sort_values(by=['nodeTime'])

        one_chain_event = []
        one_chain_time = []
        one_chain_user = []

        # padding events
        for i in range(config['window_size']):
            one_chain_event.append(config['empty_event_type'])
            one_chain_time.append(config['empty_time_delay'])
            one_chain_user.append(config['empty_user'])
        # <soc>
        one_chain_event.append(eventtype_2_id['<soc>'])
        one_chain_time.append(config['empty_time_delay'])
        one_chain_user.append(config['empty_user'])
        # event sequence
        one_chain_event += [eventtype_2_id[
            event] for event in one_chain_pd['actionType']]
        one_chain_time += [time for time in one_chain_pd['nodeTime']]
        one_chain_user += [user for user in one_chain_pd['nodeUserID']]

        (one_chain_event_new, one_chain_time_new, one_chain_user_new) = \
            insert_no_event_for_a_chain_new(config,
                                            one_chain_event,
                                            one_chain_time,
                                            one_chain_user,
                                            vali_end + 1)
        # if one_chain_event_new != one_chain_event:
        #     pdb.set_trace()
        one_chain_event = one_chain_event_new
        one_chain_time = one_chain_time_new
        one_chain_user = one_chain_user_new

        """
        one_chain_event = one_chain_event_new[21:]
        one_chain_time = one_chain_time_new[21:]
        one_chain_user = one_chain_user_new[21:]
        one_chain_cluster = [user_clusters[user] for user in one_chain_user]
        for cluster in one_chain_cluster:
            try:
                each_cluster_total_event_count[cluster] += 1
            except KeyError:
                each_cluster_total_event_count[cluster] = 1
        for event in one_chain_event:
            try:
                each_event_type_total_count[event] += 1
            except KeyError:
                each_event_type_total_count[event] = 1
        continue
        """

        # calculate time delay sequence
        one_chain_time_delay = []
        for i in range(len(one_chain_time)):
            if (one_chain_event[i] == config['empty_event_type'] or
                    one_chain_event[i] == eventtype_2_id['<soc>']):
                one_chain_time_delay.append(config['empty_time_delay'])
            elif one_chain_event[i-1] == eventtype_2_id['<soc>']:
                one_chain_time_delay.append(config['empty_time_delay'])
            else:
                time_delay = get_time_delay(one_chain_time[i-1],
                                            one_chain_time[i],
                                            'float')[1]
                if config['time_delay_normalization_func'] is not None:
                    time_delay = time_delay_normalization(
                        time_delay,
                        config['time_delay_normalization_func'])
                one_chain_time_delay.append(time_delay)

        # get the unique repos and users in the training cascades
        unique_repo_train_vali[repo] = []
        for user in one_chain_pd['nodeUserID']:
            unique_user_train_vali[user] = []

        # for each sample
        for i in range(config['window_size'], len(one_chain_event)):
            sample_id += 1
            ID = 'id-' + str(sample_id)
            # print(ID)
            # pdb.set_trace()

            time_sample_outputevent = one_chain_time[i]
            event_sample_outputevent = one_chain_event[i]

            # if time_sample_outputevent is in the training period:
            # add this sample to the trainset
            if event_sample_outputevent == config['empty_event_type'] or (
                    event_sample_outputevent == eventtype_2_id['<soc>']):
                continue
            if one_chain_event[i-1] == eventtype_2_id['<soc>']:
                continue
            if not ((time_sample_outputevent >= train_start) and (
                    time_sample_outputevent <= (vali_end))):
                print("should not happen")
                pdb.set_trace()

            input_event_type = one_chain_event[i-config['window_size']:i]
            input_time = one_chain_time[i-config['window_size']:i]
            input_time_delay = \
                one_chain_time_delay[i-config['window_size']:i]
            input_user = one_chain_user[i-config['window_size']:i]
            input_cluster = []
            for user in input_user:
                try:
                    input_cluster.append(user_clusters[user])
                except KeyError:
                    user_has_no_cluster.add(user)
                    input_cluster.append(user_clusters['no_event_user'])

            output_event_type = one_chain_event[i]
            output_time_delay = one_chain_time_delay[i]
            output_user = one_chain_user[i]
            try:
                output_cluster = user_clusters[output_user]
            except KeyError:
                user_has_no_cluster.add(output_user)
                output_cluster = user_clusters['no_event_user']
            """
            if (config['use_user_cluster_one_hot']) and (
                    not config['use_user_cluster_minmax']):
                try:
                    output_cluster = user_clusters[output_user]
                except KeyError:
                    user_has_no_cluster.add(output_user)
                    output_cluster = user_clusters['no_event_user']
            else:
                try:
                    output_cluster = (
                        user_clusters[output_user] / max_minus_min)
                except KeyError:
                    user_has_no_cluster.add(output_user)
                    output_cluster = (
                        user_clusters['no_event_user'] / max_minus_min)
            """

            # initialize input vector, and output vector for this sample
            x_vec = []

            # load repo embedding vector
            if config['use_repo_embedding']:
                try:
                    embed_vec = np.array(
                        embed_vec_all[repo[:22] + '/' + repo[23:]])
                except KeyError:
                    print_and_log(logger, "Could not find "
                                  "embedding vector for {}!".format(
                                      repo[:22] + '/' + repo[23:]))
                    pdb.set_trace()

            # input feature vector
            for j in range(config['window_size']):  # for each event node
                x_j = []
                if config['use_repo_embedding']:
                    x_j += list(embed_vec)
                if config['use_repo_profile_features']:
                    try:
                        x_j += repo_profile_feat[repo]
                    except KeyError:
                        x_j += [0] * config['dim_repo_profile_features']
                if config['use_repo_idx_features']:
                    try:
                        x_j += load_repo_train_vali[repo]
                    except KeyError:
                        x_j += [0]
                if config['use_repo_activity_features']:
                    if input_cluster[j] == empty_no_event_user_cluster:
                        x_j += [0] * config['dim_repo_activity_features']
                    else:
                        try:
                            repo_thiscluster_act_feat = repo_act_feat[
                                repo][input_cluster[j]]
                            repo_allcluster_act_feat = repo_act_feat[
                                repo]['all_cluster']
                            x_j += repo_thiscluster_act_feat
                            x_j += repo_allcluster_act_feat
                        except KeyError:
                            x_j += [0] * config[
                                'dim_repo_activity_features']
                if config['use_user_profile_features']:
                    if input_user[j] == config['empty_user'] or (
                            input_user[j] == 'no_event_user'):
                        x_j += [0] * config['dim_user_profile_features']
                    else:
                        try:
                            x_j += user_profile_feat[input_user[j]]
                        except KeyError:
                            x_j += [0] * config[
                                'dim_user_profile_features']
                if config['use_user_activity_features']:
                    if input_user[j] == config['empty_user'] or (
                            input_user[j] == 'no_event_user'):
                        x_j += [0] * config['dim_user_activity_features']
                    else:
                        try:
                            thisrepo_feat = \
                                user_act_feat[input_user[j]][repo]
                        except KeyError:
                            # this user-repo pair has no event in the
                            # training period
                            thisrepo_feat = \
                                [0] * int(config[
                                    'dim_user_activity_features'] / 2)
                        allrepo_feat = \
                            user_act_feat[input_user[j]]['all']
                        x_j += thisrepo_feat + allrepo_feat
                if config['use_event_type_one_hot']:
                    event_type_one_hot = \
                        [0] * len(config['eventtype_2_id'])
                    if input_event_type[j] != config['empty_event_type']:
                        event_type_one_hot[input_event_type[j]-1] = 1
                    x_j += event_type_one_hot
                if config['use_time_delay_features']:
                    x_j += [input_time_delay[j]]
                if config['use_user_cluster_one_hot']:
                    user_cluster_one_hot = \
                        [0] * config['dim_user_cluster_one_hot']
                    user_cluster_one_hot[input_cluster[j]] = 1
                    x_j += user_cluster_one_hot
                if config['use_user_cluster_minmax']:
                    use_user_cluster_minmax = (
                        input_cluster[j] / max_minus_min)
                    x_j += [use_user_cluster_minmax]
                if config['use_cluster_profile_features']:
                    this_cluster_profile_feat = cluster_profile_feat[
                        input_cluster[j]]
                    x_j += this_cluster_profile_feat
                if config['use_cluster_activity_features']:
                    if input_cluster[j] == empty_no_event_user_cluster:
                        x_j += [0] * config[
                            'dim_cluster_activity_features']
                    else:
                        try:
                            cluster_thisrepo_act_feat = cluster_act_feat[
                                input_cluster[j]][repo]
                            cluster_allrepo_act_feat = cluster_act_feat[
                                input_cluster[j]]['all_repo']
                            x_j += cluster_thisrepo_act_feat
                            x_j += cluster_allrepo_act_feat
                        except KeyError:
                            # possibility one:
                            # the cluster only did the first event for this
                            # repo; since no time delay exists, we did not
                            # have activity features for this cluster-repo
                            # pair
                            # possibility two:
                            # the cluster only did events for this repo in
                            # the validation period
                            if not (input_time_delay[j] == -1 or (
                                    time_sample_outputevent > (
                                        train_end + 1))):
                                print(j, input_time)
                                pdb.set_trace()
                            x_j += [0] * config[
                                'dim_cluster_activity_features']
                if len(x_j) != x_dim:
                    print("len(x_j) != x_dim")
                    pdb.set_trace()
                x_vec.append(x_j)

            if len(x_vec) != config['window_size']:
                print("len(x_vec) != config['window_size']")
                pdb.set_trace()

            if (time_sample_outputevent >= train_start) and (
                    time_sample_outputevent <= train_end):
                partition['train'].append(ID)
                labels['event_type'][ID] = output_event_type - 1
                labels['time_delay'][ID] = output_time_delay
                labels['user_cluster'][ID] = output_cluster
                np.save(os.path.join(config['exp_save_dir'], "dataset",
                                     ID + '.npy'), x_vec)
                repo2sample[repo]['train'].append(ID)
            elif (time_sample_outputevent >= vali_start) and (
                    time_sample_outputevent <= vali_end):
                partition['validation'].append(ID)
                labels['event_type'][ID] = output_event_type - 1
                labels['time_delay'][ID] = output_time_delay
                labels['user_cluster'][ID] = output_cluster
                np.save(os.path.join(config['exp_save_dir'], "dataset",
                                     ID + '.npy'), x_vec)
                repo2sample[repo]['validation'].append(ID)
            else:
                print_and_log(logger, "time_sample_outputevent not in "
                              "training or validation period!")
                pdb.set_trace()

    # with open(os.path.join(config['root_dir'], "data",
    #                        'each_repo_chain_length.pickle'), 'wb') as f:
    #     pickle.dump(each_repo_chain_length, f)
    # with open(os.path.join(config['root_dir'], "data",
    #                        'each_user_total_event_count.pickle'),
    #           'wb') as f:
    #     pickle.dump(each_user_total_event_count, f)
    # with open(os.path.join(config['root_dir'], "data",
    #                        'each_event_type_total_count.pickle'),
    #           'wb') as f:
    #     pickle.dump(each_event_type_total_count, f)
    # pdb.set_trace()

    print_and_log(logger, "number of chains used for "
                  "training and validation: {}".format(
                      len(unique_repo_train_vali)))
    print_and_log(logger, "could not find cluster for {} users.".format(
        len(user_has_no_cluster)))

    # et_pos, et_neg, uc_pos, uc_neg
    """
    number of events with that event type / the total number of events
    number of events without that event type / total number of events
    number of events with this user cluster / total number of events
    number of events without this user cluster / total number of events
    et_pos = []
    et_neg = []
    uc_pos = []
    uc_neg = []
    et_ids = sorted(list(id_2_eventtype.keys()))
    uc_ids = list(range(101))
    for i in et_ids:
        et_pos.append(0)
        et_neg.append(0)
    for i in uc_ids:
        uc_pos.append(0)
        uc_neg.append(0)
    total_events = 0
    for e in each_event_type_total_count:
        total_events += each_event_type_total_count[e]
    for i in et_ids:
        if i == 11:
            continue
        et_pos[i-1] = each_event_type_total_count[i]/total_events
        et_neg[i-1] = (total_events - each_event_type_total_count[
            i])/total_events
    et_neg[10] = float(1)
    for i in uc_ids:
        uc_pos[i] = each_cluster_total_event_count[i]/total_events
        uc_neg[i] = (total_events - each_cluster_total_event_count[
            i])/total_events
    with open(os.path.join(config['root_dir'], "data",
                           'et_pos.json'), 'w') as f:
        json.dump(et_pos, f)
    with open(os.path.join(config['root_dir'], "data",
                           'et_neg.json'), 'w') as f:
        json.dump(et_neg, f)
    with open(os.path.join(config['root_dir'], "data",
                           'uc_pos.json'), 'w') as f:
        json.dump(uc_pos, f)
    with open(os.path.join(config['root_dir'], "data",
                           'uc_neg.json'), 'w') as f:
        json.dump(uc_neg, f)
    pdb.set_trace()
    """

    if len(user_has_no_cluster) > 0:
        with open(os.path.join(config['exp_save_dir'], "dataset",
                               'user_has_no_cluster.pickle'), 'wb') as f:
            pickle.dump(user_has_no_cluster, f)

    with open(os.path.join(config['exp_save_dir'], "dataset",
                           'partition.json'), 'w') as f:
        json.dump(partition, f)
    with open(os.path.join(config['exp_save_dir'], "dataset",
                           'repo2sample.pickle'), 'wb') as f:
        pickle.dump(repo2sample, f)
    with open(os.path.join(config['exp_save_dir'], "dataset",
                           'unique_repo_train_vali.json'), 'w') as f:
        json.dump(unique_repo_train_vali, f)
    with open(os.path.join(config['exp_save_dir'], "dataset",
                           'unique_user_train_vali.json'), 'w') as f:
        json.dump(unique_user_train_vali, f)

    df = pd.DataFrame(labels)
    df.to_json(os.path.join(config['exp_save_dir'], "dataset",
                            'labels.json'))

    print_and_log(logger, "the number of training samples: {}".format(
        len(partition['train'])))
    print_and_log(logger, "the number of validation samples: {}".format(
        len(partition['validation'])))
    print_and_log(logger, "the number of unique repos in training and "
                  "validation samples: {}".format(
                      len(unique_repo_train_vali)))
    print_and_log(logger, "the number of unique users in training and "
                  "validation samples: {}".format(
                      len(unique_user_train_vali)))
    print_and_log(logger, "{} took {} min".format(
        "trainset valiset creation",
        (time.time() - create_dataset_start) / 60))
    # pdb.set_trace()
    return (partition, labels)
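# time_delay_normalization and get_time_delay used above are defined
# elsewhere in this repo. The sketch below only illustrates the assumed
# behaviour of the normalization step (squashing hour-scale delays according
# to the function name passed from the config); the name and the supported
# option strings are assumptions, not the original code. It reuses the
# module-level numpy import (np).
def _time_delay_normalization_sketch(value, func_name):
    if func_name == 'log10_xplus1':
        # compress large delays logarithmically
        return float(np.log10(value + 1))
    if func_name == 'div_by_constant':
        # simple linear rescaling with an assumed constant
        return value / 100.0
    return value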
def time_delay_overall_evaluation(config, logger, result, result_save_path,
                                  plot_ts=True, chain_length_eval=True):
    print_and_log(logger, "====================================")
    print_and_log(logger, "time delay evaluation:")

    # statistics
    pred_all = []
    gt_all = []
    avg_dtw = []
    avg_mse = []
    result["td_DTW"] = list()
    result["td_MSE"] = list()
    for i in range(len(result['chain_name'])):
        pred_time_delay = result['pred_all_time_delay'][i]
        gt_time_delay = result['gt_all_time_delay'][i]
        if len(pred_time_delay) == 0:
            pred_time_delay = [-1]
        if len(gt_time_delay) == 0:
            gt_time_delay = [-1]
        avg_dtw.append(fastdtw(gt_time_delay, pred_time_delay)[0])
        result["td_DTW"].append(avg_dtw[-1])
        if len(gt_time_delay) == len(pred_time_delay):
            avg_mse.append(mean_squared_error(gt_time_delay,
                                              pred_time_delay))
            result["td_MSE"].append(avg_mse[-1])
        else:
            result["td_MSE"].append('null')
        if len(result['pred_all_time_delay'][i]) != 0:
            pred_all += pred_time_delay
        if len(result['gt_all_time_delay'][i]) != 0:
            gt_all += gt_time_delay
    print_and_log(logger,
                  "Average DTW: {}".format(round(np.mean(avg_dtw), 4)))
    if config['given_gt']:
        print_and_log(logger, "Average MSE: {}".format(np.mean(avg_mse)))
    print_and_log(
        logger, "MAX predicted: {}, ground truth: {}".format(
            round(max(pred_all), 4), round(max(gt_all), 4)))
    print_and_log(
        logger, "MIN predicted: {}, ground truth: {}".format(
            round(min(pred_all), 4), round(min(gt_all), 4)))
    print_and_log(
        logger, "MEAN predicted: {}, ground truth: {}".format(
            round(np.mean(pred_all), 4), round(np.mean(gt_all), 4)))
    print_and_log(
        logger, "STD predicted: {}, ground truth: {}".format(
            round(np.std(pred_all), 4), round(np.std(gt_all), 4)))

    # chain length evaluation
    if chain_length_eval:
        length_mae = []
        length_stat = dict()
        length_stat["gt_chain_0"] = 0
        length_stat["gt_chain_1"] = 0
        length_stat["Same_as_gt"] = 0
        length_stat["diff_1_to_10"] = 0
        length_stat["diff_10_to_100"] = 0
        length_stat["diff_100+"] = 0
        if 'chains_applied_keep_pred' in result:
            length_stat["applied_threshold"] = len(
                result["chains_applied_keep_pred"])

    sim_start = config['sim_period']['start'].split('T')[0]
    sim_end = config['sim_period']['end'].split('T')[0]

    if plot_ts:
        time_delay_plot_save_path = os.path.join(result_save_path,
                                                 "time_delay_plot")
        if not os.path.exists(time_delay_plot_save_path):
            os.makedirs(time_delay_plot_save_path)

    if chain_length_eval or plot_ts:
        for i in range(len(result['chain_name'])):
            chain = result['chain_name'][i]
            pred_time_delay = result['pred_all_time_delay'][i]
            gt_time_delay = result['gt_all_time_delay'][i]
            if plot_ts:
                plot_time_delay_ts_for_one_chain(chain,
                                                 time_delay_plot_save_path,
                                                 pred_time_delay,
                                                 gt_time_delay,
                                                 sim_start, sim_end)
            if chain_length_eval:
                length_mae.append(
                    abs(len(pred_time_delay) - len(gt_time_delay)))
                if len(gt_time_delay) == 0:
                    length_stat["gt_chain_0"] += 1
                if len(gt_time_delay) == 1:
                    length_stat["gt_chain_1"] += 1
                if len(pred_time_delay) == len(gt_time_delay):
                    length_stat["Same_as_gt"] += 1
                if abs(len(pred_time_delay) - len(gt_time_delay)) < 10 and (
                        abs(len(pred_time_delay) -
                            len(gt_time_delay)) >= 1):
                    length_stat["diff_1_to_10"] += 1
                if abs(len(pred_time_delay) - len(gt_time_delay)) < 100 and (
                        abs(len(pred_time_delay) -
                            len(gt_time_delay)) >= 10):
                    length_stat["diff_10_to_100"] += 1
                if abs(len(pred_time_delay) - len(gt_time_delay)) >= 100:
                    length_stat["diff_100+"] += 1
        if chain_length_eval:
            length_mae = np.mean(length_mae)

    if chain_length_eval:
        print_and_log(logger, "====================================")
        print_and_log(logger, "chain length evaluation:")
print_and_log(logger, "MAE: {}".format(round(length_mae, 4))) print_and_log( logger, "Count of number of simulated " "chains: {}".format(len(result['chain_name']))) print_and_log( logger, "Count of number of chains whose " "ground truth length is 0: {}".format(length_stat["gt_chain_0"])) print_and_log( logger, "Count of number of chains whose " "ground truth length is 1: {}".format(length_stat["gt_chain_1"])) if 'chains_applied_keep_pred' in result: print_and_log( logger, "Count of number of predicted chains that " "length needed threshold to be applied: {}, " "percentage: {} ".format( length_stat["applied_threshold"], round( length_stat["applied_threshold"] / len(result['chain_name']), 4))) print_and_log( logger, "Count of number of predicted " "chains that has " "same length as ground truth" ": {}, percentage: {}".format( length_stat["Same_as_gt"], round(length_stat["Same_as_gt"] / len(result['chain_name']), 4))) print_and_log( logger, "Count of number of predicted chains that " "length difference is 1 to 10: {}," "percentage: {}".format( length_stat["diff_1_to_10"], round(length_stat["diff_1_to_10"] / len(result['chain_name']), 4))) print_and_log( logger, "Count of number of predicted chains that " "length difference is 10 to 100: {}, " "percentage: {}".format( length_stat["diff_10_to_100"], round( length_stat["diff_10_to_100"] / len(result['chain_name']), 4))) print_and_log( logger, "Count of number of predicted chains that " "length difference is 100 and above: {}, " "percentage: {}".format( length_stat["diff_100+"], round(length_stat["diff_100+"] / len(result['chain_name']), 4))) return result
def user_cluster_nlg_eval(config, logger, result):
    print_and_log(logger, "====================================")
    print_and_log(logger, "user cluster average bleu scores:")
    # print_and_log(logger, "Please install the nlg-eval package!\n"
    #               "Reference: https://github.com/Maluuba/nlg-eval")
    # print_and_log(logger, "After installing, please change the package "
    #               "__init__.py file (contact: [email protected]).")
    sys.path.append(config['nlgeval_repo_dir'])
    from nlgeval import compute_individual_metrics

    # average bleu
    avg_bleu = dict()
    avg_bleu['Bleu_1'] = list()
    avg_bleu['Bleu_2'] = list()
    avg_bleu['Bleu_3'] = list()
    avg_bleu['Bleu_4'] = list()
    result['uc_bleu1'] = list()
    result['uc_bleu2'] = list()
    result['uc_bleu3'] = list()
    result['uc_bleu4'] = list()
    for i in range(len(result['chain_name'])):
        if len(result['gt_all_user_cluster'][i]) == 0:
            gt_chain = " ".join(['no_event_in_simperiod'])
        else:
            gt_chain = " ".join(
                [str(ele) for ele in result['gt_all_user_cluster'][i]])
        if len(result['pred_all_user_cluster'][i]) == 0:
            hy_chain = " ".join(['no_event_in_simperiod'])
        else:
            hy_chain = " ".join(
                [str(ele) for ele in result['pred_all_user_cluster'][i]])
        metrics_dict = compute_individual_metrics(gt_chain, hy_chain,
                                                  no_overlap=(False, True),
                                                  no_skipthoughts=True,
                                                  no_glove=True)
        result['uc_bleu1'].append(metrics_dict['Bleu_1'])
        avg_bleu['Bleu_1'].append(metrics_dict['Bleu_1'])
        if len(result['gt_all_user_cluster'][i]) >= 2:
            # and (len(result['pred_all_user_cluster'][i]) >= 2):
            result['uc_bleu2'].append(metrics_dict['Bleu_2'])
            avg_bleu['Bleu_2'].append(metrics_dict['Bleu_2'])
        else:
            result['uc_bleu2'].append('null')
        if len(result['gt_all_user_cluster'][i]) >= 3:
            # and (len(result['pred_all_user_cluster'][i])):
            result['uc_bleu3'].append(metrics_dict['Bleu_3'])
            avg_bleu['Bleu_3'].append(metrics_dict['Bleu_3'])
        else:
            result['uc_bleu3'].append('null')
        if len(result['gt_all_user_cluster'][i]) >= 4:
            # and (len(result['pred_all_user_cluster'][i]) >= 4):
            result['uc_bleu4'].append(metrics_dict['Bleu_4'])
            avg_bleu['Bleu_4'].append(metrics_dict['Bleu_4'])
        else:
            result['uc_bleu4'].append('null')
    for metric in avg_bleu:
        print_and_log(
            logger, "{}: {}".format(
                metric, round(np.average(avg_bleu[metric]), 4)))
        # print_and_log(logger, "{}: {}, calculated from {} values".format(
        #     metric, round(np.average(avg_bleu[metric]), 4),
        #     len(avg_bleu[metric])))
    # pdb.set_trace()
    return result
def result_evaluation_given_gt(config, result_save_path,
                               only_has_event=False):
    # load result pickle
    with open(os.path.join(result_save_path,
                           'result.pickle'), 'rb') as handle:
        result = pickle.load(handle)
    print('result.pickle loaded!')
    print("result.keys: {}\n\n".format(result.keys()))

    if only_has_event:
        result_new = dict()
        result_new['chain_name'] = list()
        result_new['pred_all_event_id'] = list()
        result_new['pred_all_event_type'] = list()
        result_new['pred_all_time_delay'] = list()
        result_new['pred_all_user_cluster'] = list()
        result_new['gt_all_event_id'] = list()
        result_new['gt_all_event_type'] = list()
        result_new['gt_all_time_delay'] = list()
        result_new['gt_all_user_cluster'] = list()
        for i in range(len(result['chain_name'])):
            if len(result['gt_all_event_id'][i]) != 0:
                result_new['chain_name'].append(result['chain_name'][i])
                result_new['pred_all_event_id'].append(
                    result['pred_all_event_id'][i])
                result_new['pred_all_event_type'].append(
                    result['pred_all_event_type'][i])
                result_new['pred_all_time_delay'].append(
                    result['pred_all_time_delay'][i])
                result_new['pred_all_user_cluster'].append(
                    result['pred_all_user_cluster'][i])
                result_new['gt_all_event_id'].append(
                    result['gt_all_event_id'][i])
                result_new['gt_all_event_type'].append(
                    result['gt_all_event_type'][i])
                result_new['gt_all_time_delay'].append(
                    result['gt_all_time_delay'][i])
                result_new['gt_all_user_cluster'].append(
                    result['gt_all_user_cluster'][i])
        result = result_new

    # logger
    if only_has_event:
        logger = set_logger(os.path.join(
            result_save_path,
            'evaluate_only_has_event_given_gt_' +
            dt.now().strftime("%Y-%m-%dT%H-%M-%SZ") + '.log'))
    else:
        logger = set_logger(os.path.join(
            result_save_path,
            'evaluate_all_given_gt_' +
            dt.now().strftime("%Y-%m-%dT%H-%M-%SZ") + '.log'))
    print_and_log(logger, "Evaluation over {} simulated chains...".format(
        len(result['chain_name'])))

    # evaluation processes
    if config['event_type_nlg_eval']:
        result = event_type_nlg_eval(config, logger, result)
    if config['event_type_map_eval']:
        result = event_type_map_eval_given_gt(config, logger, result)
        result = event_type_categorical_accuracy_eval_given_gt(
            config, logger, result)
    if config['event_type_percentage_eval']:
        result = event_type_percentage_eval(config, logger, result)
    if config['user_cluster_nlg_eval']:
        result = user_cluster_nlg_eval(config, logger, result)
    if config['user_cluster_map_eval']:
        result = user_cluster_map_eval_given_gt(config, logger, result)
        result = user_cluster_categorical_accuracy_eval_given_gt(
            config, logger, result)
    if config['user_cluster_percentage_eval']:
        result = user_cluster_percentage_eval(config, logger, result)
    if config['time_delay_overall_evaluation']:
        if not only_has_event:
            result = time_delay_overall_evaluation(
                config, logger, result, result_save_path,
                plot_ts=config['plot_ts'], chain_length_eval=False)
        else:
            result = time_delay_overall_evaluation(
                config, logger, result, result_save_path,
                plot_ts=False, chain_length_eval=False)

    write_result_to_file(config, result, logger)
    del logger
    return
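# Hedged driver sketch for the evaluation entry point above. The result
# directory path is illustrative; `config` is the same config dict used
# throughout this file, with the *_eval flags checked above set as needed.
# result_evaluation_given_gt(
#     config,
#     os.path.join(config['exp_save_dir'], 'result'),
#     only_has_event=True)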
def user_cluster_percentage_eval(config, logger, result):
    print_and_log(logger, "====================================")
    print_and_log(logger, "user cluster distribution evaluation:")
    gt_class_list = []
    pred_class_list = []
    for i in range(len(result['chain_name'])):
        gt_class_list += result['gt_all_user_cluster'][i]
        pred_class_list += result['pred_all_user_cluster'][i]
    gt_class_list_counter = Counter(gt_class_list)
    pred_class_list_counter = Counter(pred_class_list)

    clusters = list(range(100 + 1))
    counts_per_class = []
    for i in range(len(clusters)):
        counts_per_class.append(gt_class_list_counter[i])
    gt_distribution = cal_distribution(counts_per_class)
    counts_per_class = []
    for i in range(len(clusters)):
        counts_per_class.append(pred_class_list_counter[i])
    pred_distribution = cal_distribution(counts_per_class)

    print_and_log(logger, "!!!! ground truth distribution: ")
    for i in range(len(clusters)):
        print_and_log(logger,
                      "{}: {}".format(i, round(gt_distribution[i], 4)))
    print_and_log(logger, "!!!! prediction distribution: ")
    for i in range(len(clusters)):
        print_and_log(logger,
                      "{}: {}".format(i, round(pred_distribution[i], 4)))
    return result
def event_type_percentage_eval(config, logger, result):
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type distribution evaluation:")
    gt_class_list = []
    pred_class_list = []
    for i in range(len(result['chain_name'])):
        gt_class_list += result['gt_all_event_type'][i]
        pred_class_list += result['pred_all_event_type'][i]
    gt_class_list_counter = Counter(gt_class_list)
    pred_class_list_counter = Counter(pred_class_list)

    eventtype_2_id = dict()
    for key in config['eventtype_2_id']:
        eventtype_2_id[key] = config['eventtype_2_id'][key] - 1
    id_2_eventtype = dict(zip(eventtype_2_id.values(),
                              eventtype_2_id.keys()))
    # pdb.set_trace()

    counts_per_class = []
    for i in range(len(id_2_eventtype)):
        et = id_2_eventtype[i]
        counts_per_class.append(gt_class_list_counter[et])
    gt_distribution = cal_distribution(counts_per_class)
    counts_per_class = []
    for i in range(len(id_2_eventtype)):
        et = id_2_eventtype[i]
        counts_per_class.append(pred_class_list_counter[et])
    pred_distribution = cal_distribution(counts_per_class)

    print_and_log(logger, "!!!! ground truth distribution: ")
    for i in range(len(id_2_eventtype)):
        et = id_2_eventtype[i]
        print_and_log(logger,
                      "{}: {}".format(et, round(gt_distribution[i], 4)))
    print_and_log(logger, "!!!! prediction distribution: ")
    for i in range(len(id_2_eventtype)):
        et = id_2_eventtype[i]
        print_and_log(logger,
                      "{}: {}".format(et, round(pred_distribution[i], 4)))
    return result
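# cal_distribution used by the two percentage evaluators above is defined
# elsewhere in this repo. The sketch below shows the assumed behaviour
# (per-class counts -> fractions of the total); the name
# _cal_distribution_sketch is hypothetical.
def _cal_distribution_sketch(counts_per_class):
    total = sum(counts_per_class)
    if total == 0:
        # avoid division by zero when no events were observed
        return [0.0] * len(counts_per_class)
    return [c / total for c in counts_per_class]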