def load_data_from_pickle(self):
    print('Loading Preprocessed Data...')
    x_train = utils.load_pkl_to_data(self.preprocessed_path + 'x_train_gan.p')
    x_test = utils.load_pkl_to_data(self.preprocessed_path + 'x_test_gan.p')
    return x_train, x_test
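# Hedged usage sketch: the owning instance and its `preprocessed_path`
# attribute are assumptions not shown here; the two pickles are the ones
# written by the GAN preprocessing step.
def _example_load_gan_data(model):
    x_train, x_test = model.load_data_from_pickle()
    print('loaded {} train rows and {} test rows'.format(len(x_train), len(x_test)))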
def train(self, load_pickle=False, load_pickle_path=None):
    """Training the model"""
    start_time = time.time()

    path_list = [self.pred_path + 'multiclass/',
                 self.pred_path + 'pred_era/',
                 self.pred_path + 'pred_era/final_results/',
                 self.pred_path + 'final_results/']
    utils.check_dir(path_list)

    print('======================================================')
    print('Start Training PrejudgeMultiClass...')

    if load_pickle:
        # Load era_sign_test
        if load_pickle_path is None:
            era_sign_test = utils.load_pkl_to_data(
                self.prejudged_data_path + 'multiclass_era_sign_test.p')
        else:
            era_sign_test = utils.load_pkl_to_data(load_pickle_path)
    else:
        # Training Era Sign
        era_sign_test = self.predict_era_sign()
        # era_sign_test = self.load_era_sign_csv(self.pred_path + 'pred_era/final_results/lgb_result.csv')

        # Save era_sign_test to Pickle File
        utils.save_data_to_pkl(era_sign_test,
                               self.prejudged_data_path + 'multiclass_era_sign_test.p')

    # Get Split Data
    x_test, x_g_test, id_test, x_test_idx = self.split_test_set_by_era_sign(era_sign_test)
    x_train, x_g_train, y_train, w_train, e_train = self.split_train_set_by_era()

    # Training Models by Era Sign
    prob_test = self.train_models_by_era_sign(
        x_train, x_g_train, y_train, w_train, e_train,
        x_test, x_g_test, id_test, x_test_idx)

    # Save Predictions
    utils.save_pred_to_csv(self.pred_path + 'final_results/prejudge_',
                           self.id_test, prob_test)

    total_time = time.time() - start_time
    print('======================================================')
    print('Training Done!')
    print('Total Time: {}s'.format(total_time))
    print('======================================================')
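# Hedged usage sketch for this training entry point. The class name
# `PrejudgeMultiClass` is inferred from the log line above; constructor
# arguments are omitted because they are not shown in this file.
def _example_train_multiclass(prejudge):
    # Fresh run: predicts era signs, caches them, then trains per-era models
    prejudge.train(load_pickle=False)
    # Resume from the era-sign pickle written by the previous run
    prejudge.train(load_pickle=True)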
def random_split(x, y, n_splits=None, n_cv=None, cv_seed=None):
    train_data = utils.load_pkl_to_data('./data/preprocessed_data/x_g_train.p')
    data_mt = np.array(train_data)
    # The third column holds the station id used for stratification.
    # Note: `n_splits` is only referenced by the commented-out sanity check
    # below; the fold count actually used is `n_cv`.
    index = data_mt[:, 2]

    # station_list = index.tolist()
    # min_number = 10000
    # for i in np.unique(index):
    #     if min_number > station_list.count(i):
    #         min_number = station_list.count(i)
    # if n_splits > min_number:
    #     raise ValueError(
    #         '--The least populated station has only %d members, please input new cv_number--' % min_number)

    cv_count = 0
    skf = StratifiedKFold(n_splits=n_cv, shuffle=True, random_state=cv_seed)
    for train_index, valid_index in skf.split(index, index):
        # Training data
        x_train = x[train_index]
        y_train = y[train_index]
        # Validation data
        x_valid = x[valid_index]
        y_valid = y[valid_index]
        cv_count += 1
        utils.print_cv_info(cv_count, n_cv)
        yield x_train, y_train, x_valid, y_valid
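# Hedged usage sketch: `random_split` is a generator yielding one fold per
# iteration. It assumes `x` and `y` are indexable arrays aligned row-for-row
# with './data/preprocessed_data/x_g_train.p'; the seed value is arbitrary.
def _example_random_split(x, y):
    for fold, (x_tr, y_tr, x_va, y_va) in enumerate(random_split(x, y, n_cv=5, cv_seed=95), 1):
        print('fold {}: train {} rows, valid {} rows'.format(fold, len(x_tr), len(x_va)))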
def __init__(self, seed):
    utils.check_dir([tampered_pred_path])
    np.random.seed(seed)

    base_result = pd.read_csv(base_fake_result_path + '0.5_fake_result.csv',
                              header=0, dtype=np.float64)
    self.prob = np.array(base_result['proba'], dtype=np.float64)
    self.id_test = utils.load_pkl_to_data(preprocessed_data_path + 'id_test.p')
    self.same_idx_list = utils.load_pkl_to_data(preprocessed_data_path + 'same_test_idx_pairs.p')
    self.code_id_train, self.code_id_test = utils.load_preprocessed_code_id(preprocessed_data_path)

    same_test_df = pd.read_csv(preprocessed_data_path + 'same_test_pairs.csv',
                               header=0, dtype=np.float64)
    self.same_test_code_id = same_test_df['code_id']
    self.same_test_id = same_test_df['id']

    self.test_id_to_idx_dict = {}
    for idx, id_ in enumerate(self.id_test):
        self.test_id_to_idx_dict[id_] = idx
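# A minimal standalone sketch of the id -> row-index map built at the end of
# __init__ above (the id values here are hypothetical):
def _demo_id_to_idx_map():
    id_test = [101, 205, 307]
    test_id_to_idx = {id_: idx for idx, id_ in enumerate(id_test)}
    print(test_id_to_idx)   # {101: 0, 205: 1, 307: 2}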
def generate_tampered_results_by_weight(self, n_pairs):
    w_train = utils.load_pkl_to_data(preprocessed_data_path + 'w_train.p')

    print('------------------------------------------------------')
    print('Calculating Big Weight Same Pairs...')
    print('------------------------------------------------------')
    print('Sorting...')
    # Indices of the largest weights, descending (the slice keeps n_pairs*3 - 1 candidates)
    sorted_by_weight_idx = np.argsort(w_train)[:-n_pairs*3:-1]
    sorted_w_train = w_train[sorted_by_weight_idx]
    sorted_code_id_train = self.code_id_train[sorted_by_weight_idx]

    print('Deduplicating...')
    # Build the membership set once instead of once per loop iteration
    same_test_code_id_set = set(self.same_test_code_id)
    big_weight_code_id = []
    big_weight_w_train = []
    for idx, code_id in enumerate(sorted_code_id_train):
        if code_id not in big_weight_code_id:
            if code_id in same_test_code_id_set:
                big_weight_code_id.append(code_id)
                big_weight_w_train.append(sorted_w_train[idx])

    print('Generating Pairs...')
    idx_pair_list, w_train_col, code_id_col, id_col = \
        self.get_pair_list(n_pairs, big_weight_code_id,
                           use_weight=True, w_train_list=big_weight_w_train)

    print('------------------------------------------------------')
    print('Number of Big Weight Same Pairs: {}'.format(len(idx_pair_list)))
    utils.save_data_to_pkl(idx_pair_list, preprocessed_data_path + 'big_weight_idx_pairs.p')

    # Each pair occupies two consecutive log rows sharing the same 1-based index
    index = []
    for i in range(1, len(idx_pair_list)+1):
        index.extend([i, i])

    df_log = pd.DataFrame({'index': np.array(index, dtype=int),
                           'weight': np.array(w_train_col),
                           'code_id': np.array(code_id_col, dtype=int),
                           'id': np.array(id_col, dtype=int)})
    cols = ['index', 'weight', 'code_id', 'id']
    df_log = df_log.loc[:, cols]

    tampered_pred_path_ = tampered_pred_path + 'big_weight_tampered_log.csv'
    print('------------------------------------------------------')
    print('Saving {} ...'.format(tampered_pred_path_))
    df_log.to_csv(tampered_pred_path_, sep=',', index=False)

    # Save Same Pairs csv file
    self.save_same_pairs_test_csv(preprocessed_data_path + 'big_weight_same_pairs.csv', idx_pair_list)

    # Generate Tampered Results
    self.tamper_result(idx_pair_list, 'big_weight')
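# A minimal numeric check of the descending-slice idiom used above: argsort is
# ascending, so `[:-n_pairs*3:-1]` walks backwards from the largest value and
# stops before position -(n_pairs*3), keeping n_pairs*3 - 1 indices.
def _demo_descending_weight_slice():
    import numpy as np
    w = np.array([0.2, 0.9, 0.1, 0.7, 0.5, 0.8, 0.3])
    n_pairs = 1
    top_idx = np.argsort(w)[:-n_pairs*3:-1]
    print(top_idx)      # [1 5]
    print(w[top_idx])   # [0.9 0.8]: the 3*n_pairs - 1 largest weights, descending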
def generate_tampered_results_by_absence(self, n_pairs):
    diff_code_id_test = utils.load_pkl_to_data(preprocessed_data_path + 'diff_code_id_test.p')

    print('------------------------------------------------------')
    print('Calculating Absent Same Pairs...')
    print('------------------------------------------------------')
    print('Sorting...')
    diff_code_id_test = np.sort(diff_code_id_test)
    # Largest absent code ids first (the slice keeps n_pairs*3 - 1 candidates)
    absent_code_id = diff_code_id_test[:-n_pairs*3:-1]

    print('Generating Pairs...')
    idx_pair_list, code_id_col, id_col = self.get_pair_list(n_pairs, absent_code_id)

    print('------------------------------------------------------')
    print('Number of Absent Same Pairs: {}'.format(len(idx_pair_list)))
    utils.save_data_to_pkl(idx_pair_list, preprocessed_data_path + 'absent_idx_pairs.p')

    # Each pair occupies two consecutive log rows sharing the same 1-based index
    index = []
    for i in range(1, len(idx_pair_list)+1):
        index.extend([i, i])

    df_log = pd.DataFrame({'index': np.array(index, dtype=int),
                           'code_id': np.array(code_id_col, dtype=int),
                           'id': np.array(id_col, dtype=int)})
    cols = ['index', 'code_id', 'id']
    df_log = df_log.loc[:, cols]

    tampered_pred_path_ = tampered_pred_path + 'absent_tampered_log.csv'
    print('------------------------------------------------------')
    print('Saving {} ...'.format(tampered_pred_path_))
    df_log.to_csv(tampered_pred_path_, sep=',', index=False)

    # Save Same Pairs csv file
    self.save_same_pairs_test_csv(preprocessed_data_path + 'absent_same_pairs.csv', idx_pair_list)

    # Generate Tampered Results
    self.tamper_result(idx_pair_list, 'absent')
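# A minimal sketch of the log layout built above: every pair contributes two
# rows that share one 1-based pair index (the pairs here are hypothetical).
def _demo_pair_log_index():
    idx_pair_list = [(3, 9), (4, 12)]
    index = []
    for i in range(1, len(idx_pair_list) + 1):
        index.extend([i, i])
    print(index)   # [1, 1, 2, 2]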
def train(self, load_pickle=False, load_pickle_path=None):
    """Training the model"""
    start_time = time.time()

    path_list = [self.pred_path + 'positive/',
                 self.pred_path + 'negative/',
                 self.pred_path + 'pred_era/',
                 self.pred_path + 'pred_era/final_results/',
                 self.pred_path + 'final_results/',
                 self.loss_log_path + 'positive/',
                 self.loss_log_path + 'negative/']
    utils.check_dir(path_list)

    print('======================================================')
    print('Start Training PrejudgeBinary...')

    if load_pickle:
        # Load era_sign_test
        if load_pickle_path is None:
            era_sign_test = utils.load_pkl_to_data(
                self.prejudged_data_path + 'binary_era_sign_test.p')
        else:
            era_sign_test = utils.load_pkl_to_data(load_pickle_path)
    else:
        # Training Era Sign
        era_sign_test = self.predict_era_sign()
        # era_sign_test = self.load_era_sign_csv(self.pred_path + 'pred_era/final_results/lgb_result.csv')

        # Save era_sign_test to Pickle File
        utils.save_data_to_pkl(era_sign_test,
                               self.prejudged_data_path + 'binary_era_sign_test.p')

    # Print Prediction of Positive Era Rate
    utils.print_positive_rate_test(era_sign_test)

    # Get Split Data
    x_test_p, x_g_test_p, id_test_p, era_idx_test_p, x_test_n, \
        x_g_test_n, id_test_n, era_idx_test_n = self.split_test_set_by_era_sign(era_sign_test)

    # Training Models by Era Sign
    prob_test = self.train_models_by_era_sign(
        x_test_p, x_g_test_p, id_test_p, era_idx_test_p,
        x_test_n, x_g_test_n, id_test_n, era_idx_test_n)

    # Save Predictions
    utils.save_pred_to_csv(self.pred_path + 'final_results/prejudge_',
                           self.id_test, prob_test)

    # Print Prediction of Positive Era Rate
    utils.print_positive_rate_test(era_sign_test)

    total_time = time.time() - start_time
    print('======================================================')
    print('Training Done!')
    print('Total Time: {}s'.format(total_time))
    print('======================================================')
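# Hedged usage sketch mirroring the multiclass variant: a previously saved
# era-sign pickle can be reused via `load_pickle_path`. The `prejudge`
# instance (inferred class name: PrejudgeBinary) and the path prefix are
# assumptions; the file name is the one this method writes.
def _example_train_binary(prejudge, prejudged_data_path):
    prejudge.train(load_pickle=True,
                   load_pickle_path=prejudged_data_path + 'binary_era_sign_test.p')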
def split_data_by_gan(self, load_pickle=True, sample_ratio=None,
                      sample_by_era=True, generate_mode='valid'):
    print('======================================================')
    print('Splitting Adversarial Validation Set by GAN...')

    if load_pickle:
        similarity_prob = utils.load_pkl_to_data(cfg.gan_prob_path + 'similarity_prob.p')
    else:
        similarity_prob = \
            GenerateValidation.train(train_path=cfg.train_csv_path,
                                     test_path=cfg.test_csv_path,
                                     global_epochs=1,
                                     similarity_prob_path=cfg.gan_prob_path,
                                     return_similarity_prob=True,
                                     load_preprocessed_data=True)

    valid_idx = []
    train_idx = []

    if sample_by_era:
        # Group the similarity probabilities and their row indices by era
        similarity_prob_e = []
        index_e = []
        similarity_prob_all = []
        index_all = []
        era_tag = 0
        era_all = [era_tag]

        for idx, era in enumerate(self.e_train):
            if idx == len(self.e_train) - 1:
                similarity_prob_e.append(similarity_prob[idx])
                index_e.append(idx)
                similarity_prob_all.append(similarity_prob_e)
                index_all.append(index_e)
            elif era_tag == era:
                similarity_prob_e.append(similarity_prob[idx])
                index_e.append(idx)
            else:
                era_tag = era
                era_all.append(era)
                similarity_prob_all.append(similarity_prob_e)
                index_all.append(index_e)
                similarity_prob_e = [similarity_prob[idx]]
                index_e = [idx]

        for e, similarity_prob_e in enumerate(similarity_prob_all):
            n_sample_e = int(len(similarity_prob_e) * sample_ratio)
            index_e = np.array(index_all[e])
            # Highest similarity first; keeps the top n_sample_e rows of this era
            most_similar_idx_e = np.argsort(similarity_prob_e)[:-(n_sample_e + 1):-1]
            # Everything below the top-n_sample_e cut, ascending
            least_similar_idx_e = np.argsort(similarity_prob_e)[:len(similarity_prob_e) - n_sample_e]
            if generate_mode == 'valid':
                valid_idx += list(index_e[most_similar_idx_e])
                train_idx += list(index_e[least_similar_idx_e])
            elif generate_mode == 'train':
                train_idx += list(index_e[most_similar_idx_e])
                valid_idx += list(index_e[least_similar_idx_e])
            else:
                raise ValueError("Wrong 'generate_mode'!")
    else:
        n_sample = int(len(similarity_prob) * sample_ratio)
        most_similar_idx = np.argsort(similarity_prob)[:-(n_sample + 1):-1]
        least_similar_idx = np.argsort(similarity_prob)[:len(similarity_prob) - n_sample]
        if generate_mode == 'valid':
            # Most similar samples form the validation set
            valid_idx = most_similar_idx
            train_idx = least_similar_idx
        elif generate_mode == 'train':
            # Mirror the era-wise branch: most similar samples go to training
            train_idx = most_similar_idx
            valid_idx = least_similar_idx
        else:
            raise ValueError("Wrong 'generate_mode'!")

    # Generate Validation Set (labels come from y_train, not x_train)
    self.x_valid = self.x_train[valid_idx]
    self.y_valid = self.y_train[valid_idx]

    # Generate Training Set
    self.x_train = self.x_train[train_idx]
    self.y_train = self.y_train[train_idx]
    self.w_train = self.w_train[train_idx]
    self.e_train = self.e_train[train_idx]

    # `group_list` is expected to be defined at module/config level
    if group_list is not None:
        self.x_g_valid = self.x_g_train[valid_idx]
        self.x_g_train = self.x_g_train[train_idx]

    # Save Adversarial Validation Set
    print('Saving Adversarial Validation Set...')
    utils.save_data_to_pkl(self.x_valid, self.preprocess_path + 'x_valid.p')
    if group_list is not None:
        utils.save_data_to_pkl(self.x_g_valid, self.preprocess_path + 'x_g_valid.p')
    utils.save_data_to_pkl(self.y_valid, self.preprocess_path + 'y_valid.p')
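# A minimal numeric sketch of the similarity split used above (non-era
# branch): in 'valid' mode the top `sample_ratio` fraction by similarity goes
# to validation and the rest to training. The probabilities are hypothetical.
def _demo_similarity_split():
    import numpy as np
    similarity_prob = np.array([0.1, 0.9, 0.4, 0.8, 0.2])
    n_sample = int(len(similarity_prob) * 0.4)   # 2 samples
    most_similar_idx = np.argsort(similarity_prob)[:-(n_sample + 1):-1]
    least_similar_idx = np.argsort(similarity_prob)[:len(similarity_prob) - n_sample]
    print(most_similar_idx)    # [1 3] -> probs 0.9, 0.8
    print(least_similar_idx)   # [0 4 2] -> probs 0.1, 0.2, 0.4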