Example #1
    def load_data_from_pickle(self):

        print('Loading Preprocessed Data...')

        x_train = utils.load_pkl_to_data(self.preprocessed_path + 'x_train_gan.p')
        x_test = utils.load_pkl_to_data(self.preprocessed_path + 'x_test_gan.p')

        return x_train, x_test
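
`utils.load_pkl_to_data` itself is not shown on this page; assuming it is a thin wrapper around the standard `pickle` module (a sketch, not the repository's actual implementation), it would look roughly like this:

    import pickle

    def load_pkl_to_data(pkl_path):
        # Deserialize a Python object from a pickle file
        with open(pkl_path, 'rb') as f:
            return pickle.load(f)

    def save_data_to_pkl(data, pkl_path):
        # Serialize a Python object to a pickle file
        with open(pkl_path, 'wb') as f:
            pickle.dump(data, f)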
Example #2
    def train(self, load_pickle=False, load_pickle_path=None):
        """
            Training the model
        """
        start_time = time.time()

        path_list = [
            self.pred_path + 'multiclass/', self.pred_path + 'pred_era/',
            self.pred_path + 'pred_era/final_results/',
            self.pred_path + 'final_results/'
        ]
        utils.check_dir(path_list)

        print('======================================================')
        print('Start Training PrejudgeMultiClass...')

        if load_pickle:

            # Load era_sign_test
            if load_pickle_path is None:
                era_sign_test = utils.load_pkl_to_data(
                    self.prejudged_data_path + 'multiclass_era_sign_test.p')
            else:
                era_sign_test = utils.load_pkl_to_data(load_pickle_path)

        else:

            # Training Era Sign
            era_sign_test = self.predict_era_sign()

            # era_sign_test = self.load_era_sign_csv(self.pred_path + 'pred_era/final_results/lgb_result.csv')

            # Save era_sign_test to Pickle File
            utils.save_data_to_pkl(
                era_sign_test,
                self.prejudged_data_path + 'multiclass_era_sign_test.p')

        # Get Split Data
        x_test, x_g_test, id_test, x_test_idx = self.split_test_set_by_era_sign(
            era_sign_test)
        x_train, x_g_train, y_train, w_train, e_train = \
            self.split_train_set_by_era()

        # Training Models by Era Sign
        prob_test = \
            self.train_models_by_era_sign(x_train, x_g_train, y_train, w_train, e_train,
                                          x_test, x_g_test, id_test, x_test_idx)

        # Save Predictions
        utils.save_pred_to_csv(self.pred_path + 'final_results/prejudge_',
                               self.id_test, prob_test)

        total_time = time.time() - start_time
        print('======================================================')
        print('Training Done!')
        print('Total Time: {}s'.format(total_time))
        print('======================================================')
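
`utils.check_dir` is used above to make sure the output directories exist before any file is written. A minimal sketch, assuming it simply creates missing directories (the actual helper may differ):

    import os

    def check_dir(path_list):
        # Create each directory in the list if it does not already exist
        for path in path_list:
            os.makedirs(path, exist_ok=True)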
Example #3
def random_split(x, y, n_splits=None, n_cv=None, cv_seed=None):
    # Stratify on the station column (column 2) of the grouped training data
    train_data = utils.load_pkl_to_data(
        './data/preprocessed_data/x_g_train.p')
    data_mt = np.array(train_data)
    index = data_mt[:, 2]
    # station_list = index.tolist()
    # min_number = 10000
    # for i in np.unique(index):
    #     if min_number > station_list.count(i):
    #         min_number = station_list.count(i)
    # if n_splits > min_number:
    #     raise ValueError(
    #         'The least populated station has only %d members, '
    #         'please input a new cv_number' % min_number)
    cv_count = 0
    # NOTE: the split uses n_cv; n_splits is only referenced by the
    # commented-out population check above
    skf = StratifiedKFold(n_splits=n_cv,
                          shuffle=True,
                          random_state=cv_seed)
    for train_index, valid_index in skf.split(index, index):
        # Training data
        x_train = x[train_index]
        y_train = y[train_index]
        # Validation data
        x_valid = x[valid_index]
        y_valid = y[valid_index]
        cv_count += 1
        utils.print_cv_info(cv_count, n_cv)
        yield x_train, y_train, x_valid, y_valid
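
Note that the generator stratifies on the station column rather than on the labels. The same cross-validation pattern, shown self-contained on synthetic data (all names here are illustrative):

    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    np.random.seed(0)
    x = np.random.rand(100, 5)                  # 100 samples, 5 features
    strata = np.random.randint(0, 4, size=100)  # e.g. a station-id column

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(x, strata), 1):
        x_train, x_valid = x[train_idx], x[valid_idx]
        print('Fold {}: {} train / {} valid'.format(
            fold, len(train_idx), len(valid_idx)))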
Example #4
    def __init__(self, seed):

        utils.check_dir([tampered_pred_path])
        np.random.seed(seed)

        base_result = pd.read_csv(base_fake_result_path + '0.5_fake_result.csv', header=0, dtype=np.float64)
        self.prob = np.array(base_result['proba'], dtype=np.float64)
        self.id_test = utils.load_pkl_to_data(preprocessed_data_path + 'id_test.p')
        self.same_idx_list = utils.load_pkl_to_data(preprocessed_data_path + 'same_test_idx_pairs.p')
        self.code_id_train, self.code_id_test = utils.load_preprocessed_code_id(preprocessed_data_path)
        same_test_df = pd.read_csv(preprocessed_data_path + 'same_test_pairs.csv', header=0, dtype=np.float64)
        self.same_test_code_id = same_test_df['code_id']
        self.same_test_id = same_test_df['id']

        self.test_id_to_idx_dict = {}
        for idx, id_ in enumerate(self.id_test):
            self.test_id_to_idx_dict[id_] = idx
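
The trailing loop can be written as a dictionary comprehension with identical behavior:

    self.test_id_to_idx_dict = {id_: idx for idx, id_ in enumerate(self.id_test)}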
Example #5
    def generate_tampered_results_by_weight(self, n_pairs):

        w_train = utils.load_pkl_to_data(preprocessed_data_path + 'w_train.p')

        print('------------------------------------------------------')
        print('Calculating Big Weight Same Pairs...')
        print('------------------------------------------------------')
        print('Sorting...')
        sorted_by_weight_idx = np.argsort(w_train)[:-n_pairs*3:-1]
        sorted_w_train = w_train[sorted_by_weight_idx]
        sorted_code_id_train = self.code_id_train[sorted_by_weight_idx]

        print('Deduplicating...')
        # Build the membership set once instead of on every loop iteration
        same_test_code_id_set = set(self.same_test_code_id)
        big_weight_code_id = []
        big_weight_w_train = []
        for idx, code_id in enumerate(sorted_code_id_train):
            if code_id not in big_weight_code_id:
                if code_id in same_test_code_id_set:
                    big_weight_code_id.append(code_id)
                    big_weight_w_train.append(sorted_w_train[idx])

        print('Generating Pairs...')
        idx_pair_list, w_train_col, code_id_col, id_col = \
            self.get_pair_list(n_pairs, big_weight_code_id, use_weight=True, w_train_list=big_weight_w_train)

        print('------------------------------------------------------')
        print('Number of Big Weight Same Pairs: {}'.format(len(idx_pair_list)))
        utils.save_data_to_pkl(idx_pair_list, preprocessed_data_path + 'big_weight_idx_pairs.p')

        index = []
        for i in range(1, len(idx_pair_list)+1):
            index.extend([i, i])
        df_log = pd.DataFrame({'index': np.array(index, dtype=int),
                               'weight': np.array(w_train_col),
                               'code_id': np.array(code_id_col, dtype=int),
                               'id': np.array(id_col, dtype=int)})
        cols = ['index', 'weight', 'code_id', 'id']
        df_log = df_log.loc[:, cols]
        tampered_pred_path_ = tampered_pred_path + 'big_weight_tampered_log.csv'

        print('------------------------------------------------------')
        print('Saving {} ...'.format(tampered_pred_path_))
        df_log.to_csv(tampered_pred_path_, sep=',', index=False)

        # Save Same Pairs csv file
        self.save_same_pairs_test_csv(preprocessed_data_path + 'big_weight_same_pairs.csv', idx_pair_list)

        # Generate Tampered Results
        self.tamper_result(idx_pair_list, 'big_weight')
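
The slice `np.argsort(w_train)[:-n_pairs*3:-1]` keeps the indices of the `n_pairs*3 - 1` largest weights in descending order, leaving a surplus of candidates for the deduplication step. A small self-contained illustration of the pattern:

    import numpy as np

    w = np.array([0.2, 0.9, 0.1, 0.7, 0.4])
    k = 2
    top_desc = np.argsort(w)[:-(k + 1):-1]  # indices of the k largest values, descending
    print(top_desc)     # [1 3]
    print(w[top_desc])  # [0.9 0.7]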
Example #6
    def generate_tampered_results_by_absence(self, n_pairs):

        diff_code_id_test = utils.load_pkl_to_data(preprocessed_data_path + 'diff_code_id_test.p')

        print('------------------------------------------------------')
        print('Calculating Absent Same Pairs...')
        print('------------------------------------------------------')
        print('Sorting...')
        diff_code_id_test = np.sort(diff_code_id_test)
        absent_code_id = diff_code_id_test[:-n_pairs*3:-1]

        print('Generating Pairs...')
        idx_pair_list, code_id_col, id_col = self.get_pair_list(n_pairs, absent_code_id)

        print('------------------------------------------------------')
        print('Number of Absent Same Pairs: {}'.format(len(idx_pair_list)))
        utils.save_data_to_pkl(idx_pair_list, preprocessed_data_path + 'absent_idx_pairs.p')

        index = []
        for i in range(1, len(idx_pair_list)+1):
            index.extend([i, i])
        df_log = pd.DataFrame({'index': np.array(index, dtype=int),
                               'code_id': np.array(code_id_col, dtype=int),
                               'id': np.array(id_col, dtype=int)})
        cols = ['index', 'code_id', 'id']
        df_log = df_log.loc[:, cols]
        tampered_pred_path_ = tampered_pred_path + 'absent_tampered_log.csv'

        print('------------------------------------------------------')
        print('Saving {} ...'.format(tampered_pred_path_))
        df_log.to_csv(tampered_pred_path_, sep=',', index=False)

        # Save Same Pairs csv file
        self.save_same_pairs_test_csv(preprocessed_data_path + 'absent_same_pairs.csv', idx_pair_list)

        # Generate Tampered Results
        self.tamper_result(idx_pair_list, 'absent')
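
The `df_log.loc[:, cols]` step only reorders columns, since dict-based construction did not guarantee column order on older Python/pandas versions. Passing `columns=` to the constructor is an equivalent one-step alternative (illustrative values only):

    import numpy as np
    import pandas as pd

    df_log = pd.DataFrame({'code_id': np.array([7, 7], dtype=int),
                           'id': np.array([101, 102], dtype=int),
                           'index': np.array([1, 1], dtype=int)},
                          columns=['index', 'code_id', 'id'])
    print(df_log.columns.tolist())  # ['index', 'code_id', 'id']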
Example #7
    def train(self, load_pickle=False, load_pickle_path=None):
        """
            Training the model
        """
        start_time = time.time()

        path_list = [
            self.pred_path + 'positive/', self.pred_path + 'negative/',
            self.pred_path + 'pred_era/',
            self.pred_path + 'pred_era/final_results/',
            self.pred_path + 'final_results/',
            self.loss_log_path + 'positive/', self.loss_log_path + 'negative/'
        ]
        utils.check_dir(path_list)

        print('======================================================')
        print('Start Training PrejudgeBinary...')

        if load_pickle:

            # Load era_sign_test
            if load_pickle_path is None:
                era_sign_test = utils.load_pkl_to_data(
                    self.prejudged_data_path + 'binary_era_sign_test.p')
            else:
                era_sign_test = utils.load_pkl_to_data(load_pickle_path)

        else:

            # Training Era Sign
            era_sign_test = self.predict_era_sign()

            # era_sign_test = self.load_era_sign_csv(self.pred_path + 'pred_era/final_results/lgb_result.csv')

            # Save era_sign_test to Pickle File
            utils.save_data_to_pkl(
                era_sign_test,
                self.prejudged_data_path + 'binary_era_sign_test.p')

        # Print Prediction of Positive Era Rate
        utils.print_positive_rate_test(era_sign_test)

        # Get Split Data
        x_test_p, x_g_test_p, id_test_p, era_idx_test_p, x_test_n, \
            x_g_test_n, id_test_n, era_idx_test_n = self.split_test_set_by_era_sign(era_sign_test)

        # Training Models by Era Sign
        prob_test = \
            self.train_models_by_era_sign(x_test_p, x_g_test_p, id_test_p, era_idx_test_p,
                                          x_test_n, x_g_test_n, id_test_n, era_idx_test_n)

        # Save Predictions
        utils.save_pred_to_csv(self.pred_path + 'final_results/prejudge_',
                               self.id_test, prob_test)

        # Print Prediction of Positive Era Rate
        utils.print_positive_rate_test(era_sign_test)

        total_time = time.time() - start_time
        print('======================================================')
        print('Training Done!')
        print('Total Time: {}s'.format(total_time))
        print('======================================================')
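
`utils.print_positive_rate_test` is not shown here; assuming `era_sign_test` is an array of 0/1 era signs, a hypothetical sketch of such a helper:

    import numpy as np

    def print_positive_rate_test(era_sign_test):
        # Report the share of test samples prejudged as positive-era
        positive_rate = np.mean(np.asarray(era_sign_test) == 1)
        print('Positive Era Rate of Test Set: {:.2f}%'.format(positive_rate * 100))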
Example #8
    def split_data_by_gan(self,
                          load_pickle=True,
                          sample_ratio=None,
                          sample_by_era=True,
                          generate_mode='valid'):

        print('======================================================')
        print('Splitting Adversarial Validation Set by GAN...')

        if load_pickle:
            similarity_prob = utils.load_pkl_to_data(cfg.gan_prob_path +
                                                     'similarity_prob.p')
        else:
            similarity_prob = \
                GenerateValidation.train(train_path=cfg.train_csv_path, test_path=cfg.test_csv_path, global_epochs=1,
                                         similarity_prob_path=cfg.gan_prob_path, return_similarity_prob=True,
                                         load_preprocessed_data=True)

        valid_idx = []
        train_idx = []

        if sample_by_era:

            similarity_prob_e = []
            index_e = []
            similarity_prob_all = []
            index_all = []
            era_tag = 0
            era_all = [era_tag]

            for idx, era in enumerate(self.e_train):

                if idx == len(self.e_train) - 1:
                    similarity_prob_e.append(similarity_prob[idx])
                    index_e.append(idx)
                    similarity_prob_all.append(similarity_prob_e)
                    index_all.append(index_e)
                elif era_tag == era:
                    similarity_prob_e.append(similarity_prob[idx])
                    index_e.append(idx)
                else:
                    era_tag = era
                    era_all.append(era)
                    similarity_prob_all.append(similarity_prob_e)
                    index_all.append(index_e)
                    similarity_prob_e = [similarity_prob[idx]]
                    index_e = [idx]

            for e, similarity_prob_e in enumerate(similarity_prob_all):

                n_sample_e = int(len(similarity_prob_e) * sample_ratio)
                # Sort once per era; slice descending for the most similar
                # samples and ascending for the least similar ones
                sorted_idx_e = np.argsort(similarity_prob_e)
                most_similar_idx_e = sorted_idx_e[:-(n_sample_e + 1):-1]
                least_similar_idx_e = sorted_idx_e[:len(similarity_prob_e) - n_sample_e]

                # index_all[e] is a plain list, so convert it before fancy indexing
                index_e_arr = np.array(index_all[e])
                if generate_mode == 'valid':
                    valid_idx += list(index_e_arr[most_similar_idx_e])
                    train_idx += list(index_e_arr[least_similar_idx_e])
                elif generate_mode == 'train':
                    train_idx += list(index_e_arr[most_similar_idx_e])
                    valid_idx += list(index_e_arr[least_similar_idx_e])
                else:
                    raise ValueError("Wrong 'generate_mode'!")
        else:

            n_sample = int(len(similarity_prob) * sample_ratio)
            sorted_idx = np.argsort(similarity_prob)
            most_similar_idx = sorted_idx[:-(n_sample + 1):-1]
            least_similar_idx = sorted_idx[:len(similarity_prob) - n_sample]

            if generate_mode == 'valid':
                valid_idx = most_similar_idx
                train_idx = least_similar_idx
            elif generate_mode == 'train':
                train_idx = least_similar_idx
                valid_idx = most_similar_idx
            else:
                raise ValueError("Wrong 'generate_mode'!")

        # Generate Validation Set
        self.x_valid = self.x_train[valid_idx]
        self.y_valid = self.y_train[valid_idx]

        # Generate Training Set
        self.x_train = self.x_train[train_idx]
        self.y_train = self.y_train[train_idx]
        self.w_train = self.w_train[train_idx]
        self.e_train = self.e_train[train_idx]

        # NOTE: 'group_list' is assumed to be defined at module/config level
        if group_list is not None:
            self.x_g_valid = self.x_g_train[valid_idx]
            self.x_g_train = self.x_g_train[train_idx]

        # Save Adversarial Validation Set
        print('Saving Adversarial Validation Set...')
        utils.save_data_to_pkl(self.x_valid,
                               self.preprocess_path + 'x_valid.p')
        utils.save_data_to_pkl(self.x_g_valid,
                               self.preprocess_path + 'x_g_valid.p')
        utils.save_data_to_pkl(self.y_valid,
                               self.preprocess_path + 'y_valid.p')
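
Since the era-grouping loop above relies on `self.e_train` being contiguous, the same `index_all` can be built more directly with `np.unique` and `np.where` (illustrative values only):

    import numpy as np

    e_train = np.array([0, 0, 1, 1, 1, 2, 2])
    index_all = [np.where(e_train == era)[0] for era in np.unique(e_train)]
    # index_all -> [array([0, 1]), array([2, 3, 4]), array([5, 6])]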