Code example #1
0
File: tsne.py  Project: leiseraiesecqd/acgc
    def train(self, parameters):
        """Fit t-SNE on the training features and pickle the embedding.

        Args:
            parameters: dict of keyword arguments forwarded to TSNE.
        """
        embedding = TSNE(**parameters).fit_transform(self.x_train)
        utils.save_data_to_pkl(
            embedding, tsne_outputs_path + 'tsne_outputs.p')
Code example #2
0
File: prejudge.py  Project: LeanderLXZ/stock
    def train(self, load_pickle=False, load_pickle_path=None):
        """Train the PrejudgeMultiClass pipeline end-to-end.

        Predicts (or loads) an era sign for every test row, splits train
        and test data by era, trains one model per era, and writes the
        merged test predictions to csv.

        Args:
            load_pickle: when True, load a cached era_sign_test pickle
                instead of predicting era signs from scratch.
            load_pickle_path: explicit pickle path; when None the default
                under prejudged_data_path is used.
        """
        start_time = time.time()

        # Make sure every output directory exists before training starts.
        path_list = [
            self.pred_path + 'multiclass/', self.pred_path + 'pred_era/',
            self.pred_path + 'pred_era/final_results/',
            self.pred_path + 'final_results/'
        ]
        utils.check_dir(path_list)

        print('======================================================')
        print('Start Training PrejudgeMultiClass...')

        if load_pickle:

            # Load era_sign_test
            if load_pickle_path is None:
                era_sign_test = utils.load_pkl_to_data(
                    self.prejudged_data_path + 'multiclass_era_sign_test.p')
            else:
                era_sign_test = utils.load_pkl_to_data(load_pickle_path)

        else:

            # Training Era Sign
            era_sign_test = self.predict_era_sign()

            # era_sign_test = self.load_era_sign_csv(self.pred_path + 'pred_era/final_results/lgb_result.csv')

            # Save era_sign_test to Pickle File (cache for later runs)
            utils.save_data_to_pkl(
                era_sign_test,
                self.prejudged_data_path + 'multiclass_era_sign_test.p')

        # Get Split Data: per-era test arrays (plus the original row
        # indices) and per-era training arrays.
        x_test, x_g_test, id_test, x_test_idx = self.split_test_set_by_era_sign(
            era_sign_test)
        x_train, x_g_train, y_train, w_train, e_train = self.split_train_set_by_era(
        )

        # Training Models by Era Sign
        prob_test = \
            self.train_models_by_era_sign(x_train, x_g_train, y_train, w_train, e_train,
                                          x_test, x_g_test, id_test, x_test_idx)

        # Save Predictions
        utils.save_pred_to_csv(self.pred_path + 'final_results/prejudge_',
                               self.id_test, prob_test)

        total_time = time.time() - start_time
        print('======================================================')
        print('Training Done!')
        print('Total Time: {}s'.format(total_time))
        print('======================================================')
        print('======================================================')
Code example #3
0
    def generate_tampered_results_by_weight(self, n_pairs):
        """Build tampered results from the biggest-weight duplicated code IDs.

        Selects the training rows with the largest sample weights, keeps the
        first occurrence of each code_id that also appears among duplicated
        test pairs, generates n_pairs pairs from them, and writes the log
        csv, the same-pairs csv and the tampered predictions.

        Args:
            n_pairs: number of pairs to generate (3x candidates are scanned).
        """
        w_train = utils.load_pkl_to_data(preprocessed_data_path + 'w_train.p')

        print('------------------------------------------------------')
        print('Calculating Big Weight Same Pairs...')
        print('------------------------------------------------------')
        print('Sorting...')
        # Indices of the largest weights, descending (3*n_pairs - 1 of them).
        sorted_by_weight_idx = np.argsort(w_train)[:-n_pairs*3:-1]
        sorted_w_train = w_train[sorted_by_weight_idx]
        sorted_code_id_train = self.code_id_train[sorted_by_weight_idx]

        print('Deduplicating...')
        # Fix: build the membership set once — the original rebuilt
        # set(self.same_test_code_id) on every loop iteration, and used an
        # O(k) list scan for the dedup check.
        same_test_code_id_set = set(self.same_test_code_id)
        seen = set()  # mirrors big_weight_code_id for O(1) dedup checks
        big_weight_code_id = []
        big_weight_w_train = []
        for idx, code_id in enumerate(sorted_code_id_train):
            if code_id not in seen and code_id in same_test_code_id_set:
                seen.add(code_id)
                big_weight_code_id.append(code_id)
                big_weight_w_train.append(sorted_w_train[idx])

        print('Generating Pairs...')
        idx_pair_list, w_train_col, code_id_col, id_col = \
            self.get_pair_list(n_pairs, big_weight_code_id, use_weight=True, w_train_list=big_weight_w_train)

        print('------------------------------------------------------')
        print('Number of Big Weight Same Pairs: {}'.format(len(idx_pair_list)))
        utils.save_data_to_pkl(idx_pair_list, preprocessed_data_path + 'big_weight_idx_pairs.p')

        # Two log rows per pair share a single 1-based pair index.
        index = []
        for i in range(1, len(idx_pair_list)+1):
            index.extend([i, i])
        df_log = pd.DataFrame({'index': np.array(index, dtype=int),
                               'weight': np.array(w_train_col),
                               'code_id': np.array(code_id_col, dtype=int),
                               'id': np.array(id_col, dtype=int)})
        cols = ['index', 'weight', 'code_id', 'id']
        df_log = df_log.loc[:, cols]
        tampered_pred_path_ = tampered_pred_path + 'big_weight_tampered_log.csv'

        print('------------------------------------------------------')
        print('Saving {} ...'.format(tampered_pred_path_))
        df_log.to_csv(tampered_pred_path_, sep=',', index=False)

        # Save Same Pairs csv file
        self.save_same_pairs_test_csv(preprocessed_data_path + 'big_weight_same_pairs.csv', idx_pair_list)

        # Generate Tampered Results
        self.tamper_result(idx_pair_list, 'big_weight')
Code example #4
0
    def main(self):
        """Locate test rows with identical IDs per code_id group and save
        the resulting pairs as csv and pickle."""
        print('Split Data Set by Code ID...')
        print('------------------------------------------------------')

        x_test_list, id_test_list, code_id_list, test_idx_list = \
            self.split_data_by_code_id()

        same_id_list = []
        same_idx_list = []

        print('Searching Same ID Pairs...')
        print('------------------------------------------------------')

        # Scan each code_id chunk independently for duplicated IDs.
        for pos in tqdm.trange(len(id_test_list)):
            ids_chunk, idx_chunk = self.get_same_id_list(
                x_test_list[pos], id_test_list[pos], test_idx_list[pos])
            same_id_list += ids_chunk
            same_idx_list += idx_chunk

        print('------------------------------------------------------')
        print('Same Code Pairs: {}'.format(len(same_idx_list)))
        print('------------------------------------------------------')
        print('Saving same_test_pairs.csv...')

        # Flatten the pair index lists into one row-index list.
        same_idx = np.concatenate(np.array(same_idx_list)).tolist()
        test_f = pd.read_csv(test_path, header=0, dtype=np.float64)
        cols = ['code_id',
                *['feature{}'.format(i) for i in range(97)],
                'group1', 'group2', 'id']
        df = test_f.iloc[same_idx].loc[:, cols]
        df.to_csv(preprocess_path + 'same_test_pairs.csv',
                  sep=',',
                  index=False)

        print('------------------------------------------------------')
        print('Saving same_test_idx_pairs.p...')
        utils.save_data_to_pkl(same_idx_list,
                               preprocess_path + 'same_test_idx_pairs.p')
Code example #5
0
    def search_diff_code_id(self):
        """Find code IDs present in the test set but absent from training.

        Saves the resulting id array both as a pickle and as a one-column
        csv under preprocess_path.
        """
        print('Searching Different Code ID of Test Set...')
        print('------------------------------------------------------')

        # Set difference is O(n + m); the original per-element `in` scan
        # over the training array was O(n * m). The original also called
        # `.reshape(-1, 1)` and discarded the result — a no-op, removed.
        diff_code_id_test = np.array(
            list(set(self.code_id_test) - set(self.code_id_train)),
            dtype=int)

        print('Number of diff_code_id_test: ', diff_code_id_test.shape[0])
        utils.save_data_to_pkl(diff_code_id_test,
                               preprocess_path + 'diff_code_id_test.p')
        print('Saving {} ...'.format(preprocess_path +
                                     'diff_code_id_test.csv'))
        np.savetxt(preprocess_path + 'diff_code_id_test.csv',
                   diff_code_id_test,
                   delimiter=',',
                   fmt='%d')
Code example #6
0
    def generate_tampered_results_by_absence(self, n_pairs):
        """Build tampered results from code IDs absent from the training set."""
        diff_code_id_test = utils.load_pkl_to_data(
            preprocessed_data_path + 'diff_code_id_test.p')

        print('------------------------------------------------------')
        print('Calculating Absent Same Pairs...')
        print('------------------------------------------------------')
        print('Sorting...')
        # Largest absent code IDs first; keep 3*n_pairs - 1 candidates.
        absent_code_id = np.sort(diff_code_id_test)[:-n_pairs*3:-1]

        print('Generating Pairs...')
        idx_pair_list, code_id_col, id_col = \
            self.get_pair_list(n_pairs, absent_code_id)

        print('------------------------------------------------------')
        print('Number of Absent Same Pairs: {}'.format(len(idx_pair_list)))
        utils.save_data_to_pkl(idx_pair_list,
                               preprocessed_data_path + 'absent_idx_pairs.p')

        # Two log rows per pair share a single 1-based pair index.
        pair_index = [i for i in range(1, len(idx_pair_list) + 1)
                      for _ in (0, 1)]
        df_log = pd.DataFrame({'index': np.array(pair_index, dtype=int),
                               'code_id': np.array(code_id_col, dtype=int),
                               'id': np.array(id_col, dtype=int)})
        df_log = df_log.loc[:, ['index', 'code_id', 'id']]
        tampered_pred_path_ = tampered_pred_path + 'absent_tampered_log.csv'

        print('------------------------------------------------------')
        print('Saving {} ...'.format(tampered_pred_path_))
        df_log.to_csv(tampered_pred_path_, sep=',', index=False)

        # Save Same Pairs csv file
        self.save_same_pairs_test_csv(
            preprocessed_data_path + 'absent_same_pairs.csv', idx_pair_list)

        # Generate Tampered Results
        self.tamper_result(idx_pair_list, 'absent')
Code example #7
0
    def generate_tampered_results_by_range(self, start_code_id, n_pairs, reverse=False):
        """Build tampered results from a contiguous range of code IDs."""
        print('------------------------------------------------------')
        print('Generating All Tampered Results...')
        # Three candidate code IDs are scanned per requested pair.
        stop_code_id = start_code_id + (n_pairs * 3)
        range_code_id = range(start_code_id, stop_code_id)
        if reverse:
            range_code_id = range_code_id[::-1]

        print('Generating Pairs...')
        idx_pair_list, code_id_col, id_col = \
            self.get_pair_list(n_pairs, range_code_id)

        print('------------------------------------------------------')
        print('Number of Range Same Pairs: {}'.format(len(idx_pair_list)))
        pickle_path = preprocessed_data_path + 'range-{}-{}_idx_pairs.p'.format(start_code_id, n_pairs)
        utils.save_data_to_pkl(idx_pair_list, pickle_path)

        # Two log rows per pair share a single 1-based pair index.
        pair_index = [i for i in range(1, len(idx_pair_list) + 1)
                      for _ in (0, 1)]
        df_log = pd.DataFrame({'index': np.array(pair_index, dtype=int),
                               'code_id': np.array(code_id_col, dtype=int),
                               'id': np.array(id_col, dtype=int)})
        df_log = df_log.loc[:, ['index', 'code_id', 'id']]
        tampered_pred_path_ = tampered_pred_path + 'range-{}-{}_tampered_log.csv'.format(start_code_id, n_pairs)

        print('------------------------------------------------------')
        print('Saving {} ...'.format(tampered_pred_path_))
        df_log.to_csv(tampered_pred_path_, sep=',', index=False)

        # Save Same Pairs csv file
        test_csv_path = preprocessed_data_path + 'range-{}-{}_same_pairs.csv'.format(start_code_id, n_pairs)
        self.save_same_pairs_test_csv(test_csv_path, idx_pair_list)

        # Generate Tampered Results
        self.tamper_result(idx_pair_list, 'range-{}-{}'.format(start_code_id, n_pairs))
Code example #8
0
File: preprocess.py  Project: leiseraiesecqd/ccf
    def save_global_valid_set(self):
        """Pickle the global validation split (x, x_g and y arrays)."""
        print('======================================================')
        print('Saving Validation Set...')
        for data, file_name in ((self.x_valid, 'x_global_valid.p'),
                                (self.x_g_valid, 'x_g_global_valid.p'),
                                (self.y_valid, 'y_global_valid.p')):
            utils.save_data_to_pkl(data, self.preprocess_path + file_name)
Code example #9
0
File: prejudge.py  Project: LeanderLXZ/stock
    def train_models_by_era_sign(self, x_train, x_g_train, y_train, w_train,
                                 e_train, x_test, x_g_test, id_test,
                                 x_test_idx):
        """Train one multiclass model per era and merge its test predictions.

        Args:
            x_train..e_train: per-era lists of training arrays (indexed by
                era number).
            x_test..x_test_idx: per-era lists of test arrays, plus the
                original test-row indices that map each era's predictions
                back onto the full test set.

        Returns:
            Per-row mean of the collected test probabilities.
        """
        print('======================================================')
        print('Training Models by Era Sign...')

        # One list slot per test row; 0. marks "no prediction yet".
        prob_test = np.zeros_like(self.id_test,
                                  dtype=np.float64).reshape(-1, 1).tolist()

        for model_iter in range(self.n_era):

            print('======================================================')
            print('Training Models of Era: {}/{}'.format(
                model_iter + 1, self.n_era))

            # Materialize this era's slices as arrays.
            x_train_era = np.array(x_train[model_iter])
            x_g_train_era = np.array(x_g_train[model_iter])
            y_train_era = np.array(y_train[model_iter])
            w_train_era = np.array(w_train[model_iter])
            e_train_era = np.array(e_train[model_iter])
            x_test_era = np.array(x_test[model_iter])
            x_g_test_era = np.array(x_g_test[model_iter])
            id_test_era = np.array(id_test[model_iter])
            x_test_idx_era = np.array(x_test_idx[model_iter])

            print('------------------------------------------------------')
            print('Initializing Model...')
            model = self.multiclass_model_initializer(
                x_train_era, x_g_train_era, y_train_era, w_train_era,
                e_train_era, x_test_era, x_g_test_era, id_test_era)

            cv_generator = CrossValidation.random_split
            prob_test_era = model.train(self.pred_path + 'multiclass/',
                                        self.loss_log_path + 'multiclass/',
                                        csv_log_path=self.csv_log_path,
                                        n_valid=self.n_valid_m,
                                        n_cv=self.n_cv_m,
                                        train_seed=self.train_seed,
                                        cv_seed=self.cv_seed,
                                        parameters=self.parameters_m,
                                        return_prob_test=True,
                                        show_importance=self.show_importance,
                                        show_accuracy=self.show_accuracy,
                                        save_csv_log=True,
                                        csv_idx='era_{}'.format(model_iter +
                                                                1),
                                        cv_generator=cv_generator)

            # NOTE(review): this pickles the aggregate `prob_test` (still
            # only partially filled) under a per-era filename, and does so
            # BEFORE this era's results are merged in below —
            # `prob_test_era` may have been intended; confirm.
            utils.save_data_to_pkl(
                prob_test, self.prejudged_data_path +
                'multi_prob_test_era_{}.p'.format(model_iter + 1))

            # Replace the 0. placeholder on the first write for a row;
            # append on later writes so overlapping eras are averaged below.
            for idx_era, prob_era in zip(x_test_idx_era, prob_test_era):
                if prob_test[idx_era][0] == 0.:
                    prob_test[idx_era][0] = prob_era
                else:
                    prob_test[idx_era].append(prob_era)

        utils.save_data_to_pkl(prob_test,
                               self.prejudged_data_path + 'multi_prob_test.p')

        # Calculate Mean of prob_test
        # NOTE(review): np.mean over axis=1 assumes every row collected the
        # same number of predictions — confirm eras cover rows uniformly.
        prob_test = np.mean(prob_test, axis=1, dtype=np.float64)

        return prob_test
Code example #10
0
File: prejudge.py  Project: LeanderLXZ/stock
    def train(self, load_pickle=False, load_pickle_path=None):
        """Train the PrejudgeBinary pipeline end-to-end.

        Predicts (or loads) a binary era sign for every test row, splits
        the test set into positive/negative era groups, trains models per
        group, and writes the merged predictions to csv.

        Args:
            load_pickle: when True, load a cached era_sign_test pickle
                instead of predicting era signs from scratch.
            load_pickle_path: explicit pickle path; when None the default
                under prejudged_data_path is used.
        """
        start_time = time.time()

        # Make sure every output directory exists before training starts.
        path_list = [
            self.pred_path + 'positive/', self.pred_path + 'negative/',
            self.pred_path + 'pred_era/',
            self.pred_path + 'pred_era/final_results/',
            self.pred_path + 'final_results/',
            self.loss_log_path + 'positive/', self.loss_log_path + 'negative/'
        ]
        utils.check_dir(path_list)

        print('======================================================')
        print('Start Training PrejudgeBinary...')

        if load_pickle:

            # Load era_sign_test
            if load_pickle_path is None:
                era_sign_test = utils.load_pkl_to_data(
                    self.prejudged_data_path + 'binary_era_sign_test.p')
            else:
                era_sign_test = utils.load_pkl_to_data(load_pickle_path)

        else:

            # Training Era Sign
            era_sign_test = self.predict_era_sign()

            # era_sign_test = self.load_era_sign_csv(self.pred_path + 'pred_era/final_results/lgb_result.csv')

            # Save era_sign_test to Pickle File (cache for later runs)
            utils.save_data_to_pkl(
                era_sign_test,
                self.prejudged_data_path + 'binary_era_sign_test.p')

        # Print Prediction of Positive Era Rate
        utils.print_positive_rate_test(era_sign_test)

        # Get Split Data: positive-era and negative-era test partitions.
        x_test_p, x_g_test_p, id_test_p, era_idx_test_p, x_test_n, \
            x_g_test_n, id_test_n, era_idx_test_n = self.split_test_set_by_era_sign(era_sign_test)

        # Training Models by Era Sign
        prob_test = \
            self.train_models_by_era_sign(x_test_p, x_g_test_p, id_test_p, era_idx_test_p,
                                          x_test_n, x_g_test_n, id_test_n, era_idx_test_n)

        # Save Predictions
        utils.save_pred_to_csv(self.pred_path + 'final_results/prejudge_',
                               self.id_test, prob_test)

        # Print Prediction of Positive Era Rate
        utils.print_positive_rate_test(era_sign_test)

        total_time = time.time() - start_time
        print('======================================================')
        print('Training Done!')
        print('Total Time: {}s'.format(total_time))
        print('======================================================')
Code example #11
0
File: preprocess.py  Project: LeanderLXZ/stock
    def save_data(self):
        """Pickle all preprocessed train/test arrays, plus the optional
        grouped-feature and code-id arrays."""
        print('======================================================')
        print('Saving Preprocessed Data...')
        arrays = ((self.x_train, 'x_train.p'),
                  (self.y_train, 'y_train.p'),
                  (self.w_train, 'w_train.p'),
                  (self.e_train, 'e_train.p'),
                  (self.y_test, 'y_test.p'),
                  (self.x_test, 'x_test.p'),
                  (self.w_test, 'w_test.p'),
                  (self.e_test, 'e_test.p'),
                  (self.pct_test, 'pct_test.p'),
                  (self.id_test, 'id_test.p'))
        for data, file_name in arrays:
            utils.save_data_to_pkl(data, self.preprocess_path + file_name)

        # Grouped features exist only when a group list is configured.
        if group_list is not None:
            utils.save_data_to_pkl(self.x_g_train,
                                   self.preprocess_path + 'x_g_train.p')
            utils.save_data_to_pkl(self.x_g_test,
                                   self.preprocess_path + 'x_g_test.p')
        if self.use_code_id:
            utils.save_data_to_pkl(self.code_id_train,
                                   self.preprocess_path + 'code_id_train.p')
            utils.save_data_to_pkl(self.code_id_test,
                                   self.preprocess_path + 'code_id_test.p')
Code example #12
0
  def _save_data(self):
    """
    Save the preprocessed train/valid/test splits to pickle files.
    """
    utils.thin_line()
    print('Saving pickle files...')

    utils.check_dir([self.preprocessed_path])

    splits = (('x_train.p', self.x_train), ('y_train.p', self.y_train),
              ('x_valid.p', self.x_valid), ('y_valid.p', self.y_valid),
              ('x_test.p', self.x_test), ('y_test.p', self.y_test))
    for file_name, data in splits:
      utils.save_data_to_pkl(data, join(self.preprocessed_path, file_name))
Code example #13
0
File: preprocess.py  Project: leiseraiesecqd/ccf
    def save_data(self):
        """Pickle the preprocessed train/test features, labels and test IDs."""
        print('======================================================')
        print('Saving Preprocessed Data...')
        for data, file_name in ((self.x_train, 'x_train.p'),
                                (self.x_g_train, 'x_g_train.p'),
                                (self.y_train, 'y_train.p'),
                                (self.x_test, 'x_test.p'),
                                (self.x_g_test, 'x_g_test.p'),
                                (self.id_test, 'id_test.p')):
            utils.save_data_to_pkl(data, self.preprocess_path + file_name)
Code example #14
0
File: preprocess.py  Project: LeanderLXZ/stock
    def save_data_by_era_distribution_pd(self):
        """Pickle the era-distribution split: positive- and negative-era
        training subsets, plus optional grouped features."""
        print('======================================================')
        print('Saving Preprocessed Data Split by Era Distribution...')

        # Positive-era subset
        print('Saving Positive Data...')
        positive = (('x_train_p.p', self.x_train_p),
                    ('y_train_p.p', self.y_train_p),
                    ('w_train_p.p', self.w_train_p),
                    ('e_train_p.p', self.e_train_p))
        for file_name, data in positive:
            utils.save_data_to_pkl(data, self.preprocess_path + file_name)

        # Negative-era subset
        print('Saving Negative Data...')
        negative = (('x_train_n.p', self.x_train_n),
                    ('y_train_n.p', self.y_train_n),
                    ('w_train_n.p', self.w_train_n),
                    ('e_train_n.p', self.e_train_n))
        for file_name, data in negative:
            utils.save_data_to_pkl(data, self.preprocess_path + file_name)

        # Grouped features exist only when a group list is configured.
        if group_list is not None:
            utils.save_data_to_pkl(self.x_g_train_p,
                                   self.preprocess_path + 'x_g_train_p.p')
            utils.save_data_to_pkl(self.x_g_train_n,
                                   self.preprocess_path + 'x_g_train_n.p')
Code example #15
0
File: preprocess.py  Project: LeanderLXZ/stock
    def save_global_valid_set(self):
        """Pickle the global validation split (x, y, w, e), plus optional
        grouped features and code IDs."""
        print('======================================================')
        print('Saving Validation Set...')
        for data, file_name in ((self.x_valid, 'x_global_valid.p'),
                                (self.y_valid, 'y_global_valid.p'),
                                (self.w_valid, 'w_global_valid.p'),
                                (self.e_valid, 'e_global_valid.p')):
            utils.save_data_to_pkl(data, self.preprocess_path + file_name)

        # Grouped features exist only when a group list is configured.
        if group_list is not None:
            utils.save_data_to_pkl(self.x_g_valid,
                                   self.preprocess_path + 'x_g_global_valid.p')
        if self.use_code_id:
            utils.save_data_to_pkl(
                self.code_id_valid,
                self.preprocess_path + 'code_id_global_valid.p')
Code example #16
0
    def save_data(self):
        """Pickle the GAN-preprocessed train and test feature arrays."""
        print('Saving Preprocessed Data...')

        for data, file_name in ((self.x_train, 'x_train_gan.p'),
                                (self.x_test, 'x_test_gan.p')):
            utils.save_data_to_pkl(data, self.preprocessed_path + file_name)
Code example #17
0
    def train(self, similarity_prob_path=None, global_epochs=1, return_similarity_prob=False):
        """Train the adversarial-validation GAN and pickle similarity probs.

        Runs `global_epochs` independent GAN trainings (TF1 graph mode): a
        discriminator learns to separate test rows from generator samples,
        and after each run its similarity score on the training rows is
        recorded. The mean over runs is pickled.

        Args:
            similarity_prob_path: directory prefix for the output pickle.
                NOTE(review): the default None would fail at the final
                save (None + str); callers appear to always pass a path —
                confirm.
            global_epochs: number of independent GAN trainings to average.
            return_similarity_prob: when True, also return the mean probs.

        Returns:
            The mean similarity probabilities when return_similarity_prob
            is True, otherwise None.
        """
        print('======================================================')
        print('Training GAN for Adversarial Validation Set...')
        print('------------------------------------------------------')

        # Build Network
        tf.reset_default_graph()
        train_graph = tf.Graph()

        with train_graph.as_default():

            # Get inputs
            inputs_real, inputs_z, keep_prob = self.model_inputs()

            # Get losses
            d_loss, g_loss = self.model_loss(inputs_real, inputs_z, keep_prob)

            # Get optimizers
            d_train_opt, g_train_opt = self.model_opt(d_loss, g_loss)

            # Get similarities
            similarities = self.get_similarity(inputs_real, keep_prob)

            # Get generator
            g_outputs = self.get_generator(inputs_z, keep_prob)

        batch_counter = 0
        similarity_prob_total = []

        with tf.Session(graph=train_graph) as sess:

            local_start_time = time.time()

            for global_epoch_i in range(global_epochs):

                print('======================================================')
                print('Training on Global Epoch: {}/{}'.format(global_epoch_i+1, global_epochs))
                print('------------------------------------------------------')

                # Fix: shuffle a copy. np.random.shuffle works in place, so
                # the original alias (x_test = self.x_test) permanently
                # scrambled the instance's test set on every global epoch.
                x_test = np.array(self.x_test)
                np.random.shuffle(x_test)

                # Re-initialize weights so each global epoch is an
                # independent training run.
                sess.run(tf.global_variables_initializer())

                for epoch_i in range(self.epochs):

                    for batch_i, x_batch in enumerate(self.get_batches(x_test, self.batch_size)):

                        batch_counter += 1

                        # Sample random noise
                        batch_z = np.random.uniform(0, 1, size=(self.batch_size, self.z_dim))

                        # Run optimizers: d_epochs discriminator steps, then
                        # g_epochs generator steps per batch.
                        for _ in range(self.d_epochs):
                            sess.run(d_train_opt, feed_dict={inputs_real: x_batch,
                                                             inputs_z: batch_z,
                                                             keep_prob: self.keep_prob})
                        for _ in range(self.g_epochs):
                            sess.run(g_train_opt, feed_dict={inputs_real: x_batch,
                                                             inputs_z: batch_z,
                                                             keep_prob: self.keep_prob})

                        if batch_counter % self.display_step == 0 and batch_i > 0:

                            # Evaluate current losses with dropout disabled.
                            d_cost = d_loss.eval({inputs_real: x_batch, inputs_z: batch_z, keep_prob: 1.0})
                            g_cost = g_loss.eval({inputs_z: batch_z, keep_prob: 1.0})

                            total_time = time.time() - local_start_time

                            print('Global_Epoch: {}/{} |'.format(global_epoch_i+1, global_epochs),
                                  'Epoch: {}/{} |'.format(epoch_i + 1, self.epochs),
                                  'Batch: {:>5} |'.format(batch_counter),
                                  'Time: {:>3.2f}s |'.format(total_time),
                                  'd_Loss: {:.8f} |'.format(d_cost),
                                  'g_Loss: {:.8f}'.format(g_cost))

                        if batch_counter % self.show_step == 0 and batch_i > 0:

                            example_z = np.random.uniform(0, 1, size=(self.batch_size, self.z_dim))

                            # Show generator samples and how similar the
                            # discriminator finds them vs. real train rows.
                            generator_outputs = sess.run(g_outputs, feed_dict={inputs_z: example_z, keep_prob: 1.0})
                            g_similarity_prob = \
                                sess.run(similarities, feed_dict={inputs_real: generator_outputs, keep_prob: 1.0})
                            t_similarity_prob = \
                                sess.run(similarities, feed_dict={inputs_real: self.x_train, keep_prob: 1.0})

                            print('------------------------------------------------------')
                            print('Generator Outputs:\n', generator_outputs[0])
                            print('------------------------------------------------------')
                            print('Similarity Prob of Generator Outputs:\n', g_similarity_prob[:50].reshape(1, -1))
                            print('------------------------------------------------------')
                            print('Similarity Prob of Train Set:\n', t_similarity_prob[:50].reshape(1, -1))
                            print('------------------------------------------------------')

                print('------------------------------------------------------')
                print('Calculating Similarities of Train Set...')
                similarity_prob = \
                    sess.run(similarities, feed_dict={inputs_real: self.x_train, keep_prob: 1.0})

                similarity_prob_total.append(similarity_prob)

            print('======================================================')
            print('Calculating Final Similarities of Train Set...')
            similarity_prob_mean = np.mean(np.array(similarity_prob_total), axis=0)

            utils.save_data_to_pkl(similarity_prob_mean, similarity_prob_path + 'similarity_prob.p')

            if return_similarity_prob:
                return similarity_prob_mean
Code example #18
0
File: preprocess.py  Project: LeanderLXZ/stock
    def split_data_by_gan(self,
                          load_pickle=True,
                          sample_ratio=None,
                          sample_by_era=True,
                          generate_mode='valid'):
        """Split off an adversarial validation set using GAN similarity.

        Training rows most similar to the test distribution are routed to
        the validation set (generate_mode='valid') or kept for training
        (generate_mode='train'); sampling is per era when sample_by_era.

        Args:
            load_pickle: load cached similarity probabilities instead of
                retraining the GAN.
            sample_ratio: fraction of rows moved per era (or overall).
            sample_by_era: sample within each era separately.
            generate_mode: 'valid' or 'train' (see above).

        Raises:
            ValueError: if generate_mode is neither 'valid' nor 'train'.
        """
        print('======================================================')
        print('Splitting Adversarial Validation Set by GAN...')

        if load_pickle:
            similarity_prob = utils.load_pkl_to_data(cfg.gan_prob_path +
                                                     'similarity_prob.p')
        else:
            similarity_prob = \
                GenerateValidation.train(train_path=cfg.train_csv_path, test_path=cfg.test_csv_path, global_epochs=1,
                                         similarity_prob_path=cfg.gan_prob_path, return_similarity_prob=True,
                                         load_preprocessed_data=True)

        valid_idx = []
        train_idx = []

        if sample_by_era:

            # Group similarity probs and row indices by era; e_train is
            # assumed sorted so each era forms one contiguous run.
            similarity_prob_e = []
            index_e = []
            similarity_prob_all = []
            index_all = []
            era_tag = 0
            era_all = [era_tag]

            for idx, era in enumerate(self.e_train):

                if idx == len(self.e_train) - 1:
                    # Last row: close out the current era group.
                    similarity_prob_e.append(similarity_prob[idx])
                    index_e.append(idx)
                    similarity_prob_all.append(similarity_prob_e)
                    index_all.append(index_e)
                elif era_tag == era:
                    similarity_prob_e.append(similarity_prob[idx])
                    index_e.append(idx)
                else:
                    # Era changed: flush finished group, start a new one.
                    era_tag = era
                    era_all.append(era)
                    similarity_prob_all.append(similarity_prob_e)
                    index_all.append(index_e)
                    similarity_prob_e = [similarity_prob[idx]]
                    index_e = [idx]

            for e, similarity_prob_e in enumerate(similarity_prob_all):

                n_sample_e = int(len(similarity_prob_e) * sample_ratio)
                # NOTE(review): the 2-D slice ([:, ...]) of argsort output
                # and fancy-indexing index_all[e] assume 2-D numpy arrays —
                # confirm the shapes produced upstream.
                most_similar_idx_e = np.argsort(
                    similarity_prob_e)[:, :-(n_sample_e + 1):-1]
                least_similar_idx_e = np.argsort(
                    similarity_prob_e)[:, :len(similarity_prob_e) - n_sample_e]

                if generate_mode == 'valid':
                    valid_idx += list(index_all[e][most_similar_idx_e])
                    train_idx += list(index_all[e][least_similar_idx_e])
                elif generate_mode == 'train':
                    train_idx += list(index_all[e][most_similar_idx_e])
                    valid_idx += list(index_all[e][least_similar_idx_e])
                else:
                    raise ValueError("Wrong 'generate_mode'!")
        else:

            n_sample = int(len(similarity_prob) * sample_ratio)
            most_similar_idx = np.argsort(similarity_prob)[:, :-(n_sample +
                                                                 1):-1]
            least_similar_idx = np.argsort(
                similarity_prob)[:, :len(similarity_prob) - n_sample]

            if generate_mode == 'valid':
                valid_idx = most_similar_idx
                train_idx = least_similar_idx
            elif generate_mode == 'train':
                train_idx = least_similar_idx
                valid_idx = most_similar_idx
            else:
                raise ValueError("Wrong 'generate_mode'!")

        # Generate Validation Set (before the train arrays are overwritten)
        self.x_valid = self.x_train[valid_idx]
        # Fix: the original took validation labels from x_train
        # (`self.y_valid = self.x_train[valid_idx]`), storing features as
        # labels. Take them from y_train instead.
        self.y_valid = self.y_train[valid_idx]

        # Generate Training Set
        self.x_train = self.x_train[train_idx]
        self.y_train = self.y_train[train_idx]
        self.w_train = self.w_train[train_idx]
        self.e_train = self.e_train[train_idx]

        if group_list is not None:
            self.x_g_valid = self.x_g_train[valid_idx]
            self.x_g_train = self.x_g_train[train_idx]

        # Save Adversarial Validation Set
        print('Saving Adversarial Validation Set...')
        utils.save_data_to_pkl(self.x_valid,
                               self.preprocess_path + 'x_valid.p')
        # NOTE(review): x_g_valid is only assigned above when group_list is
        # not None but is saved unconditionally here — confirm it is set
        # elsewhere (e.g. __init__) for the group-less configuration.
        utils.save_data_to_pkl(self.x_g_valid,
                               self.preprocess_path + 'x_g_valid.p')
        utils.save_data_to_pkl(self.y_valid,
                               self.preprocess_path + 'y_valid.p')