예제 #1
0
 def _show_data_info(self, train_diff_x, val_diff_x):
     """Log the sizes of the train/validation data sampled for this call.

     :param train_diff_x: training samples for this call (sized with ``len``).
     :param val_diff_x: validation samples for this call; any falsy value
         (``None`` or empty) is reported as size 0.
     """
     train_size = len(train_diff_x)
     val_size = len(val_diff_x) if val_diff_x else 0
     split_msg = "train_x size is {} and val_x size is {}".format(train_size, val_size)
     info(color_msg(split_msg, color='green'))

     # The second line reports the row count of self.val_y, which the message
     # labels as the "full" validation size.
     full_msg = "Note: current full val_x size is {}".format(self.val_y.shape[0])
     info(color_msg(full_msg, color='green'))
예제 #2
0
    def output_first_stage_result(self):
        """Predict on ``self.x_test`` with the current model and blend the output.

        The raw prediction is appended to ``self.cur_model_test_res``.  It is
        then optionally merged with the other first-stage models (via
        ``_update_multi_model_result``) and with the stored SVM predictions in
        ``self.svm_test_result``.

        :return: the prediction to submit for this call.
        """
        pred = self.model.predict(self.x_test, batch_size=self.batch_size * 16)
        self.cur_model_test_res.append(pred)

        if self.train_model_id == 0 and not self.cur_model_train_start:
            info(color_msg("finish current model train", color='yellow'))
            # Slot 0 keeps the latest prediction of the first model.
            self.test_result_list[0] = self.cur_model_test_res[-1]

        if self.train_model_id >= 1:
            pred = self._update_multi_model_result(pred)

        if self.call_num == self.start_first_stage_call_num:
            # Spread between the best SVM score and the best AUC so far; a
            # small spread means the SVM predictions are worth averaging in.
            score_std = np.std([np.max(self.best_svm_scores), self.best_auc])
            info(color_msg("std is {}".format(score_std)))
            if self.is_best:
                if score_std < 0.008:
                    # Keep this prediction in the SVM pool permanently.
                    self.svm_test_result.append(pred)
                    pred = np.mean(self.svm_test_result, axis=0)
            elif score_std < 0.02:
                # Average with the pool, but do not keep this prediction in it.
                self.svm_test_result.append(pred)
                pred = np.mean(self.svm_test_result, axis=0)
                self.svm_test_result.pop(-1)
            else:
                pred = self.svm_test_result[-1]

        if self.multi_label:
            if self.train_model_id >= 1:
                pred = self._update_multi_model_result(pred)
            else:
                pred = self.output_first_stage_result_with_svm(pred)

        return pred
예제 #3
0
 def _show_runtime_info(self):
     """Log the currently selected model, feature mode and train_model_id."""
     model_template = "current model_id is {}, model_name is {}"
     feature_template = "current feature_id is {}, feature_name is {}"

     info(color_msg("********************************************************"))
     model_name = self.model_lib[self.model_id]
     info(color_msg(model_template.format(self.model_id, model_name)))
     feature_name = self.feature_lib[self.feature_id]
     info(color_msg(feature_template.format(self.feature_id, feature_name)))
     info(color_msg("train_model_id is {}".format(self.train_model_id)))
     info(color_msg("********************************************************\n"))
예제 #4
0
    def update_model_weights(self, model, train_epoch, is_svm=False):
        """Track the best validation AUC and snapshot / restore model weights.

        :param model: model exposing ``get_weights`` / ``set_weights``.
        :param train_epoch: index recorded as ``best_call_num`` when the
            current AUC is the best so far.
        :param is_svm: when true, no weight snapshot is stored for this call.
        """
        history_size = len(self.model_weights_list)
        info(color_msg(
            "train_epoch is {}, cur model: model_weight_list is {}\n".format(train_epoch, history_size)))

        if not self.model_weights_list:
            # No snapshot yet: the current evaluation is the best by definition.
            self.is_best = True
            self.best_auc = self.cur_val_auc
            self.best_call_num = train_epoch
        elif self.cur_val_auc > self.best_auc:
            info(color_msg("Update best result!"))
            self.best_auc = self.cur_val_auc
            self.is_best = True
            self.best_call_num = train_epoch
        else:
            # No improvement: roll the model back to the best stored weights.
            self.is_best = False
            model.set_weights(self.model_weights_list[self.best_call_num])

        if not is_svm:
            self.model_weights_list.append(model.get_weights())
예제 #5
0
    def test(self, x_test, remaining_time_budget):
        """Produce test predictions for the current call and advance ``call_num``.

        The prediction source depends on the pipeline stage: SVM while
        ``call_num < start_first_stage_call_num``, then the first-stage NN
        models, then the second stage, then the third stage.

        :param x_test: raw test samples; only stored on the first call
            (``call_num == 0``).
        :param remaining_time_budget: not used by this method.
        :return: the prediction chosen for this call.
        """
        if self.call_num == 0:
            # Cache and clean the test set once, on the first call.
            self.x_test_raw = x_test
            self.x_test_clean = self.feature_generator.preprocess_data(self.x_test_raw)

        # if self.metadata["language"] == "ZH" and self.call_num == 1:
        if self.metadata["language"] == "ZH" and self.call_num == 2:
            # feature.do_seg has been updated by now, so re-clean the raw test data
            self.x_test_clean = self.feature_generator.preprocess_data(self.x_test_raw)

        self.x_test = self.transform_test()

        # Output the SVM result
        if self.call_num < self.start_first_stage_call_num:
            result = self.output_svm_result()

        elif self.second_stage_done:
            result = self.output_third_stage_result()

        elif self.first_stage_done:
            if self.multi_label:
                if self.multi_label_cnt_thred<0:
                    result = self.cur_model_test_res[-1]
                    # NOTE(review): this early return skips the logging and the
                    # call_num / done_training bookkeeping below - confirm
                    # that this is intended.
                    return result
                else:
                    result = self.svm_test_result[-1]
            else:
                result = self.output_second_stage_result()

        else:
            ## Still in the first (NN model) stage: output its result
            result = self.output_first_stage_result()

        if self.first_stage_done:
            info(color_msg("check test_result_list 0:{}".format(self.test_result_list[0]),
                           color='yellow'))

        else:
            # NOTE(review): bare print next to info() logging - looks like leftover debug output.
            print(self.train_model_id)
            info(color_msg("check test_result_list:{}".format(self.test_result_list[self.train_model_id]),
                           color='yellow'))

        self.done_training = False
        self.call_num += 1
        if self.call_num == MAX_EPOCH_NUM:
            # Call budget exhausted: flag completion and drop the fine-tuned model reference.
            self.done_training = True
            self.ft_model = None
        return result
예제 #6
0
    def run_first_stage_model(self, train_preprocessed_data, train_diff_y):
        """Run one training round of the current first-stage NN model.

        When a model switch is pending and the new model has not started
        training, the training space is cleared, a new classifier is selected
        and its callbacks are chosen.  Every call then trains, updates the
        ensemble decision and checks the stop condition.

        :param train_preprocessed_data: cleaned training texts for this round.
        :param train_diff_y: labels matching ``train_preprocessed_data``.
        """
        # A model switch was requested and the new model has not started training yet.
        if self.switch_new_model and not self.cur_model_train_start:
            self._clear_train_space()
            self._init_nn_train_process()
            self.model = self.model_manager.select_classifier(model_name=self.model_lib[self.model_id],
                                                              feature_mode=self.feature_lib[self.feature_id],
                                                              data_feature=self.feature_generator.data_feature)
            info(color_msg("start new nn model training!"))

            if self.model_lib[self.model_id] == "text_cnn":
                # text_cnn runs without callbacks when imbalance_level is 2 or
                # there are at least 5 classes.
                if self.imbalance_level == 2 or self.metadata["class_num"] >= 5:
                    self.callbacks = []
                else:
                    self.callbacks = [self.callbacks_[0]]
            else:
                self.callbacks = [self.callbacks_[1]]

        self._train_nn_process(train_preprocessed_data, train_diff_y)

        # The original branched on ``train_model_id >= 1`` here, but both arms
        # made this same call, so the branch is removed.
        self._ensemble_multi_models()

        # Stop condition reached: reset the per-model training status.
        if self.evaluator.decide_stop(train_epoch=self.train_epoch):
            self._reset_train_status()
예제 #7
0
    def run_svm(self, model_name, train_x, train_y):
        """Build the SVM tokenizer, vectorise the training texts and fit a classifier.

        :param model_name: name passed to ``model_manager.select_classifier``.
        :param train_x: cleaned training texts.
        :param train_y: training labels; sliced as a 2-D array when truncated.
        :return: the fitted classifier.
        """
        self.feature_generator.tokenizer = None

        # Chinese data uses a character-level analyzer while call_num <= 1.
        char_level = self.metadata["language"] == "ZH" and self.call_num <= 1
        analyzer = "char" if char_level else "word"
        self.feature_generator.build_tokenizer(train_x, 'svm', analyzer)

        # The tokenizer above was built on all of train_x; only the fit is capped.
        if len(train_x) > MAX_SVM_FIT_NUM:
            info("Redo sample size limitation for SVM! Use up to 20000 samples")
            train_x = train_x[:MAX_SVM_FIT_NUM]
            train_y = train_y[:MAX_SVM_FIT_NUM, :]

        # Post-processing turns the texts into the feature matrix used for fitting.
        train_data = self.feature_generator.postprocess_data(train_x)
        shape_msg = "model type is {}, train_feature.shape:{}".format('svm', train_data.shape)
        info(color_msg(shape_msg))

        classifier = self.model_manager.select_classifier(
            model_name=model_name,
            feature_mode=None,
            data_feature=self.feature_generator.data_feature)

        # Keep the SVM tokenizer on the instance before fitting.
        self.svm_token = self.feature_generator.tokenizer

        # Multi-label fits on train_y as given; otherwise it is passed through ohe2cat first.
        fit_labels = train_y if self.multi_label else ohe2cat(train_y)
        classifier.fit(train_data, fit_labels)
        return classifier
예제 #8
0
    def output_third_stage_result(self):
        """Return the third-stage (BERT) prediction or the first-stage fallback.

        With ``use_pretrain_model`` set, the result is the mean of all stored
        BERT predictions, after adding a fresh one when ``update_bert`` is set.
        Otherwise the BERT mean is used only when ``bert_auc * 0.98`` exceeds
        ``best_auc``; in every other case the mean of the first three entries
        of ``test_result_list`` is returned.

        :return: averaged prediction array.
        """
        info(color_msg("First stage finish! Output Bert Result!!"))

        if self.use_pretrain_model:
            if self.update_bert:
                # The BERT model was updated: predict with the new parameters
                # and add the prediction to the ensemble pool.
                fresh_pred = self.ft_model.model_predict_process(self.x_test_clean, self.ft_model.model)
                self.best_bert_pred = fresh_pred
                self.bert_result.append(fresh_pred)
            # Ensemble of all stored BERT predictions.  The original checked
            # ``len(self.bert_result) > 0`` right after the append, which is
            # always true.
            return np.mean(self.bert_result, axis=0)

        if self.bert_auc * 0.98 > self.best_auc:
            # A BERT model has been trained and clearly beats the earlier stages.
            return np.mean(self.bert_result, axis=0)

        # NOTE(review): the original had three separate arms here (bert_auc > 0
        # with test_result_list[3] an int, bert_auc > 0 otherwise, and no BERT
        # at all) that all computed this same mean.  Confirm whether a 4-way
        # average was intended when test_result_list[3] holds a real result.
        return np.mean(self.test_result_list[:3], axis=0)
예제 #9
0
 def decide_stop(self, train_epoch):
     """Update the early-stop state and report whether training should stop.

     :param train_epoch: current training epoch, passed on to
         ``check_early_stop_criteria``.
     :return: the stop decision, also stored in ``self.stop_criteria``.
     """
     self.update_early_stop_params()
     self.val_auc_list.append(self.cur_val_auc)

     should_stop = self.check_early_stop_criteria(train_epoch)
     self.stop_criteria = should_stop

     stop_msg = "Note: stop condition is {}".format(should_stop)
     info(color_msg(stop_msg, color='blue'))
     return should_stop
예제 #10
0
 def lr_decay(self, epoch):
     """Return the learning rate for the next round, decaying it every third epoch.

     :param epoch: not used; the schedule is driven by ``self.train_epoch``.
     :return: the learning rate, never below 0.0001.
     """
     # (Re)start from the model manager's base rate on call 1 or when unset.
     needs_reset = self.call_num == 1 or self.cur_lr is None
     if needs_reset:
         self.cur_lr = self.model_manager.lr

     # Scale by 3/5 on every positive multiple of three epochs.
     if self.train_epoch > 0 and self.train_epoch % 3 == 0:
         self.cur_lr = 3 * self.cur_lr / 5

     self.cur_lr = max(self.cur_lr, 0.0001)
     info(color_msg("recompile lr {}".format(self.cur_lr), color="blue"))
     return self.cur_lr
예제 #11
0
    def _train_bert_process(self, train_x, train_y):
        """Run one fine-tuning round of the BERT model and update its bookkeeping.

        Configures ``self.ft_model.finetune_config``, optionally downsamples
        the training data, trains, evaluates on the stored validation set and
        then updates ``bert_auc``, ``bert_output_patience``, ``update_bert``
        and ``use_pretrain_model``.

        :param train_x: cleaned training texts.
        :param train_y: training labels; passed through ``ohe2cat`` before training.
        :return: None.
        """
        # Sequence length is 64 or 128, chosen from bert_check_length.
        if self.bert_check_length <= 64:
            self.ft_model.finetune_config.max_seq_length = 64
        else:
            self.ft_model.finetune_config.max_seq_length = 128

        if not self.start_ft_bert:
            # First fine-tuning round: set the initial schedule.
            self.ft_model.finetune_config.num_train_epochs = 2
            self.ft_model.finetune_config.per_gpu_train_batch_size = 16
            self.ft_model.finetune_config.warmup_steps = int(0.1 * (3000 // 16 * 2))
            if self.ft_model.finetune_config.max_seq_length==128:
                # Longer sequences: smaller batch, explicit learning rate, shorter warmup ratio.
                self.ft_model.finetune_config.learning_rate = 2e-5
                self.ft_model.finetune_config.per_gpu_train_batch_size = 8
                self.ft_model.finetune_config.warmup_steps = int(0.05 * (3000 // 8 * 2))

            # Release earlier state before fine-tuning (K is presumably the Keras backend - confirm).
            gc.collect()
            K.clear_session()
            #
            self.db_model = None
            self.start_ft_bert = True
            self.start_ft_bert_call_num = self.call_num
            info("start_ft_bert_call_num is {}".format(self.start_ft_bert_call_num))

        if self.call_num >= self.start_ft_bert_call_num + 2:  #
            # Two or more calls after the start: decay the learning rate by 3/5
            # (floor 1e-5), train a single epoch and disable warmup.
            self.ft_model.finetune_config.learning_rate = max(3 * self.ft_model.finetune_config.learning_rate / 5, 1e-5)
            self.ft_model.finetune_config.num_train_epochs = 1
            self.ft_model.finetune_config.warmup_steps = 0

        if len(train_x) > MAX_BERT_FIT_NUM:
            # Too many samples: downsample with a per-class cap of MAX_BERT_FIT_NUM / class_num.
            max_num = int(float(MAX_BERT_FIT_NUM) / float(self.metadata["class_num"]))
            _x_train, _y_train = downsampling_input_data(train_x, train_y, self.metadata["class_num"],
                                                         max_sample_num=max_num)
        else:
            _x_train, _y_train = train_x, train_y

        ft_model = self.ft_model.train_model_process(_x_train, ohe2cat(_y_train), self.ft_model.model)
        y_eval = self.ft_model.model_eval_process(self.clean_val_x, ohe2cat(self.val_y), ft_model)
        bert_auc = ATEvaluator.autodl_auc(solution=self.val_y, prediction=y_eval)
        info(color_msg("bert_auc is {} and best bert_auc is {}".format(bert_auc, self.bert_auc)))
        if bert_auc > self.bert_auc:
            info("update bert ft model!\n ")
            # Only consecutive rounds without an AUC gain count against patience;
            # a better AUC resets the patience counter.
            self.bert_output_patience = 3
            self.update_bert = True
            self.bert_auc = bert_auc
        else:
            self.bert_output_patience -= 1
            self.update_bert = False

        if self.bert_auc > self.best_auc:
            self.use_pretrain_model = True
            # NOTE(review): `selcet_svm` looks like a misspelling of `select_svm`;
            # confirm which attribute name the readers of this flag use.
            self.selcet_svm = False
            return
        else:
            info("update: model save and reload!")
            self.use_pretrain_model = False
            return
예제 #12
0
    def output_second_stage_result(self):
        """Return the prediction to use while the second stage is running.

        The result is either the ensemble including the second-stage model's
        prediction or a fallback built from the first-stage entries of
        ``self.test_result_list``.

        :return: prediction array for this call.
        """
        info("Output in second stage!")

        if not self.use_second_stage_model:
            info(
                color_msg(
                    "Do Not Use second_stage Model!! second_stage_patience is {}"
                    .format(self.second_stage_patience)))
            self.second_stage_patience += 1

            if self.start_second_stage_model:
                # Slot 2 still holding an int means no second-stage result is
                # stored, so only the first two slots are averaged.
                width = 2 if isinstance(self.test_result_list[2], int) else 3
                return np.mean(self.test_result_list[:width], axis=0)
            if self.train_model_id == 0:
                return self.test_result_list[0]
            return np.mean(self.test_result_list[:2], axis=0)

        # The second-stage model is in use.
        self.second_stage_patience = 0
        info(color_msg("Use second_stage Model!!"))
        second_stage_result = self.second_stage_model.test(self.x_test_raw)

        if second_stage_result.shape[0] == 0:
            # Empty second-stage output: fall back to the stored results.
            width = 2 if isinstance(self.test_result_list[2], int) else 3
        else:
            self.test_result_list[2] = second_stage_result
            width = 3
        return np.mean(self.test_result_list[:width], axis=0)
예제 #13
0
    def set_tokenizer(self, dat, tokenizer_type):
        """Build and store the tokenizer for the requested model family.

        :param dat: texts the tokenizer is built from.
        :param tokenizer_type: ``"svm"`` or ``'nn'``; any other value leaves
            the instance unchanged.
        """
        if tokenizer_type == "svm":
            self.tokenizer = build_tokenizer(dat, tokenizer_type, **self.tokenizer_conf)
            return
        if tokenizer_type != 'nn':
            return

        # NN tokenizer: size the sequence length and vocabulary first, then
        # pass both through the shared tokenizer configuration.
        self.set_max_seq_len()
        self.set_max_vocab_size(dat)
        self.tokenizer_conf['num_words'] = self.vocab_size
        info(color_msg("vocab_size:{}".format(self.vocab_size), color='blue'))
        self.tokenizer_conf['pad_max_length'] = self.max_length

        self.tokenizer = build_tokenizer(dat, tokenizer_type, **self.tokenizer_conf)
        self.word_index = self.tokenizer.word_index
        # One more than the word-index size, capped at vocab_size.
        self.num_features = min(len(self.word_index) + 1, self.vocab_size)
        info(color_msg("num_features:{}".format(self.num_features), color='blue'))
예제 #14
0
 def _update_multi_model_result(self, pred):
     """Merge the current model's prediction into the multi-model ensemble.

     :param pred: prediction of the model at index ``self.train_model_id``.
     :return: mean of ``test_result_list[:train_model_id + 1]``, or slot 0
         alone when the current model is not the best and its slot still
         holds an int.
     """
     slot = self.train_model_id
     if self.is_best:
         self.test_result_list[slot] = pred
     else:
         info(color_msg("do not ensemble cur model!!!!\n"))
         if isinstance(self.test_result_list[slot], int):
             # Nothing stored for this model yet: use the first model's result.
             return self.test_result_list[0]
     return np.mean(self.test_result_list[:slot + 1], axis=0)
예제 #15
0
 def get_sampling_data_frm_full_train(self):
     """Sample a training set from the accumulated train data.

     Only the current ``meta_train_x`` / ``meta_train_y`` are considered.

     :return: ``(train_x, train_y)`` as produced by ``extend_train_data``.
     """
     per_class_index = get_sample_index(self.meta_train_y, self.num_classes)

     # Label distribution before sampling.
     dist_before = np.array(self.meta_train_y).sum(axis=0)
     before_msg = "before sampling--train_distribution: {}".format(dist_before)
     info(color_msg(before_msg, color='yellow'))

     self.balance_sampling_index(per_class_index, dist_before)

     # Each round only checks whether the data to be sampled is balanced and
     # whether pseudo samples need to be generated.
     self.normal_std, self.empty_class_ = get_imbalance_statistic(dist_before)
     self.check_imbalance_level(dist_before)
     self.new_generate_samples_idx = self.generate_presudo_samples(per_class_index)
     self.show_data_info()
     self.imbalance_flg = False

     sampled_x, sampled_y = self.extend_train_data(x=self.meta_train_x, y=self.meta_train_y)

     # Label distribution after sampling.
     dist_after = np.array(sampled_y).sum(axis=0)
     after_msg = "after sampling--train_distribution: {}".format(dist_after)
     info(color_msg(after_msg, color='yellow'))
     return sampled_x, sampled_y
예제 #16
0
 def step_decay(self, epoch):
     """Return a step-decayed learning rate based on ``self.train_epoch``.

     :param epoch: not used; the value passed in is overwritten by the
         original code, and the schedule reads ``self.train_epoch`` instead.
     :return: the learning rate, never below 0.0001.
     """
     base_lr = self.model_manager.lr
     decay_factor = 0.65
     stages_per_drop = 1.0

     if self.train_epoch <= 2:
         # The first three epochs keep the base rate.
         new_lr = base_lr
     else:
         stage = self.train_epoch // 3
         exponent = math.floor((1 + stage) / stages_per_drop)
         new_lr = base_lr * math.pow(decay_factor, exponent)

     new_lr = max(new_lr, 0.0001)
     info(color_msg("recompile lr {}".format(new_lr), color="blue"))
     return new_lr
예제 #17
0
    def _set_sampling_strategy(self, y_train):
        """Pick the sampling strategy entry for the current call.

        :param y_train: labels delivered with this call; zero rows means no
            new incremental data arrived.
        :return: the chosen entry of ``sample_strategy``.
        """
        chosen = sample_strategy['sample_iter_incremental_no_train_split']
        has_new_data = y_train.shape[0] > 0

        if not has_new_data:
            # No incremental data left.
            if self.val_y.shape[0] > 0:
                # A validation set already exists.
                chosen = sample_strategy["sample_from_full_train_data"]
                info(color_msg("strategy is sample_from_full_train_data"))
            else:
                chosen = sample_strategy["sample_from_full_data"]
                info(color_msg("strategy is sample_from_full_data"))
        else:
            # New incremental data arrived with this call.
            if self.call_num == 0 or self.call_num >= self.start_cnn_call_num:
                chosen = sample_strategy['sample_iter_incremental_no_train_split']
                info(color_msg("strategy is sample_iter_incremental_no_train_split"))
            elif self.call_num < self.start_cnn_call_num:
                chosen = sample_strategy["sample_iter_incremental_with_train_split"]
                info(color_msg("strategy is sample_iter_incremental_with_train_split"))

            # This override wins over the choice made just above.
            if self.start_cnn_call_num == self.imbalance_flow_control and not self.split_val_x:
                chosen = sample_strategy["sample_from_full_data"]
                info(color_msg("strategy is sample_from_full_data"))

        summary = ("call num is {}: add_val_to_train is {}, "
                   "update_train is {}, use_full is {}").format(
            self.call_num,
            chosen["add_val_to_train"],
            chosen["update_train"],
            chosen["use_full"])
        info(color_msg(summary))
        return chosen
예제 #18
0
    def valid_auc(self, is_svm=False, model=None, use_autodl_auc=True):
        """Evaluate ``model`` on the stored validation data and set ``self.cur_val_auc``.

        :param is_svm: when true, ``self.x`` is transformed with
            ``self.tokenizer`` and scored with ``model.predict_proba``;
            otherwise ``model.predict_generator(self.eval_generator)`` is used.
        :param model: the model to evaluate; must be supplied, since the
            ``None`` default has no prediction methods.
        :param use_autodl_auc: selects ``ATEvaluator.autodl_auc`` (true) or
            ``ATEvaluator.auc_metric`` (false).
        """
        if is_svm:
            x_valid = self.tokenizer.transform(self.x)
            result = model.predict_proba(x_valid)
            result = self.rebuild_predict_prob(result)
            # Fix: the value logged is self.label.shape, so the message now
            # says so (it used to read "self.x shape").
            info(color_msg("self.label shape {}".format(self.label.shape)))
            info(color_msg("result shape {}".format(result.shape)))
        else:
            # NOTE(review): predict_generator is deprecated in recent Keras
            # releases - confirm the pinned version still provides it.
            result = model.predict_generator(self.eval_generator)

        metric = ATEvaluator.autodl_auc if use_autodl_auc else ATEvaluator.auc_metric
        self.cur_val_auc = metric(solution=self.label, prediction=result)

        info(
            color_msg("Note: cur_val_auc is {}".format(self.cur_val_auc),
                      color='blue'))
예제 #19
0
 def check_early_stop_criteria(self, train_epoch):
     """Evaluate the three early-stop criteria and return their disjunction.

     :param train_epoch: current training epoch.
     :return: truthy when any criterion holds.
     """
     # Criterion 1 ("db" scheme): the counter k has reached the patience, or
     # the epoch budget is exceeded.  Per the original note, k counts rounds
     # below the best AUC and targets oscillating models.
     early_stop_criteria_1 = self.k >= self.patience or train_epoch > self.max_epoch
     # Criterion 2 ("upwind" scheme 1): AUC is already high (> 0.96), the
     # epoch budget is exceeded and the AUC just dropped.
     # NOTE(review): train_epoch > self.max_epoch already satisfies criterion 1,
     # so this criterion never changes the outcome - confirm the intended bound.
     early_stop_criteria_2 = (self.cur_val_auc < self.last_val_auc
                              and self.cur_val_auc > 0.96
                              and train_epoch > self.max_epoch)
     # Criterion 3 ("upwind" scheme 2): at least 5 epochs and k >= 2, aimed
     # at models that fail to converge or overfit.
     early_stop_criteria_3 = train_epoch >= 5 and self.k >= 2
     # Fix: the third placeholder used to print criterion 2 a second time.
     info(
         color_msg("stop criteria 1 is {}, 2 is {}, 3 is {}".format(
             early_stop_criteria_1, early_stop_criteria_2,
             early_stop_criteria_3),
                   color='blue'))
     return (early_stop_criteria_1 or early_stop_criteria_2
             or early_stop_criteria_3)
예제 #20
0
 def output_first_stage_result_with_svm(self, result):
     """Blend a first-stage prediction with the stored SVM predictions.

     :param result: prediction of the current first-stage model.
     :return: the blended prediction, the unchanged input, or the latest
         stored SVM prediction, depending on ``is_best`` and on the spread
         between the best SVM score and the best AUC.
     """
     score_gap = np.std([np.max(self.best_svm_scores), self.best_auc])

     if self.is_best:
         info(color_msg("std is {}".format(score_gap)))
         if score_gap < 0.005:
             # Scores are close: keep this prediction in the SVM pool and average.
             self.svm_test_result.append(result)
             return np.mean(self.svm_test_result, axis=0)
         self.multi_label_cnt_thred -= 1
         return result

     if score_gap < 0.02:
         # Average with the pool, but do not keep this prediction in it.
         self.svm_test_result.append(result)
         blended = np.mean(self.svm_test_result, axis=0)
         self.svm_test_result.pop(-1)
         return blended
     return self.svm_test_result[-1]
예제 #21
0
    def prepare_clean_data(self, train_diff_x, val_diff_x, val_diff_y):
        """Clean the sampled training texts and refresh the validation data.

        :param train_diff_x: raw training texts sampled for this call.
        :param val_diff_x: raw validation texts, forwarded to ``process_val_data``.
        :param val_diff_y: validation labels, forwarded to ``process_val_data``.
        :return: the cleaned training texts.
        """
        # Pre-processing: clean the texts with the feature generator's current settings.
        cleaned_train = self.feature_generator.preprocess_data(train_diff_x)
        self.process_val_data(val_diff_x, val_diff_y)

        # On call 2 for Chinese data the validation texts are word-tokenized.
        zh_switch_call = self.call_num == 2 and self.metadata["language"] == "ZH"
        if zh_switch_call:
            info(color_msg("do tokenization for val data!"))
            self.clean_val_x = [_tokenize_chinese_words(text) for text in self.clean_val_x]

        if self.accu_nn_tokenizer_x:
            if zh_switch_call:
                # Drop the earlier (char-level) Chinese data and restart the
                # tokenizer corpus from this call's texts.
                self.build_tokenizer_x = cleaned_train
            else:
                self.build_tokenizer_x = self.build_tokenizer_x + cleaned_train

        return cleaned_train
예제 #22
0
    def sample_dataset_pipeline(self, use_val=False, update_train=True, data_x=None, data_y=None):
        """Global sampling pipeline.

        :param use_val: whether the validation split is folded back into the
            training data.
        :param update_train: whether the accumulated train data is updated
            from ``data_x`` / ``data_y``.
        :param data_x: source samples: incremental data or the full raw data.
        :param data_y: source labels: incremental data or the full raw data.
        :return: ``(train_x, train_y, val_diff_x, val_diff_y)``; the two
            validation items are ``None`` when ``use_val`` is true or
            ``update_train`` is false.
        """
        val_diff_x = val_diff_y = None

        # ---------------------- sampling preparation ----------------------
        if update_train:
            # Incremental update (the first batch counts as an increment).
            self.add_index, self.add_val_index = self.sample_val_index(data_y)
            val_diff_x, val_diff_y = map_x_y(self.add_val_index, data_x, data_y)
            # The training part has not been sampled at this point.
            train_diff_x, train_diff_y = flat_map_x_y(index=self.add_index, x=data_x, y=data_y)

            if use_val:
                # No train/valid split for this call: everything feeds meta_train.
                info(color_msg(msg="use val is True", color='blue'))
                train_diff_x = train_diff_x + val_diff_x
                train_diff_y = np.concatenate([train_diff_y, val_diff_y], axis=0)
                val_diff_x = val_diff_y = None

            self._update_train_meta(train_diff_x, train_diff_y)

        if val_diff_x:
            val_label_distribution = np.array(val_diff_y).sum(axis=0)
            info("val_distribution: {}".format(val_label_distribution))
            val_sizes = (len(val_diff_x), val_diff_y.shape[0])
            info("Check val_diff_x size {}, val_diff_y size {}".format(*val_sizes))

        train_sizes = (len(self.meta_train_x), self.meta_train_y.shape[0])
        info("Check meta_train_x size {}, meta_train_y size {}".format(*train_sizes))
        data_sizes = (len(self.meta_data_x), self.meta_data_y.shape[0])
        info("Check meta_data_x size {}, meta_data_y size {}".format(*data_sizes))

        # ---------------------- sampling ----------------------
        train_x, train_y = self.get_sampling_data_frm_full_train()
        return train_x, train_y, val_diff_x, val_diff_y
예제 #23
0
    def _ensemble_multi_models(self):
        """Decide whether the latest model joins the ensemble and record the outcome.

        Sets ``self.is_best``, raises ``self.best_auc`` when the model is
        accepted, appends the decision to ``self.best_cnt`` and, during the
        SVM calls, records the best score in ``self.best_svm_scores``.
        """
        summary = "cur best_auc is {} and val_auc is {}".format(self.best_auc, self.evaluator.best_auc)
        info(color_msg(summary, color='yellow'))

        if self.call_num <= self.start_first_stage_call_num:
            # Up to and including the first NN call: must beat the previous best.
            bar = self.best_auc
        else:
            # Later calls only need to exceed 97% of the previous best.
            bar = 0.97 * self.best_auc

        if self.evaluator.best_auc > bar:
            self.is_best = True  # the model may join the ensemble
            self.best_auc = max(self.evaluator.best_auc, self.best_auc)
        else:
            self.is_best = False

        self.best_cnt.append(self.is_best)

        still_in_svm_calls = self.call_num < self.start_first_stage_call_num
        if still_in_svm_calls:
            self.best_svm = self.best_auc
            self.best_svm_scores.append(self.best_svm)

        self._show_ensemble_info()
예제 #24
0
 def is_stage_done(self):
     """Set ``first_stage_done`` once the last model and last feature are selected."""
     status = "check stage: model id is {}, feature_id is {}".format(self.model_id, self.feature_id)
     info(color_msg(status))

     on_last_model = self.model_id == len(self.model_lib) - 1
     if on_last_model and self.feature_id == len(self.feature_lib) - 1:
         self.first_stage_done = True
예제 #25
0
 def _show_ensemble_info(self):
     """Log whether the latest model joined the ensemble, then the model list."""
     verdict = "ensemble new model!" if self.is_best else "do not ensemble new model!"
     info(color_msg(verdict, color='yellow'))
     info(color_msg("use models : {}".format(self.model_lib)))
예제 #26
0
    def output_second_stage_result(self):
        """Return the prediction to use while the second ("db") stage is running.

        The result is either the ensemble including the db model's prediction
        or a fallback built from the first-stage entries of
        ``self.test_result_list``.

        :return: prediction array for this call.
        """
        info("Output in second stage!")
        # Second stage not finished: use the db model or the best first-stage result.
        if self.use_db_model:
            self.db_patience = 0
            info(color_msg("Use db Model!!"))
            db_result = self.db_model.test(self.x_test_raw)

            # Diagnostics on what the db model returned.
            info(color_msg("db result is {}".format(type(db_result))))
            if isinstance(db_result, list):
                info(color_msg("db result is {}".format(len(db_result))))
            if isinstance(db_result, np.ndarray):
                info(color_msg("db result is {}".format(db_result.shape[0])))
            # Fix: np.float was removed in NumPy 1.24 and raises AttributeError
            # there.  Check the builtin float and NumPy floating scalars instead.
            if isinstance(db_result, (float, np.floating)):
                info(color_msg("db result is {}".format(db_result)))

            # NOTE(review): the ``.shape`` access below requires an ndarray-like
            # result; a plain list or float would fail here - confirm what
            # ``db_model.test`` returns.
            # Empty db output: fall back to the stored results.
            if db_result.shape[0] == 0:
                if isinstance(self.test_result_list[2], int):
                    result = np.mean(self.test_result_list[:2], axis=0)
                else:
                    result = np.mean(self.test_result_list[:3], axis=0)
                return result
            else:
                self.test_result_list[2] = db_result
                result = np.mean(self.test_result_list[:3], axis=0)
                return result
        else:
            info(color_msg("Do Not Use db Model!! db_patience is {}".format(self.db_patience)))
            self.db_patience += 1
            if self.start_db_model:
                if isinstance(self.test_result_list[2], int):
                    # Slot 2 still holds an int: no db result is stored.
                    info(color_msg("Do Not Use db Model and use First Stage Result!!!"))
                    info(color_msg("check test_result_list 2:{}".format(len(self.test_result_list[:2])),
                                   color='yellow'))
                    result = np.mean(self.test_result_list[:2], axis=0)
                else:
                    result = np.mean(self.test_result_list[:3], axis=0)
            else:
                info(color_msg("Start db, but use First Stage Result!"))
                if self.train_model_id == 0:
                    info(color_msg("Check test_Result_list:{}".format(self.test_result_list[1])))
                    info(color_msg("Start db, but use First Stage Result when train_model_id==0"))
                    result = self.test_result_list[0]
                else:
                    info(color_msg("Start db, but use First Stage Result when train_model_id==1"))
                    result = np.mean(self.test_result_list[:2], axis=0)
            return result
예제 #27
0
    def train(self, x_train, y_train, remaining_time_budget=None):
        """Run one training call of the multi-stage pipeline.

        Depending on ``call_num`` and the stage flags, this fits an SVM, trains
        a first-stage NN model, runs the second stage, or fine-tunes the
        third-stage (BERT) model.

        :param x_train: training texts delivered with this call.
        :param y_train: labels delivered with this call; zero rows means no
            new data arrived.
        :param remaining_time_budget: only forwarded to ``run_second_stage``.
        :return: None.
        """
        if self.done_training:
            return
        # Choose the call index at which the first NN stage starts.
        if not self.use_multi_svm and self.metadata["language"] == "EN":
            self.start_first_stage_call_num = 1
        if not self.use_multi_svm and self.metadata["language"] == "ZH":
            self.start_first_stage_call_num = 2
        if self.use_multi_svm and self.multi_label:
            self.start_first_stage_call_num = 2

        info(color_msg("use multi_svm {} and start_first_stage_call_num is {}".format(self.use_multi_svm,
                                                                                      self.start_first_stage_call_num)))

        if self.call_num == 0:
            self.init_generators(x_train, y_train)
            # The multi_label setting is applied last, so it wins when both flags are set.
            if self.use_multi_svm:
                self.evaluator.max_epoch = 15
            if self.multi_label:
                self.evaluator.max_epoch = 18

        else:
            # if self.call_num == 1 and self.metadata["language"] == "ZH":
            if self.call_num == 2 and self.metadata["language"] == "ZH":
                # From call 2 on, Chinese data is handled with do_seg enabled.
                self.feature_generator.do_seg = True

            if y_train.shape[0] > 0:
                self.data_manager.update_meta_data(x_train, y_train)

        info(color_msg("check imbalance level {}".format(self.imbalance_level)))

        # Data sampling
        train_diff_x, train_diff_y, val_diff_x, val_diff_y = self.do_data_sampling(y_train)
        self._show_data_info(train_diff_x, val_diff_x)

        ########################## Choose the preprocessing mode ##########################
        # whether to truncate
        # whether to segment words
        ########################## Data pre-processing ####################################
        train_preprocessed_data = self.prepare_clean_data(train_diff_x, val_diff_x, val_diff_y)

        ############################ SVM stage ############################
        if self.call_num < self.start_first_stage_call_num:
            self.model = self.run_svm('svm', train_preprocessed_data, train_diff_y)

            # Evaluate only when validation labels exist.
            if self.val_y.shape[0] > 0:
                self.do_evaluation(eval_svm=True)
                self.evaluator.update_model_weights(self.model, self.train_epoch, is_svm=True)
            self._ensemble_multi_models()

        else:

            if self.call_num == self.start_first_stage_call_num:
                # Model pre-selection
                self.meta_strategy()
                self.feature_generator.reset_tokenizer()
                # Set the text length and its std; they affect the pad length used in post-processing
                self.feature_generator.max_length = self.max_length
                self.feature_generator.seq_len_std = self.seq_len_std
                self.prepare_nn_tokenizer(train_x=self.build_tokenizer_x)
                # The NN tokenizer is built, so stop accumulating its corpus.
                self.accu_nn_tokenizer_x = False

            ############################ Third stage ############################
            if self.second_stage_done and self.use_ft_model:
                if self.start_ft_bert and not self.use_pretrain_model:
                    # Fine-tuning already started and the pretrained model is
                    # not in use: skip further fine-tuning.
                    self.use_pretrain_model = False
                    return
                else:
                    if self.bert_output_patience > 0:
                        return self._train_bert_process(train_preprocessed_data, train_diff_y)
                    else:
                        # Patience exhausted: stop using the pretrained model.
                        self.use_pretrain_model = False
                        return

            ############################ Second stage ############################

            elif self.first_stage_done:
                if not self.multi_label:
                    self.run_second_stage(remaining_time_budget)
                else:
                    info(color_msg("do not run db when multi_label is {}".format(self.multi_label)))
                    self.second_stage_done = False
            ############################ First stage ############################
            else:
                if self.switch_new_model and not self.cur_model_train_start:
                    self.is_stage_done()
                if not self.first_stage_done:
                    self.run_first_stage_model(train_preprocessed_data, train_diff_y)

        return
예제 #28
0
    def run_second_stage(self, remaining_time_budget):
        """Advance the second-stage ("db") model by one training iteration.

        On the first call the training space is cleared and the stage is
        marked as started.  Each call then either marks the second stage as
        done (all candidate models and data rounds consumed) or runs one
        ``db_model.train_iter`` step and decides, from the resulting score,
        whether the db model should be used (``self.use_db_model``).

        Args:
            remaining_time_budget: forwarded unchanged to
                ``self.db_model.train_iter``.

        Returns:
            None. Results are communicated through attributes on ``self``,
            ``self.db_model`` and ``self.evaluator``.
        """
        if not self.start_db_model:
            # First entry into the db stage: clear the existing session first.
            self._clear_train_space()
            self.start_db_model = True
            if self.imbalance_level == 2:
                info(color_msg("use bs sampling val for db solution!"))
                self.db_model.split_val = False

        # Bound only after the one-time setup above has run.
        db = self.db_model

        # Every candidate model and every data round has been consumed.
        candidates_exhausted = db.model_id == len(db.cand_models)
        if candidates_exhausted and db.data_id == db.max_data:
            self.second_stage_done = True
            info("finish second stage!")
            return

        # Patience exhausted: reset the db epoch counter and ask train_iter
        # to clean up before training.
        do_clean = False
        if self.db_patience >= self.db_patience_max_num:
            db.epoch = 1
            info(color_msg("Change DB model due to low auc!!!!"))
            self.db_patience = 0
            do_clean = True

        # Only the training tuple depends on split_val; the evaluation set is
        # always (val_x, val_y).
        if db.split_val:
            train_set = (self.data_manager.meta_data_x, self.data_manager.meta_data_y)
        else:
            train_set = (self.data_manager.meta_train_x, self.data_manager.meta_train_y)
        db.train_iter(train_set,
                      eval_dataset=(self.val_x, self.val_y),
                      remaining_time_budget=remaining_time_budget,
                      do_clean=do_clean)

        db_auc = db.best_sco  # already an ensemble result in itself
        self.evaluator.best_auc = db_auc
        # -1 and 0.02 are treated as "no usable score".
        # NOTE(review): presumably sentinel values emitted by db_model - confirm.
        if db_auc in (-1, 0.02):
            db_auc = 0.0

        # The db model is accepted when it reaches 97% of the current best
        # validation AUC and has a strictly positive score.
        accepted = db_auc >= self.best_auc * 0.97 and db_auc > 0.0
        if not accepted:
            info("Do not Use db_model when db_auc is {} and best_val_auc is {}".format(db_auc, self.best_auc))
            # The db model's internal state must be kept in sync here,
            # otherwise a state transition would be lost.
            # The explicit == comparisons are kept as-is so that non-bool flag
            # values behave exactly as before.
            if db.START == False and db.FIRSTROUND == False and db.LASTROUND:  # noqa: E712
                db.is_best = False
                db.LASTROUND = False
            elif db.START == True:  # noqa: E712 - the db START model did not beat the current one
                db.START = False

            info("update: model save and reload!")
            self.use_db_model = False
            return

        info("Use db_model when db_auc is {} and best_val_auc is {}".format(db_auc, self.best_auc))
        self.use_db_model = True
        if db.Xtest is None:
            if db.FIRSTROUND:
                info(color_msg("Use START STAGE MODEL"))
                db.START = True
            elif db.new_data:
                info(color_msg("Use NEW DATA STAGE MODEL"))
                db.START = False