def to_string(self):
     finetune_config_json = {
         # "data_dir": self.data_dir,
         "model_type": self.model_type,
         "model_name_or_path": self.model_name_or_path,
         "task_name": self.task_name,
         "output_dir": self.output_dir
     }
     # self.logger.info("Finetune bert config info = %s" %(finetune_config_json))
     info("Finetune bert config info = %s" %(finetune_config_json))
     return finetune_config_json
def pretrained_models_download():
    if not os.path.exists(en_model_file):
        # os.makedirs(en_model_path)
        info("download en model...")
        os.system("wget -P {} http: // 120.27.216.109:8011/nlp/en_distilroberta/pytorch_model.bin".format(en_model_path))


    if not os.path.exists(zh_model_file):
        info("download zh model...")
        os.system("wget -P {} http://120.27.216.109:8011/nlp/zh_albert_base/pytorch_model.bin".format(zh_model_path))


    pass
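
# Hedged sketch (not part of the original module): the same download step using the
# standard library instead of shelling out to wget. en_model_path / en_model_file and
# the mirror URL come from the code above; the helper name is hypothetical.
import os
import urllib.request

def download_if_missing(url, target_dir, target_file):
    """Download url into target_dir unless target_file already exists."""
    if os.path.exists(target_file):
        return
    os.makedirs(target_dir, exist_ok=True)
    urllib.request.urlretrieve(url, os.path.join(target_dir, os.path.basename(url)))

# Example usage, mirroring pretrained_models_download():
# download_if_missing("http://120.27.216.109:8011/nlp/en_distilroberta/pytorch_model.bin",
#                     en_model_path, en_model_file)
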
    def eval_model_process(self, test_x_data: list):

        results = {}
        checkpoints = [self.finetune_config.output_dir]
        if self.finetune_config.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(self.finetune_config.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            path_abs = os.path.abspath(checkpoint)
            info(path_abs)
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            model = self.finetune_config.model_class.from_pretrained(checkpoint)
            model.to(self.finetune_config.device)
            result, lr = self.evaluate_model(model, self.finetune_config.tokenizer, prefix=global_step)
            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)
    def model_predict_process(self, test_x_data: list, model):
        """
        :param test_x_data:
        :return: list of predicted labels for the samples.
        """
        # Initialize the test data processor: feed in test_x_data to build the corresponding test processors.
        # In test mode, test_y_labels are only stub data.
        test_y_labels = [1 for i in range(len(test_x_data))]
        self.finetune_config.setup_processors(test_x_data, test_y_labels, data_mode="test")

        # During training this is controlled by the evaluate flag.
        results = {}
        checkpoints = [self.finetune_config.output_dir]
        if self.finetune_config.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(self.finetune_config.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        # By default only one checkpoint is used.
        info("Evaluate the following checkpoints: %s", checkpoints)
        model.to(self.finetune_config.device)
        # Use model_predict here.
        predict_results = self.model_predict(model=model, tokenizer=self.finetune_config.tokenizer)
        return predict_results
 def setup_cuda(self):
     if self.local_rank == -1 or self.no_cuda:
         print("torch.cuda.is_available()",torch.cuda.is_available())
         device = torch.device("cuda" if torch.cuda.is_available() and not self.no_cuda else "cpu")
         self.n_gpu = torch.cuda.device_count()
         info("local_rank = %d, and n_gpu = %d" %(self.local_rank, self.n_gpu))
     else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
         torch.cuda.set_device(self.local_rank)
         device = torch.device("cuda", self.local_rank)
         torch.distributed.init_process_group(backend='nccl')
         self.n_gpu = 1
         info("local_rank = %d, and n_gpu = %d, will use nccl backend." %(self.local_rank, self.n_gpu))
     self.device = device
     info("Device original n_gpu = %d" %(self.n_gpu))
     # must be 1.
     self.n_gpu = 1
     info("Change n_gpu = 1 = %d" %(self.n_gpu))
     self.logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                self.local_rank, self.device, self.n_gpu, bool(self.local_rank != -1), self.fp16)
    def train_model_process(self, train_x_data: list, train_y_labels: list, model):
        train_start = time.time()
        # Initialize the train data processor: feed in the train data to build the corresponding train processors.
        self.finetune_config.setup_processors(train_x_data, train_y_labels, data_mode="train")

        # During training this is controlled by the evaluate flag.
        self.train_dataset = self.load_and_cache_examples(self.finetune_config.task_name, self.finetune_config.tokenizer, data_tag="train")
        info("self.train_dataset type = %s " %(type(self.train_dataset)))
        if model is None:
            info("finetune fisrt time on pretrain_model!")
            global_step, tr_loss, eval_logs, model  = self.train_model(self.train_dataset, self.finetune_config.model, self.finetune_config.tokenizer)
            self.model = model
        else:
            info("finetune Not fisrt time on finetune_model!")
            global_step, tr_loss, eval_logs, model = self.train_model(self.train_dataset, model,
                                                                      self.finetune_config.tokenizer)


        info(" global_step = %s, average loss = %s", global_step, tr_loss)

        train_end = time.time()
        info(eval_logs)
        info('train used {}sec'.format(train_end - train_start))
        return model
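    # Hedged usage sketch of how these *_process methods are typically chained; the
    # `finetuner` instance and the data variables are hypothetical, only the method
    # names and signatures come from this file:
    #     model = finetuner.train_model_process(train_x_data, train_y_labels, model=None)  # first finetune
    #     preds = finetuner.model_predict_process(test_x_data, model)                      # raw prediction scores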
    def train_model(self, train_dataset, model, tokenizer):
        """ Train the model """
        eval_logs = []
        if self.finetune_config.local_rank in [-1, 0]:
            pass

        # Multi-GPU used to be supported, but n_gpu is now forced to 1, so this has no effect.
        self.finetune_config.train_batch_size = self.finetune_config.per_gpu_train_batch_size * max(1, self.finetune_config.n_gpu)
        # Still local, not distributed.
        train_sampler = RandomSampler(train_dataset) if self.finetune_config.local_rank == -1 else DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=self.finetune_config.train_batch_size)

        if self.finetune_config.max_steps > 0:
            # A maximum number of training steps has been set.
            t_total = self.finetune_config.max_steps
            self.finetune_config.num_train_epochs = self.finetune_config.max_steps // (len(train_dataloader) // self.finetune_config.gradient_accumulation_steps) + 1
        else:
            # No upper limit on steps; derive the total purely from the length of train_dataloader.
            t_total = len(train_dataloader) // self.finetune_config.gradient_accumulation_steps * self.finetune_config.num_train_epochs

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': self.finetune_config.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.finetune_config.learning_rate, eps=self.finetune_config.adam_epsilon)
        # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.finetune_config.warmup_steps, t_total=t_total)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.finetune_config.warmup_steps, num_training_steps=t_total
        )
        if self.finetune_config.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            model, optimizer = amp.initialize(model, optimizer, opt_level=self.finetune_config.fp16_opt_level)

        # No multi-GPU support.
        # multi-gpu training (should be after apex fp16 initialization)
        info("n_gpu = %d, model origin type = %s" %(self.finetune_config.n_gpu, type(model)))
        if self.finetune_config.n_gpu > 1:
            info("n_gpu = %d, model origin type = %s" %(self.finetune_config.n_gpu, type(model)))
            model = torch.nn.DataParallel(model)
            info("n_gpu = %d, model after type = %s" %(self.finetune_config.n_gpu, type(model)))

        # -1 means local (non-distributed) mode.
        # Distributed training (should be after apex fp16 initialization)
        if self.finetune_config.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[self.finetune_config.local_rank],
                                                              output_device=self.finetune_config.local_rank,
                                                              find_unused_parameters=True)

        # Train!
        info("***** Running training *****")
        info("  Num examples = %d", len(train_dataset))
        info("  Num Epochs = %d", self.finetune_config.num_train_epochs)
        info("  Instantaneous batch size per GPU = %d", self.finetune_config.per_gpu_train_batch_size)
        info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                    self.finetune_config.train_batch_size * self.finetune_config.gradient_accumulation_steps * (
                        torch.distributed.get_world_size() if self.finetune_config.local_rank != -1 else 1))
        info("  Gradient Accumulation steps = %d", self.finetune_config.gradient_accumulation_steps)
        info("  Total optimization steps = %d", t_total)

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(self.finetune_config.num_train_epochs), desc="Epoch", disable=self.finetune_config.local_rank not in [-1, 0])
        self.finetune_config.set_seed()  # Added here for reproducibility (even between python 2 and 3)
        start_time = time.time()
        time_spent_evals = []
        i = 0

        time_train_iterator_start = time.time()
        for _ in train_iterator:
            i+=1
            info("Finetune bert, train_model train_iterator i = %d" %(i))
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.finetune_config.local_rank not in [-1, 0])
            for step, batch in enumerate(epoch_iterator):
                model.train()
                batch = tuple(t.to(self.finetune_config.device) for t in batch)
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if self.finetune_config.model_type != "distilbert":
                    inputs["token_type_ids"] = (
                        batch[2] if self.finetune_config.model_type in ["bert", "xlnet", "albert"] else None
                    )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids

                outputs = model(**inputs)
                loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

                if self.finetune_config.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training
                if self.finetune_config.gradient_accumulation_steps > 1:
                    loss = loss / self.finetune_config.gradient_accumulation_steps

                if self.finetune_config.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.finetune_config.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.finetune_config.max_grad_norm)

                tr_loss += loss.item()
                if (step + 1) % self.finetune_config.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logging_loss = tr_loss
                if self.finetune_config.max_steps > 0 and global_step > self.finetune_config.max_steps:
                    epoch_iterator.close()
                    break
            if self.finetune_config.max_steps > 0 and global_step > self.finetune_config.max_steps:
                train_iterator.close()
                break

        if self.finetune_config.local_rank in [-1, 0]:
            pass

        time_train_iterator_end = time.time()
        self.finetune_summary_info["train_iterator_time"] = time_train_iterator_end - time_train_iterator_start

        # fixme: always save a checkpoint at the end. Save model checkpoint
        output_dir = os.path.join(self.finetune_config.output_dir, 'checkpoint-{}'.format(global_step))
        import pathlib
        pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

        model_to_save = model.module if hasattr(model,
                                                'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        info("Finally Saving model checkpoint to %s", output_dir)
        return global_step, tr_loss / global_step, eval_logs, model
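    # Hedged sketch: the checkpoint saved above with save_pretrained() can later be
    # reloaded with the matching model class, which is exactly what eval_model_process()
    # does; output_dir stands for the checkpoint directory written above:
    #     model = self.finetune_config.model_class.from_pretrained(output_dir)
    #     model.to(self.finetune_config.device)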
    def eval_bert_simple(self, simple_model, model, tokenizer, prefix=""):
        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_task_names = ("mnli", "mnli-mm") if self.finetune_config.task_name == "mnli" else (self.finetune_config.task_name,)
        eval_outputs_dirs = (self.finetune_config.output_dir, self.finetune_config.output_dir + '-MM') if self.finetune_config.task_name == "mnli" else (self.finetune_config.output_dir,)

        results = {}
        results['stage']='eval_bert_simple'
        for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
            eval_dataset = self.load_and_cache_examples(eval_task, tokenizer, evaluate=True)

            if not os.path.exists(eval_output_dir) and self.finetune_config.local_rank in [-1, 0]:
                os.makedirs(eval_output_dir)

            self.finetune_config.eval_batch_size = self.finetune_config.per_gpu_eval_batch_size * max(1, self.finetune_config.n_gpu)
            # Note that DistributedSampler samples randomly
            eval_sampler = SequentialSampler(eval_dataset) if self.finetune_config.local_rank == -1 else DistributedSampler(eval_dataset)
            eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=self.finetune_config.eval_batch_size)

            # Eval!
            info("***** Running bert simple evaluation {} *****".format(prefix))
            info("  Num examples = %d", len(eval_dataset))
            info("  Batch size = %d", self.finetune_config.eval_batch_size)
            eval_loss = 0.0
            nb_eval_steps = 0
            preds = None
            out_label_ids = None
            bert_embedding = None
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                model.eval()
                batch = tuple(t.to(self.finetune_config.device) for t in batch)

                with torch.no_grad():
                    inputs = {'input_ids': batch[0],
                              'attention_mask': batch[1],
                              'token_type_ids': batch[2] if self.finetune_config.model_type in ['bert', 'xlnet'] else None,
                              # XLM and RoBERTa don't use segment_ids
                              'labels': batch[3]}

                    bert_model = model.bert
                    a = bert_model(batch[0])
                    batch_hidden_state = a[1]

                    outputs = model(**inputs)
                    tmp_eval_loss, logits = outputs[:2]

                    eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if preds is None:
                    preds = logits.detach().cpu().numpy()
                    out_label_ids = inputs['labels'].detach().cpu().numpy()
                    bert_embedding = batch_hidden_state.detach().cpu().numpy()
                else:
                    preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                    bert_embedding = np.append(bert_embedding, batch_hidden_state.detach().cpu().numpy(), axis=0)
                    out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
            # need preds as floating point and out_label_ids

            eval_loss = eval_loss / nb_eval_steps
            if self.finetune_config.output_mode == "classification":
                probs = self.sigmoid(preds)
                # preds = np.argmax(preds, axis=1)
                preds = np.argmax(probs, axis=1)

                #bert_classifier_auc = roc_auc_score(out_label_ids, preds)
                bert_classifier_auc = np_array_auc(out_label_ids, preds)
                #bert_classifier_auc = roc_auc_score(out_label_ids, probs[:, 1])

                #bert_probs = simple_model.predict_proba(bert_embedding)
                bert_probs = simple_model.predict(bert_embedding)
                #bert_simple_auc_on_eval = roc_auc_score(out_label_ids, bert_probs)#[:, 1]
                bert_simple_auc_on_eval = np_array_auc(out_label_ids, bert_probs)#[:, 1]
                info("bert auc on evaluate is {}".format(bert_simple_auc_on_eval))
                # print(auc)

            elif self.finetune_config.output_mode == "regression":
                preds = np.squeeze(preds)

            result = compute_metrics(eval_task, preds, out_label_ids)
            if self.finetune_config.output_mode == "classification":
                result['bert_cls_auc_on_eval'] = bert_classifier_auc
                result['bert_simple_auc_on_eval'] = bert_simple_auc_on_eval
            results.update(result)
            output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                info("***** Eval results {} *****".format(prefix))
                for key in sorted(result.keys()):
                    info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        return results
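    # Hedged sketch of what `simple_model` is assumed to be in eval_bert_simple(): a
    # lightweight classifier (e.g. a scikit-learn estimator, illustrative only) fitted
    # elsewhere on the pooled BERT embeddings collected the same way as above:
    #     from sklearn.linear_model import LogisticRegression
    #     simple_model = LogisticRegression(max_iter=1000)
    #     simple_model.fit(train_bert_embedding, train_label_ids)
    #     bert_probs = simple_model.predict(eval_bert_embedding)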
    def evaluate_model(self, model, tokenizer, prefix=""):
        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_task_names = ("mnli", "mnli-mm") if self.finetune_config.task_name == "mnli" else (self.finetune_config.task_name,)
        eval_outputs_dirs = (self.finetune_config.output_dir, self.finetune_config.output_dir + '-MM') if self.finetune_config.task_name == "mnli" else (self.finetune_config.output_dir,)

        results = {}
        for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
            eval_dataset = self.load_and_cache_examples(eval_task, tokenizer, data_tag = "dev")

            if not os.path.exists(eval_output_dir) and self.finetune_config.local_rank in [-1, 0]:
                os.makedirs(eval_output_dir)

            self.finetune_config.eval_batch_size = self.finetune_config.per_gpu_eval_batch_size * max(1, self.finetune_config.n_gpu)
            # Note that DistributedSampler samples randomly
            eval_sampler = SequentialSampler(eval_dataset) if self.finetune_config.local_rank == -1 else DistributedSampler(eval_dataset)
            eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=self.finetune_config.eval_batch_size)

            # Eval!
            info("***** Running evaluation {} *****".format(prefix))
            info("  Num examples = %d", len(eval_dataset))
            info("  Batch size = %d", self.finetune_config.eval_batch_size)
            eval_loss = 0.0
            nb_eval_steps = 0
            preds = None
            out_label_ids = None
            bert_embedding = None
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                model.eval()
                batch = tuple(t.to(self.finetune_config.device) for t in batch)

                with torch.no_grad():
                    # inputs = {'input_ids': batch[0],
                    #           'attention_mask': batch[1],
                    #           'token_type_ids': batch[2] if self.finetune_config.model_type in ['bert', 'xlnet'] else None,
                    #           # XLM and RoBERTa don't use segment_ids
                    #           'labels': batch[3]}
                    inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                    if self.finetune_config.model_type != "distilbert":
                        inputs["token_type_ids"] = (
                            batch[2] if self.finetune_config.model_type in ["bert", "xlnet", "albert"] else None
                        )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
                    info("model type = %s" %(type(model)))

                    # bert_model = model.bert
                    # a = bert_model(batch[0])
                    # batch_hidden_state = a[1]

                    outputs = model(**inputs)
                    tmp_eval_loss, logits = outputs[:2]

                    eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if preds is None:
                    preds = logits.detach().cpu().numpy()
                    out_label_ids = inputs['labels'].detach().cpu().numpy()
                    # bert_embedding = batch_hidden_state.detach().cpu().numpy()
                else:
                    preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                    # bert_embedding = np.append(bert_embedding, batch_hidden_state.detach().cpu().numpy(), axis=0)
                    out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
            # need preds as floating point and out_label_ids

            eval_loss = eval_loss / nb_eval_steps
            if self.finetune_config.output_mode == "classification":
                pass

            elif self.finetune_config.output_mode == "regression":
                preds = np.squeeze(preds)


            # Return the indices of the predicted labels.
            return preds
    def model_predict(self, model, tokenizer, prefix="", data_tag="test"):
        # Loop to handle MNLI double evaluation (matched, mis-matched)
        # Changing task_name has no effect here.
        # eval_task_names = ("mnli", "mnli-mm") if self.finetune_config.task_name == "mnli" else (self.finetune_config.task_name,)
        predict_task = self.finetune_config.task_name
        # eval_outputs_dirs = (self.finetune_config.output_dir, self.finetune_config.output_dir + '-MM') if self.finetune_config.task_name == "mnli" else (self.finetune_config.output_dir,)
        predict_output_dir = self.finetune_config.output_dir

        # for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        # Get the dataset to predict on; the behaviour is controlled by data_tag, and the processor must have been set up beforehand.
        # For test data the labels still have to exist so the input format matches, but unlike train/eval they are not used afterwards.

        time_load_cache_example_start = time.time()
        test_dataset = self.load_and_cache_examples(predict_task, tokenizer, data_tag = data_tag)
        time_load_cache_example_end = time.time()
        self.finetune_summary_info["test_loadcache_time"] = time_load_cache_example_end - time_load_cache_example_start
        self.time_checker.check("Test predict, load cache examples done.")

        # The intent is to create the output dir fresh; if it already exists it should be removed and recreated.
        if not os.path.exists(predict_output_dir) and self.finetune_config.local_rank in [-1, 0]:
            os.makedirs(predict_output_dir)
            pass
        # n_gpu = 1 here, so this has no effect.
        self.finetune_config.eval_batch_size = self.finetune_config.per_gpu_eval_batch_size * max(1, self.finetune_config.n_gpu)
        # Note that DistributedSampler samples randomly
        # Sample the raw data to produce batches.
        test_sampler = SequentialSampler(test_dataset) if self.finetune_config.local_rank == -1 else DistributedSampler(test_dataset)
        # DataLoader; it can be iterated like a list of batches.
        test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=self.finetune_config.eval_batch_size)

        # Predict for Test. Eval!
        info("***** Running prediction, for test{} *****".format(prefix))
        info("  Num examples = %d", len(test_dataset))
        info("  Batch size = %d", self.finetune_config.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        bert_embedding = None

        time_predict_batch_start = time.time()
        for batch in tqdm(test_dataloader, desc="Predicting"):
            # Note: unlike training (model.train()), we also call model.eval() at test time.
            model.eval()
            # Move each tensor of the data to be predicted onto the device (GPU).
            batch = tuple(t.to(self.finetune_config.device) for t in batch)

            with torch.no_grad():
                # inputs = {'input_ids': batch[0],
                #           'attention_mask': batch[1],
                #           'token_type_ids': batch[2] if self.finetune_config.model_type in ['bert', 'xlnet'] else None,
                #           # XLM and RoBERTa don't use segment_ids
                #           'labels': batch[3]}
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if self.finetune_config.model_type != "distilbert":
                    inputs["token_type_ids"] = (
                        batch[2] if self.finetune_config.model_type in ["bert", "xlnet", "albert"] else None
                    )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids

                info("model type = %s" %(type(model)))

                # This was only for obtaining the embedding from BERT's second-to-last layer.
                # bert_model = model.bert
                # a = bert_model(batch[0])
                # batch_hidden_state = a[1]

                # Predict directly with the model.
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                # Move the predictions from GPU to CPU and convert to numpy.
                preds = logits.detach().cpu().numpy()
                # Not used at test time; there is no evaluation and no loss computation.
                out_label_ids = inputs['labels'].detach().cpu().numpy()
                # bert_embedding = batch_hidden_state.detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                # bert_embedding = np.append(bert_embedding, batch_hidden_state.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

        time_predict_batch_end = time.time()
        self.finetune_summary_info["test_predict_batch_time"] = time_predict_batch_end - time_predict_batch_start

        self.time_checker.check("Test predict, predict batches done.")

        # need preds as floating point and out_label_ids

        # Inspect the raw predictions: preds are the prediction scores (numpy); out_label_ids are the label ids (numpy).
        #info("Predict preds, origin scores type = %s, shape = %s, content-top3= %s" %(type(preds), preds.shape, str(preds[:3])))
        #info("Only Show preparetion, do not train or eval, output_label_ids, type = %s, shape = %s, content-top3 = %s" %(type(out_label_ids), out_label_ids.shape, str(out_label_ids[:3])))
        # eval_loss = eval_loss / nb_eval_steps
        if self.finetune_config.output_mode == "classification":
            pass


        elif self.finetune_config.output_mode == "regression":
            preds = np.squeeze(preds)

        # Return the indices of the predicted labels.
        return preds
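    # Hedged sketch: in the classification case the preds returned by model_predict()
    # are raw logits, one row per sample; a caller can turn them into label indices and
    # map those back through the train processor's get_labels(), e.g.:
    #     predicted_label_ids = np.argmax(preds, axis=1)
    #     labels = [self.finetune_config.train_data_processor.get_labels()[i] for i in predicted_label_ids]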
    def load_and_cache_examples(self, task, tokenizer, data_tag="train"):
        """
        Returns a TensorDataset, which may hold train data or dev/test data.
        data_tag may be train/dev/test.
        :param task:
        :param tokenizer:
        :param data_tag:
        :return: a TensorDataset. On the first run the features are created and saved to a cache; on subsequent runs they are loaded directly from the cache.
        """
        if self.finetune_config.local_rank not in [-1, 0]:
            torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

        # processor = processors[task]()
        output_mode = self.finetune_config.output_mode
        # Load data features from cache or dataset file. The first time they come from the data processor's examples; afterwards from the cache.
        #cached_features_file = os.path.join(self.finetune_config.data_dir, 'cached_{}_{}_{}_{}'.format(
        # Distinguished by data_tag (train/dev/test), each in its own cache file.
        cached_features_file = os.path.join(os.path.dirname(__file__), 'cached_{}_{}_{}_{}'.format(
            # 'dev' if evaluate else 'train',
            data_tag,
            self.finetune_config.model_type,
            str(self.finetune_config.max_seq_length),
            str(task)))

        # On the first run the else branch is taken: examples are fetched from the *_processors and converted into features. After the first run only the if branch is taken.
        if os.path.exists(cached_features_file) and not ProjectPathConfig.If_Overwrite_cache:
            print("!!!!!!!!!!!!\n\n")
            info("Data_tag = %s, Loading features from cached file %s" %(data_tag, cached_features_file))
            features = torch.load(cached_features_file)
        else:
            # self.logger.info("Creating features from dataset file at %s", self.finetune_config.data_dir)
            # Note: label_list is obtained only from the train data processor.
            label_list = self.finetune_config.train_data_processor.get_labels()
            # Not used; the branch below is never taken.
            if task in ['mnli', 'mnli-mm'] and self.finetune_config.model_type in ['roberta']:
                # HACK(label indices are swapped in RoBERTa pretrained model)
                label_list[1], label_list[2] = label_list[2], label_list[1]

            # On the first pass evaluate is False, because we are training.
            if data_tag == "train":
                examples = self.finetune_config.train_data_processor.get_examples()
                info("Datatag = train, get examples from train_data_processor.")
                pass
            elif data_tag == "dev":
                examples = self.finetune_config.dev_data_processor.get_examples()
                info("Datatag = dev, get examples from dev_data_processor.")
                pass
            elif data_tag == "test":
                examples = self.finetune_config.test_data_processor.get_examples()
                info("Datatag = test, get examples from test_data_processor.")
                pass
            else:
                pass
            # examples = processor.get_dev_examples(self.finetune_config.data_dir) if evaluate else processor.get_train_examples(
            #     self.finetune_config.data_dir)
            # From examples to features.

            # fixme: instead of the preset max sequence length, use the max length of the current texts (dynamic sequence length)

            features = convert_examples_to_features(examples, label_list, self.finetune_config.max_seq_length, tokenizer, output_mode,
                                                    cls_token_at_end=bool(self.finetune_config.model_type in ['xlnet']),
                                                    # xlnet has a cls token at the end
                                                    cls_token=tokenizer.cls_token,
                                                    cls_token_segment_id=2 if self.finetune_config.model_type in ['xlnet'] else 0,
                                                    sep_token=tokenizer.sep_token,
                                                    sep_token_extra=False, # bool(self.finetune_config.model_type in ['roberta'])
                                                    # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                                                    pad_on_left=bool(self.finetune_config.model_type in ['xlnet']),
                                                    # pad on the left for xlnet
                                                    pad_token=tokenizer.encoder[tokenizer.pad_token] if self.finetune_config.model_type in [
                                                        'roberta'] else tokenizer.encode(tokenizer.pad_token)[0],
                                                        #'roberta'] else tokenizer.vocab[tokenizer.pad_token],
                                                    pad_token_segment_id=4 if self.finetune_config.model_type in ['xlnet'] else 0,
                                                    )
            if self.finetune_config.local_rank in [-1, 0]:
                info("Saving features into cached file %s", cached_features_file)
                # On the first run, save all features to the cache. Note: the train cache and the dev/test caches are separate.
                torch.save(features, cached_features_file)

        if self.finetune_config.local_rank == 0:
            torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        return dataset
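    # Hedged note on the tensor ordering: the TensorDataset built here determines the
    # batch layout consumed in train_model()/model_predict() above, i.e.:
    #     batch[0] -> input_ids, batch[1] -> attention_mask (input_mask),
    #     batch[2] -> token_type_ids (segment_ids), batch[3] -> labels (label_id)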
    def load_prertrained_models_tokenizer(self):
        info("Finetuen Training/evaluation parameters %s" %(self.to_string()))
        # Load pretrained model and tokenizer
        if self.local_rank not in [-1, 0]:
            torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

        self.model_type = self.model_type.lower()
        print("[info]model type",self.model_type)
        config_class, self.model_class, tokenizer_class = MODEL_CLASSES[self.model_type]

        info("load pretrained model, config class type = %s, model_class type = %s, tokenizer_class = %s" %(type(config_class),  type(self.model_class), type(tokenizer_class)))
        info("load pretrained model, config class = %s, model_class = %s, tokenizer_class = %s" %(config_class, self.model_class, tokenizer_class))

        self.config = config_class.from_pretrained(self.config_name if self.config_name else self.model_name_or_path,
                                              num_labels=self.num_labels, finetuning_task=self.task_name)

        info("load pretrained model, config type = %s, config = %s" %(type(self.config), self.config))

        self.tokenizer = tokenizer_class.from_pretrained(self.tokenizer_name if self.tokenizer_name else self.model_name_or_path,
                                                    do_lower_case=self.do_lower_case)

        info("load pretrained model, tokenizer type = %s, content = %s" %(type(self.tokenizer), self.tokenizer))

        self.model = self.model_class.from_pretrained(self.model_name_or_path, from_tf=bool('.ckpt' in self.model_name_or_path),
                                            config=self.config)
        info("load pretrained model, model type = %s" %(type(self.model)))

        ############################## Define which layers to freeze ######################################
        if self.model_type == 'distil_roberta':
            for param in self.model.roberta.encoder.layer[:3].parameters():
                param.requires_grad = False

        if self.local_rank == 0:
            info("load pretrained model, local_rank = 0 ")
            torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

        self.model.to(self.device)

        info("Training/evaluation parameters %s" %(self.to_string()))