Example #1
    def __init__(self, args, device='cpu'):
        print(args.bert_model)
        self.tokenizer = ElectraTokenizer.from_pretrained(args.bert_model)
        self.data_dir = args.data_dir
        file_list = get_json_file_list(args.data_dir)
        self.data = []
        for file_name in file_list:
            # Each JSON file holds one sample; mark it 1 if it comes from a
            # 'high' (high-school) file, else 0 (middle school).
            with open(file_name, 'r') as f:
                data = json.load(f)
            data['high'] = 1 if 'high' in file_name else 0
            self.data.append(data)
        self.data_objs = []
        high_cnt = 0
        middle_cnt = 0
        for sample in self.data:
            high_cnt += sample['high']
            middle_cnt += (1 - sample['high'])
            self.data_objs += self._create_sample(sample)
        print('high school sample:', high_cnt)
        print('middle school sample:', middle_cnt)
        # Map every sample's tokens to vocabulary ids, then cache to disk.
        for i in range(len(self.data_objs)):
            self.data_objs[i].convert_tokens_to_ids(self.tokenizer)
        torch.save(self.data_objs, args.save_name)
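The constructor above reads only three attributes from `args`. A minimal driver sketch, assuming a hypothetical class name `RaceDataset` (the snippet omits the class header) and purely illustrative argument values:

import argparse

# Hypothetical invocation; only the attributes read by __init__ are supplied.
parser = argparse.ArgumentParser()
parser.add_argument('--bert_model', default='google/electra-base-discriminator')
parser.add_argument('--data_dir', default='./RACE/train')      # directory of *.json samples
parser.add_argument('--save_name', default='./train_data.pt')  # torch.save target
args = parser.parse_args([])

dataset = RaceDataset(args, device='cpu')  # 'RaceDataset' is an assumed name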
Example #2
    def load(cls,
             pretrained_model_name_or_path,
             tokenizer_class=None,
             use_fast=False,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        `pretrained_model_name_or_path` or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
            use the Python one (False).
            Only DistilBERT, BERT, Electra and DPR fast tokenizers are supported.
        :type use_fast: bool
        :param kwargs:
        :return: Tokenizer
        """

        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        # guess tokenizer type from name
        if tokenizer_class is None:
            if "albert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "AlbertTokenizer"
            elif "xlm-roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLMRobertaTokenizer"
            elif "roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "RobertaTokenizer"
            elif 'codebert' in pretrained_model_name_or_path.lower():
                if "mlm" in pretrained_model_name_or_path.lower():
                    raise NotImplementedError(
                        "MLM part of codebert is currently not supported in FARM"
                    )
                else:
                    tokenizer_class = "RobertaTokenizer"
            elif "camembert" in pretrained_model_name_or_path.lower(
            ) or "umberto" in pretrained_model_name_or_path:
                tokenizer_class = "CamembertTokenizer"
            elif "distilbert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "DistilBertTokenizer"
            elif "bert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "BertTokenizer"
            elif "xlnet" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLNetTokenizer"
            elif "electra" in pretrained_model_name_or_path.lower():
                tokenizer_class = "ElectraTokenizer"
            elif "word2vec" in pretrained_model_name_or_path.lower() or \
                    "glove" in pretrained_model_name_or_path.lower() or \
                    "fasttext" in pretrained_model_name_or_path.lower():
                tokenizer_class = "EmbeddingTokenizer"
            elif "minilm" in pretrained_model_name_or_path.lower():
                tokenizer_class = "BertTokenizer"
            elif "dpr-question_encoder" in pretrained_model_name_or_path.lower(
            ):
                tokenizer_class = "DPRQuestionEncoderTokenizer"
            elif "dpr-ctx_encoder" in pretrained_model_name_or_path.lower():
                tokenizer_class = "DPRContextEncoderTokenizer"
            else:
                raise ValueError(
                    f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                    f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                    f"XLMRobertaTokenizer, RobertaTokenizer, CamembertTokenizer, DistilBertTokenizer, "
                    f"BertTokenizer, XLNetTokenizer, ElectraTokenizer, EmbeddingTokenizer, "
                    f"DPRQuestionEncoderTokenizer, or DPRContextEncoderTokenizer.")
            logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        ret = None
        if tokenizer_class == "AlbertTokenizer":
            if use_fast:
                logger.error(
                    'AlbertTokenizerFast is not supported! Using AlbertTokenizer instead.'
                )
                ret = AlbertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = AlbertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif tokenizer_class == "XLMRobertaTokenizer":
            if use_fast:
                logger.error(
                    'XLMRobertaTokenizerFast is not supported! Using XLMRobertaTokenizer instead.'
                )
                ret = XLMRobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = XLMRobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "RobertaTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                logger.error(
                    'RobertaTokenizerFast is not supported! Using RobertaTokenizer instead.'
                )
                ret = RobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = RobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DistilBertTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                ret = DistilBertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DistilBertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "BertTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                ret = BertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = BertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "XLNetTokenizer":
            if use_fast:
                logger.error(
                    'XLNetTokenizerFast is not supported! Using XLNetTokenizer instead.'
                )
                ret = XLNetTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = XLNetTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "ElectraTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                ret = ElectraTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = ElectraTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            if use_fast:
                logger.error(
                    'EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.'
                )
                ret = EmbeddingTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = EmbeddingTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "CamembertTokenizer":
            if use_fast:
                logger.error(
                    'CamembertTokenizerFast is not supported! Using CamembertTokenizer instead.'
                )
                ret = CamembertTokenizer._from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = CamembertTokenizer._from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DPRQuestionEncoderTokenizer" or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
            if use_fast or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
                ret = DPRQuestionEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRQuestionEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DPRContextEncoderTokenizer" or tokenizer_class == "DPRContextEncoderTokenizerFast":
            if use_fast or tokenizer_class == "DPRContextEncoderTokenizerFast":
                ret = DPRContextEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRContextEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        else:
            return ret
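A hedged usage sketch for the loader above, assuming the method lives on FARM's `Tokenizer` class (as in `farm.modeling.tokenization`):

from farm.modeling.tokenization import Tokenizer

# Class inferred from the model name; resolves to BertTokenizerFast here.
tokenizer = Tokenizer.load("bert-base-uncased", use_fast=True)

# Explicit override for names the substring heuristic cannot classify.
tokenizer = Tokenizer.load("./my-local-model", tokenizer_class="RobertaTokenizer")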
Example #3
    def load(cls,
             pretrained_model_name_or_path,
             tokenizer_class=None,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        `pretrained_model_name_or_path` or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param kwargs:
        :return: Tokenizer
        """

        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        # guess tokenizer type from name
        if tokenizer_class is None:
            if "albert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "AlbertTokenizer"
            elif "xlm-roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLMRobertaTokenizer"
            elif "roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "RobertaTokenizer"
            elif "camembert" in pretrained_model_name_or_path.lower(
            ) or "umberto" in pretrained_model_name_or_path:
                tokenizer_class = "CamembertTokenizer"
            elif "distilbert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "DistilBertTokenizer"
            elif "bert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "BertTokenizer"
            elif "xlnet" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLNetTokenizer"
            elif "electra" in pretrained_model_name_or_path.lower():
                tokenizer_class = "ElectraTokenizer"
            elif "word2vec" in pretrained_model_name_or_path.lower() or \
                    "glove" in pretrained_model_name_or_path.lower() or \
                    "fasttext" in pretrained_model_name_or_path.lower():
                tokenizer_class = "EmbeddingTokenizer"
            elif "minilm" in pretrained_model_name_or_path.lower():
                tokenizer_class = "BertTokenizer"
            else:
                raise ValueError(
                    f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                    f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                    f"XLMRobertaTokenizer, RobertaTokenizer, CamembertTokenizer, DistilBertTokenizer, "
                    f"BertTokenizer, XLNetTokenizer, ElectraTokenizer, or EmbeddingTokenizer.")
            logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        ret = None
        if tokenizer_class == "AlbertTokenizer":
            ret = AlbertTokenizer.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif tokenizer_class == "XLMRobertaTokenizer":
            ret = XLMRobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "RobertaTokenizer":
            ret = RobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DistilBertTokenizer":
            ret = DistilBertTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "BertTokenizer":
            ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path,
                                                **kwargs)
        elif tokenizer_class == "XLNetTokenizer":
            ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path,
                                                 keep_accents=True,
                                                 **kwargs)
        elif tokenizer_class == "ElectraTokenizer":
            ret = ElectraTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            ret = EmbeddingTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "CamembertTokenizer":
            ret = CamembertTokenizer._from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        else:
            return ret
Example #4
    def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=False, **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        model config or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
            use the Python one (False).
            Only DistilBERT, BERT, Electra and DPR fast tokenizers are supported.
        :type use_fast: bool
        :param kwargs:
        :return: Tokenizer
        """
        pretrained_model_name_or_path = str(pretrained_model_name_or_path)

        if tokenizer_class is None:
            tokenizer_class = cls._infer_tokenizer_class(pretrained_model_name_or_path)

        logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        ret = None
        if tokenizer_class == "AlbertTokenizer":
            if use_fast:
                logger.error('AlbertTokenizerFast is not supported! Using AlbertTokenizer instead.')
                ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif tokenizer_class == "XLMRobertaTokenizer":
            if use_fast:
                logger.error('XLMRobertaTokenizerFast is not supported! Using XLMRobertaTokenizer instead.')
                ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif "RobertaTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                logger.error('RobertaTokenizerFast is not supported! Using RobertaTokenizer instead.')
                ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif "DistilBertTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                ret = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif "BertTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                ret = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "XLNetTokenizer":
            if use_fast:
                logger.error('XLNetTokenizerFast is not supported! Using XLNetTokenizer instead.')
                ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "ElectraTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            if use_fast:
                logger.error('EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.')
                ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "CamembertTokenizer":
            if use_fast:
                logger.error('CamembertTokenizerFast is not supported! Using CamembertTokenizer instead.')
                ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DPRQuestionEncoderTokenizer" or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
            if use_fast or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
                ret = DPRQuestionEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRQuestionEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DPRContextEncoderTokenizer" or tokenizer_class == "DPRContextEncoderTokenizerFast":
            if use_fast or tokenizer_class == "DPRContextEncoderTokenizerFast":
                ret = DPRContextEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRContextEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        else:
            return ret
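Example #4 factors the name heuristics into `cls._infer_tokenizer_class`, which is not shown here. A plausible reconstruction, assuming it mirrors the substring checks of Examples #2 and #3 (the real FARM helper may also inspect the model config):

def _infer_tokenizer_class(pretrained_model_name_or_path: str) -> str:
    # Hypothetical sketch; order matters, so specific substrings come before
    # generic ones (e.g. "xlm-roberta" before "roberta", "distilbert" before "bert").
    candidates = [
        ("xlm-roberta", "XLMRobertaTokenizer"),
        ("camembert", "CamembertTokenizer"),
        ("distilbert", "DistilBertTokenizer"),
        ("albert", "AlbertTokenizer"),
        ("roberta", "RobertaTokenizer"),
        ("electra", "ElectraTokenizer"),
        ("xlnet", "XLNetTokenizer"),
        ("minilm", "BertTokenizer"),
        ("bert", "BertTokenizer"),
    ]
    name = pretrained_model_name_or_path.lower()
    for substring, tokenizer_class in candidates:
        if substring in name:
            return tokenizer_class
    raise ValueError(
        f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'.")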
Example #5
    def analyze(self):
        # create the ELECTRA config object
        electra_config = ElectraConfig.from_pretrained(
            os.path.join(self.config["model_dir_path"],
                         "checkpoint-{}".format(self.config["checkpoint"])),
            num_labels=self.config["senti_labels"],
            cache_dir=None)

        # create the ELECTRA tokenizer object
        electra_tokenizer = ElectraTokenizer.from_pretrained(
            os.path.join(self.config["model_dir_path"],
                         "checkpoint-{}".format(self.config["checkpoint"])),
            do_lower_case=False,
            cache_dir=None)

        # create the ELECTRA model object
        electra_model = ElectraForSequenceClassification.from_pretrained(
            os.path.join(self.config["model_dir_path"],
                         "checkpoint-{}".format(self.config["checkpoint"])),
            config=electra_config,
            lstm_hidden=self.config['lstm_hidden'],
            label_emb_size=self.config['lstm_hidden'] * 2,
            score_emb_size=self.config['lstm_hidden'] * 2,
            score_size=self.config['score_labels'],
            num_layer=self.config['lstm_num_layer'],
            bilstm_flag=self.config['bidirectional_flag'],
            cache_dir=self.config["cache_dir_path"])

        electra_model.cuda()

        # read the evaluation data
        test_datas = preprocessing.read_data(
            file_path=self.config["analyze_data_path"],
            mode=self.config["mode"])

        # preprocess the evaluation data
        test_dataset = preprocessing.convert_data2dataset(
            datas=test_datas,
            tokenizer=electra_tokenizer,
            max_length=self.config["max_length"],
            labels=self.config["senti_labels"],
            score_labels=self.config["score_labels"],
            mode=self.config["mode"])

        # create a DataLoader to draw the evaluation data in batches
        test_sampler = SequentialSampler(test_dataset)
        test_dataloader = DataLoader(test_dataset,
                                     sampler=test_sampler,
                                     batch_size=100)

        electra_model.eval()

        # accuracy on the evaluation data, plus model inputs, predictions, and gold labels
        test_accuracy, total_input_ids, total_predicts, total_corrects = self.do_analyze(
            electra_model=electra_model,
            test_dataloader=test_dataloader,
            mode=self.config["mode"])

        print("test_accuracy : {}\n".format(round(test_accuracy, 4)))

        print("테스트 데이터 10개에 대하여 모델 출력과 정답을 비교")
        # 10개의 평가 케이스에 대하여 모델 출력과 정답 비교
        self.show_result(total_input_ids=total_input_ids[:10],
                         total_predicts=total_predicts[:10],
                         total_corrects=total_corrects[:10],
                         tokenizer=electra_tokenizer)
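`analyze` pulls everything from `self.config`. The keys below are the ones the method actually reads; the values are purely illustrative assumptions, not the authors' configuration:

# Illustrative config sketch for the analyze() method above.
config = {
    "model_dir_path": "./model",        # contains checkpoint-<N> directories
    "checkpoint": 24000,                # picks ./model/checkpoint-24000
    "senti_labels": 2,                  # num_labels for ElectraConfig
    "score_labels": 5,
    "lstm_hidden": 256,
    "lstm_num_layer": 1,
    "bidirectional_flag": True,
    "cache_dir_path": "./cache",
    "analyze_data_path": "./data/analyze.txt",
    "max_length": 128,
    "mode": "analyze",
}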
Example #6
    def train(self):
        #########################################################################################################################################
        # create the ELECTRA config object
        electra_config = ElectraConfig.from_pretrained(
            "/home/mongjin/KuELECTRA_base",
            num_labels=self.config["senti_labels"],
            cache_dir=self.config["cache_dir_path"])

        # create the ELECTRA tokenizer object
        electra_tokenizer = ElectraTokenizer.from_pretrained(
            "/home/mongjin/KuELECTRA_base",
            do_lower_case=False,
            cache_dir=self.config["cache_dir_path"])

        # create the ELECTRA model object
        electra_model = ElectraForSequenceClassification.from_pretrained(
            "/home/mongjin/KuELECTRA_base",
            config=electra_config,
            lstm_hidden=self.config['lstm_hidden'],
            label_emb_size=self.config['lstm_hidden'] * 2,
            score_emb_size=self.config['lstm_hidden'] * 2,
            score_size=self.config['score_labels'],
            num_layer=self.config['lstm_num_layer'],
            bilstm_flag=self.config['bidirectional_flag'],
            cache_dir=self.config["cache_dir_path"],
            from_tf=True)
        #########################################################################################################################################

        electra_model.cuda()

        # read the training data
        train_datas = preprocessing.read_data(
            file_path=self.config["train_data_path"], mode=self.config["mode"])

        # preprocess the training data
        train_dataset = preprocessing.convert_data2dataset(
            datas=train_datas,
            tokenizer=electra_tokenizer,
            max_length=self.config["max_length"],
            labels=self.config["senti_labels"],
            score_labels=self.config["score_labels"],
            mode=self.config["mode"])

        # create a DataLoader to draw the training data in batches
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=self.config["batch_size"])

        # read the evaluation data
        test_datas = preprocessing.read_data(
            file_path=self.config["test_data_path"], mode=self.config["mode"])

        # preprocess the evaluation data
        test_dataset = preprocessing.convert_data2dataset(
            datas=test_datas,
            tokenizer=electra_tokenizer,
            max_length=self.config["max_length"],
            labels=self.config["senti_labels"],
            score_labels=self.config["score_labels"],
            mode=self.config["mode"])

        # create a DataLoader to draw the evaluation data in batches
        test_sampler = SequentialSampler(test_dataset)
        test_dataloader = DataLoader(test_dataset,
                                     sampler=test_sampler,
                                     batch_size=100)

        # total number of optimizer update steps (in batches)
        t_total = (len(train_dataloader) // self.config["gradient_accumulation_steps"]
                   * self.config["epoch"])

        # optimizer for model training; no weight decay for bias and LayerNorm weights
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer = AdamW([
            {
                'params': [p for n, p in electra_model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'lr': 5e-5,
                'weight_decay': self.config['weight_decay'],
            },
            {
                'params': [p for n, p in electra_model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'lr': 5e-5,
                'weight_decay': 0.0,
            },
        ])
        # optimizer = AdamW(lan.parameters(), lr=self.config['learning_rate'], eps=self.config['adam_epsilon'])
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.config["warmup_steps"],
            num_training_steps=t_total)

        optimizer_path = os.path.join(self.config["model_dir_path"], "optimizer.pt")
        scheduler_path = os.path.join(self.config["model_dir_path"], "scheduler.pt")
        if os.path.isfile(optimizer_path) and os.path.isfile(scheduler_path):
            # restore optimizer and scheduler state from a previous run
            optimizer.load_state_dict(torch.load(optimizer_path))
            scheduler.load_state_dict(torch.load(scheduler_path))
            print("#######################     Success Load Model     ###########################")

        global_step = 0
        electra_model.zero_grad()
        max_test_accuracy = 0
        for epoch in range(self.config["epoch"]):
            electra_model.train()

            # accuracy and average loss on the training data
            train_accuracy, average_loss, global_step, score_acc = self.do_train(
                electra_model=electra_model,
                optimizer=optimizer,
                scheduler=scheduler,
                train_dataloader=train_dataloader,
                epoch=epoch + 1,
                global_step=global_step)

            print("train_accuracy : {}\taverage_loss : {}\n".format(
                round(train_accuracy, 4), round(average_loss, 4)))
            print("train_score_accuracy :", "{:.6f}".format(score_acc))

            electra_model.eval()

            # accuracy on the evaluation data
            test_accuracy, score_acc = self.do_evaluate(
                electra_model=electra_model,
                test_dataloader=test_dataloader,
                mode=self.config["mode"])

            print("test_accuracy : {}\n".format(round(test_accuracy, 4)))
            print("test_score_accuracy :", "{:.6f}".format(score_acc))

            # save the model files when the current accuracy beats the previous best
            if max_test_accuracy < test_accuracy:
                max_test_accuracy = test_accuracy

                output_dir = os.path.join(self.config["model_dir_path"],
                                          "checkpoint-{}".format(global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)

                electra_config.save_pretrained(output_dir)
                electra_tokenizer.save_pretrained(output_dir)
                electra_model.save_pretrained(output_dir)
                # torch.save(lan.state_dict(), os.path.join(output_dir, "lan.pt"))
                torch.save(optimizer.state_dict(),
                           os.path.join(output_dir, "optimizer.pt"))
                torch.save(scheduler.state_dict(),
                           os.path.join(output_dir, "scheduler.pt"))

            print("max_test_accuracy :",
                  "{:.6f}".format(round(max_test_accuracy, 4)))
Example #7
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
import os
from bs4 import BeautifulSoup
'''Official evaluation script for KorQuAD 2.0'''
'''This script is based on the SQuAD v1.1 evaluation script from https://rajpurkar.github.io/SQuAD-explorer/.'''
from transformers.tokenization_electra import ElectraTokenizer
tokenizer = ElectraTokenizer.from_pretrained(
    "../../baseline/checkpoint-24000",
    do_lower_case=False,
)


def normalize_answer(s):
    def tag_clean(t):
        # strip HTML tags, keeping only the text
        return BeautifulSoup(t, "html.parser").get_text()

    def remove_(text):
        ''' remove unnecessary symbols '''
        text = re.sub("'", " ", text)
        text = re.sub('"', " ", text)
        text = re.sub('《', " ", text)
        text = re.sub('》', " ", text)
        text = re.sub('<', " ", text)
        text = re.sub('>', " ", text)
        text = re.sub('〈', " ", text)