Example n. 1
def download_model(outputdir_tokenizer: str, outputdir_pretrained: str):
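    # ELECTRA uses the same WordPiece vocabulary as BERT, so loading the
    # tokenizer from "bert-base-uncased" matches the ELECTRA checkpoint below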
    slow_tokenizer = ElectraTokenizer.from_pretrained("bert-base-uncased")
    print("Save tokenizer to ", outputdir_tokenizer)
    slow_tokenizer.save_pretrained(outputdir_tokenizer)

    model = ElectraForQuestionAnswering.from_pretrained(
        "google/electra-base-discriminator")
    model.save_pretrained(outputdir_pretrained)
    print("Save model electra pretrained to", outputdir_pretrained)
 def create_and_check_electra_for_question_answering(
     self,
     config,
     input_ids,
     token_type_ids,
     input_mask,
     sequence_labels,
     token_labels,
     choice_labels,
     fake_token_labels,
 ):
     model = ElectraForQuestionAnswering(config=config)
     model.to(torch_device)
     model.eval()
     result = model(
         input_ids,
         attention_mask=input_mask,
         token_type_ids=token_type_ids,
         start_positions=sequence_labels,
         end_positions=sequence_labels,
     )
     self.parent.assertEqual(result.start_logits.shape,
                             (self.batch_size, self.seq_length))
     self.parent.assertEqual(result.end_logits.shape,
                             (self.batch_size, self.seq_length))
Example n. 3
 def create_and_check_electra_for_question_answering(
     self,
     config,
     input_ids,
     token_type_ids,
     input_mask,
     sequence_labels,
     token_labels,
     choice_labels,
     fake_token_labels,
 ):
     model = ElectraForQuestionAnswering(config=config)
     model.to(torch_device)
     model.eval()
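     # NOTE: the tuple unpacking below assumes an older transformers release
     # (or return_dict=False); newer releases return a QuestionAnsweringModelOutput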
     loss, start_logits, end_logits = model(
         input_ids,
         attention_mask=input_mask,
         token_type_ids=token_type_ids,
         start_positions=sequence_labels,
         end_positions=sequence_labels,
     )
     result = {
         "loss": loss,
         "start_logits": start_logits,
         "end_logits": end_logits,
     }
     self.parent.assertListEqual(list(result["start_logits"].size()),
                                 [self.batch_size, self.seq_length])
     self.parent.assertListEqual(list(result["end_logits"].size()),
                                 [self.batch_size, self.seq_length])
     self.check_loss_output(result)
Example n. 4
 def create_model(self, transformer="longformer"):
     if transformer == "distilbert":
         from transformers import DistilBertForQuestionAnswering
         self.model = DistilBertForQuestionAnswering.from_pretrained(
             "distilbert-base-uncased")
     elif transformer == "bert":
         from transformers import BertForQuestionAnswering
         self.model = BertForQuestionAnswering.from_pretrained(
             "bert-base-uncased")
     elif transformer == "roberta":
         from transformers import RobertaForQuestionAnswering
         self.model = RobertaForQuestionAnswering.from_pretrained(
             "roberta-base")
     elif transformer == "roberta_squad":
         from transformers import RobertaForQuestionAnswering
         self.model = RobertaForQuestionAnswering.from_pretrained(
             "deepset/roberta-base-squad2")
     elif transformer == "longformer":
         from transformers import LongformerForQuestionAnswering
         self.model = LongformerForQuestionAnswering.from_pretrained(
             "allenai/longformer-base-4096")
     elif transformer == "bart":
         from transformers import BartForQuestionAnswering
         self.model = BartForQuestionAnswering.from_pretrained(
             "facebook/bart-base")
     elif transformer == "electra":
         from transformers import ElectraForQuestionAnswering
         self.model = ElectraForQuestionAnswering.from_pretrained(
             "google/electra-small-discriminator")
     else:
         print(
             "The model you chose is not available in this version. You can either modify the code or manually overwrite self.model."
         )
         print(
             "The available choices are 'distilbert', 'bert', 'roberta', 'roberta_squad', 'longformer', 'bart' and 'electra'."
         )
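
A minimal usage sketch for the selector above (QAWrapper is a hypothetical stand-in for the enclosing class, which the snippet does not show):

wrapper = QAWrapper()
wrapper.create_model(transformer="electra")  # loads google/electra-small-discriminator
# wrapper.model now behaves like any transformers QA model and accepts the
# usual input_ids / attention_mask tensors, as in the other examples here.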
Example n. 5
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected")

    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected")

    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the .json files for the task."
        +
        "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help=
        "The input training file. If a data dir is specified, will look for the file there. "
        "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help=
        "The input evaluation file. If a data dir is specified, will look for the file there. "
        "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where to store the pre-trained models downloaded from s3",
    )

    parser.add_argument(
        "--version_2_with_negative",
        action="store_true",
        help=
        "If true, the SQuAD examples contain some that do not have an answer.",
    )
    parser.add_argument(
        "--null_score_diff_threshold",
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold, predict null.",
    )

    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        # NOTE: default=True combined with action="store_true" made this flag
        # always on; the default is dropped so the flag actually controls it
        action="store_true",
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json output file.",
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.",
    )
    parser.add_argument(
        "--verbose_logging",
        action="store_true",
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.",
    )

    parser.add_argument("--logging_steps",
                        type=int,
                        default=100,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=10000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")

    parser.add_argument(
        "--threads",
        type=int,
        default=1,
        help="multiple threads for converting example to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################

    args = parser.parse_args()

    # for NSML
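    # note: --data_dir must be supplied here, otherwise
    # os.path.join(DATASET_PATH, None) raises a TypeError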
    args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
        logger.warning('IF args.n_gpu : ' + str(args.n_gpu) + ' / device : ' +
                       str(device) + '\n')
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
        logger.warning('ELSE args.n_gpu : ' + str(args.n_gpu) +
                       ' / device : ' + str(device) + '\n')

    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log')
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    logger.warning("Model Loading ..")

    config = ElectraConfig.from_pretrained(args.model_name_or_path)
    model = ElectraForQuestionAnswering.from_pretrained(
        args.model_name_or_path, config=config)
    tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path,
                                                 do_lower_case=False)

    logger.warning("Model Loading Completed")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model, tokenizer, args)
        if args.pause:
            nsml.paused(scope=locals())
    ################################

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is
    # set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running
    # `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)
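
A hedged invocation sketch for the script above (the script name and data files are illustrative only, not from the source):

python run_squad_electra.py \
    --model_type electra \
    --model_name_or_path google/electra-base-discriminator \
    --data_dir squad \
    --output_dir ./outputs \
    --do_train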
Example n. 6
from flask import Flask, request, render_template
import torch
from transformers import ElectraTokenizer, ElectraForQuestionAnswering
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from tokenizers import BertWordPieceTokenizer
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

from reader import get_answer

model = ElectraForQuestionAnswering.from_pretrained("Reader/electra_QA").to(
    device=torch.device('cpu'))
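# the fine-tuned weights below (a local checkpoint assumed by this snippet)
# overwrite the parameters loaded by from_pretrained above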
model.load_state_dict(
    torch.load('Reader/weight_electra/weights_3.pth',
               map_location=torch.device('cpu')))
model.eval()
tokenizer = BertWordPieceTokenizer("Reader/electra_base_uncased/vocab.txt",
                                   lowercase=True)

torch.set_grad_enabled(False)
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base")
q_encoder = DPRQuestionEncoder.from_pretrained(
    "Retrieval/question_encoder").to(device=torch.device('cpu'))
q_encoder.eval()

# ctx_tokenizer = BertWordPieceTokenizer("ctx_tokenizer/vocab.txt", lowercase=True)
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base")
ctx_encoder = DPRContextEncoder.from_pretrained("Retrieval/ctx_encoder").to(
    device=torch.device('cpu'))

Example n. 7
for conf in configurations:

    conf_id = '{}_config_{}_{}_{}'.format(j, *conf)
    print(
        'Run [{}] --learning rate: {}, --hidden size: {}, --hidden layer: {}'.
        format(j, *conf))

    # Initializing an ELECTRA electra-base-uncased style configuration
    electra_conf = ElectraConfig(hidden_size=conf[1],
                                 num_hidden_layers=conf[2])
    # Initializing a model from the electra-base-uncased style configuration
    model = ElectraForQuestionAnswering(electra_conf)
    configuration = model.config

    model.to(device)
    model.train()

    train_loader = DataLoader(train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True)
    learning_rate = conf[0]
    optim = AdamW(model.parameters(), lr=learning_rate)
    loss_records = []
    bi = 0

    for epoch in tqdm(range(EPOCHS)):
        for batch in train_loader:
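            # (hypothetical loop body: the snippet ends here in the source; a
            # minimal step, assuming dict-style batches with these keys, would be)
            optim.zero_grad()
            outputs = model(batch['input_ids'].to(device),
                            attention_mask=batch['attention_mask'].to(device),
                            start_positions=batch['start_positions'].to(device),
                            end_positions=batch['end_positions'].to(device))
            loss = outputs[0]  # QA loss when start/end positions are given
            loss.backward()
            optim.step()
            loss_records.append(loss.item())
            bi += 1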
Example n. 8
    def __init__(self) -> None:
        self.lists = {}

        # M-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
            'bert-base-multilingual-cased')
        self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
            'bert-base-multilingual-cased').eval()
        self.lists["M-BERT"] = {
            "Tokenizer": self.bert_multilingual_tokenizer,
            "Model": self.bert_multilingual_model
        }
        print("====================================")
        print("[BERT] Google Multilingual BERT loaded")
        print("====================================")

        # KR-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.krbert_tokenizer = BertTokenizerFast.from_pretrained(
            'snunlp/KR-Medium')
        self.krbert_model = BertForMaskedLM.from_pretrained(
            'snunlp/KR-Medium').eval()
        self.lists["KR-Medium"] = {
            "Tokenizer": self.krbert_tokenizer,
            "Model": self.krbert_model
        }
        print("====================================")
        print("[BERT] KR-BERT loaded")
        print("====================================")

        # BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/bert-kor-base')
        self.bert_kor_model = BertForMaskedLM.from_pretrained(
            'kykim/bert-kor-base').eval()
        self.lists["bert-kor-base"] = {
            "Tokenizer": self.bert_kor_tokenizer,
            "Model": self.bert_kor_model
        }
        print("====================================")
        print("[BERT] BERT-kor-base loaded")
        print("====================================")

        # ALBERT
        from transformers import AlbertForMaskedLM
        self.albert_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/albert-kor-base')
        self.albert_model = AlbertForMaskedLM.from_pretrained(
            'kykim/albert-kor-base').eval()
        self.lists["albert-kor-base"] = {
            "Tokenizer": self.albert_tokenizer,
            "Model": self.albert_model
        }
        print("====================================")
        print("[BERT] ALBERT-kor-base loaded")
        print("====================================")

        # XLM-Roberta
        from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
        self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
            'xlm-roberta-base')
        self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
            'xlm-roberta-base').eval()
        self.lists["xlm-roberta-base"] = {
            "Tokenizer": self.xlmroberta_tokenizer,
            "Model": self.xlmroberta_model
        }
        print("====================================")
        print("[BERT] XLM-Roberta-kor loaded")
        print("====================================")

        from transformers import BertTokenizerFast, EncoderDecoderModel
        self.tokenizer_bertshared = BertTokenizerFast.from_pretrained(
            "kykim/bertshared-kor-base")
        self.bertshared_model = EncoderDecoderModel.from_pretrained(
            "kykim/bertshared-kor-base")
        self.lists["bertshared-kor-base"] = {
            "Tokenizer": self.tokenizer_bertshared,
            "Model": self.bertshared_model
        }
        print("====================================")
        print("[Seq2seq + BERT] bertshared-kor-base loaded")
        print("====================================")

        # gpt3-kor-small_based_on_gpt2
        from transformers import BertTokenizerFast, GPT2LMHeadModel
        self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.model_gpt3 = GPT2LMHeadModel.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.lists["gpt3-kor-small_based_on_gpt2"] = {
            "Tokenizer": self.tokenizer_gpt3,
            "Model": self.model_gpt3
        }
        print("====================================")
        print("[GPT3] gpt3-small-based-on-gpt2 loaded")
        print("====================================")

        # electra-base-kor
        from transformers import ElectraTokenizerFast, ElectraModel
        self.tokenizer_electra = ElectraTokenizerFast.from_pretrained(
            "kykim/electra-kor-base")
        self.electra_model = ElectraModel.from_pretrained(
            "kykim/electra-kor-base")
        self.lists["electra-kor-base"] = {
            "Tokenizer": self.tokenizer_electra,
            "Model": self.electra_model
        }
        print("====================================")
        print("[ELECTRA] electra-kor-base loaded")
        print("====================================")

        from transformers import ElectraTokenizerFast, ElectraForQuestionAnswering
        self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.lists["electra-kor-QA"] = {
            "Tokenizer": self.electra_tokenizer_QA,
            "Model": self.electra_model_QA
        }
        print("====================================")
        print("[ELECTRA] koelectra-base-v3-finetuned-korquad loaded")
        print("====================================")
Example n. 9
def train_model(dir_tokenizer: str = None,
                dir_model: str = None,
                dir_data: str = None):
    batch_size = 16
    epochs = 10

    raw_train_data, raw_eval_data = load_data(dir_data)
    train_squad_examples = create_squad_examples(raw_train_data,
                                                 "Creating training points",
                                                 dir_tokenizer)
    x_train, y_train = create_inputs_targets(train_squad_examples)

    eval_squad_examples = create_squad_examples(raw_eval_data,
                                                "Creating evaluation points",
                                                dir_tokenizer)
    x_eval, y_eval = create_inputs_targets(eval_squad_examples)

    train_data = TensorDataset(torch.tensor(x_train[0], dtype=torch.int64),
                               torch.tensor(x_train[1], dtype=torch.float),
                               torch.tensor(x_train[2], dtype=torch.int64),
                               torch.tensor(y_train[0], dtype=torch.int64),
                               torch.tensor(y_train[1], dtype=torch.int64))
    print(f"{len(train_data)} training points created.")
    train_sampler = RandomSampler(train_data)
    train_data_loader = DataLoader(train_data,
                                   sampler=train_sampler,
                                   batch_size=batch_size)

    eval_data = TensorDataset(torch.tensor(x_eval[0], dtype=torch.int64),
                              torch.tensor(x_eval[1], dtype=torch.float),
                              torch.tensor(x_eval[2], dtype=torch.int64),
                              torch.tensor(y_eval[0], dtype=torch.int64),
                              torch.tensor(y_eval[1], dtype=torch.int64))
    print(f"{len(eval_data)} evaluation points created.")
    eval_sampler = SequentialSampler(eval_data)
    validation_data_loader = DataLoader(eval_data,
                                        sampler=eval_sampler,
                                        batch_size=batch_size)

    model = ElectraForQuestionAnswering.from_pretrained(dir_model)
    param_optimizer = list(model.named_parameters())
    # 'gamma'/'beta' are legacy pytorch-pretrained-bert names; current
    # transformers models expose these as 'LayerNorm.weight'/'LayerNorm.bias'
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        # torch.optim reads 'weight_decay'; the original 'weight_decay_rate'
        # key is silently ignored by the optimizer
        'weight_decay': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = torch.optim.Adam(lr=1e-5,
                                 betas=(0.9, 0.98),
                                 eps=1e-9,
                                 params=optimizer_grouped_parameters)
    for epoch in range(1, epochs + 1):
        # ============================================ TRAINING ============================================================
        print("Training epoch ", str(epoch))
        training_pbar = tqdm(total=len(train_data),
                             position=0,
                             leave=True,
                             file=sys.stdout,
                             bar_format="{l_bar}%s{bar}%s{r_bar}" %
                             (Fore.GREEN, Fore.RESET))
        model.train()
        tr_loss = 0
        nb_tr_steps = 0
        for step, batch in enumerate(train_data_loader):
            batch = tuple(t for t in batch)  # no-op here; would be t.to(device) on GPU
            input_word_ids, input_mask, input_type_ids, start_token_idx, end_token_idx = batch
            optimizer.zero_grad()
            output = model(input_ids=input_word_ids,
                           attention_mask=input_mask,
                           token_type_ids=input_type_ids,
                           start_positions=start_token_idx,
                           end_positions=end_token_idx)
            # print(loss)
            loss = output[0]
            loss.backward()
            optimizer.step()
            tr_loss += loss.item()
            nb_tr_steps += 1
            training_pbar.update(input_word_ids.size(0))
        training_pbar.close()
        print(f"\nTraining loss={tr_loss / nb_tr_steps:.4f}")
        torch.save(model.state_dict(), "./weights_" + str(epoch) + ".pth")
        # ============================================ VALIDATION ==========================================================
        validation_pbar = tqdm(total=len(eval_data),
                               position=0,
                               leave=True,
                               file=sys.stdout,
                               bar_format="{l_bar}%s{bar}%s{r_bar}" %
                               (Fore.BLUE, Fore.RESET))
        model.eval()
        eval_examples_no_skip = [
            ex for ex in eval_squad_examples if not ex.skip
        ]
        currentIdx = 0
        count = 0
        for batch in validation_data_loader:
            batch = tuple(t for t in batch)
            input_word_ids, input_mask, input_type_ids, start_token_idx, end_token_idx = batch
            with torch.no_grad():
                output_ = model(input_ids=input_word_ids,
                                attention_mask=input_mask,
                                token_type_ids=input_type_ids)
                # print(output_.start_logits)
                start_logits, end_logits = output_.start_logits, output_.end_logits
                pred_start = start_logits.detach().cpu().numpy()
                pred_end = end_logits.detach().cpu().numpy()

            for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
                squad_eg = eval_examples_no_skip[currentIdx]
                currentIdx += 1
                offsets = squad_eg.context_token_to_char
                start = np.argmax(start)
                end = np.argmax(end)
                if start >= len(offsets):
                    continue
                pred_char_start = offsets[start][0]
                if end < len(offsets):
                    pred_char_end = offsets[end][1]
                    pred_ans = squad_eg.context[pred_char_start:pred_char_end]
                else:
                    pred_ans = squad_eg.context[pred_char_start:]
                normalized_pred_ans = normalize_text(pred_ans)
                normalized_true_ans = [
                    normalize_text(_) for _ in squad_eg.all_answers
                ]
                if normalized_pred_ans in normalized_true_ans:
                    count += 1
            validation_pbar.update(input_word_ids.size(0))
        acc = count / len(y_eval[0])
        validation_pbar.close()
        print(f"\nEpoch={epoch}, exact match score={acc:.2f}")
Example n. 10
from transformers import ElectraTokenizer, ElectraForQuestionAnswering, pipeline
from pprint import pprint

tokenizer = ElectraTokenizer.from_pretrained(
    "monologg/koelectra-small-v2-distilled-korquad-384")
model = ElectraForQuestionAnswering.from_pretrained(
    "monologg/koelectra-small-v2-distilled-korquad-384")

qa = pipeline("question-answering", tokenizer=tokenizer, model=model)
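
# Korean strings below, translated: the question asks "Who is the president of
# South Korea?"; the context reports that President Moon Jae-in attended the
# 'Deview 2019' event at COEX in Seoul on the 28th, encouraged young
# developers, and presented the government's basic plan for AI.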

pprint(
    qa({
        "question":
        "한국의 대통령은 누구인가?",
        "context":
        "문재인 대통령은 28일 서울 코엑스에서 열린 ‘데뷰 (Deview) 2019’ 행사에 참석해 젊은 개발자들을 격려하면서 우리 정부의 인공지능 기본구상을 내놓았다.",
    }))
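
# The QA pipeline returns a dict with 'score', 'start', 'end' and 'answer'
# keys; 'answer' is the predicted span extracted from the context.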
Example n. 11
def load_model_tokenizer(path):
    return ElectraForQuestionAnswering.from_pretrained(path), \
           ElectraTokenizerFast.from_pretrained(path)
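
A minimal usage sketch (the checkpoint id is illustrative; any local path or hub id holding a fine-tuned ELECTRA QA model plus tokenizer works):

model, tokenizer = load_model_tokenizer("monologg/koelectra-base-v3-finetuned-korquad")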