Example #1
def load_and_predict(data_dir, model_type, pretrain_model):
    if model_type == 'bert_japanese':
        model = BertForQuestionAnswering.from_pretrained(
            'cl-tohoku/bert-base-japanese')
        tokenizer = BertJapaneseTokenizer.from_pretrained(
            'cl-tohoku/bert-base-japanese')

    if model_type == 'bert_multilingual':
        model = BertForQuestionAnswering.from_pretrained(
            'bert-base-multilingual-cased')
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', tokenize_chinese_chars=False)

    if model_type == 'albert':
        model = AlbertForQuestionAnswering.from_pretrained(
            'ALINEAR/albert-japanese-v2')
        tokenizer = AlbertTokenizer.from_pretrained(
            'ALINEAR/albert-japanese-v2')

    test_data = TestData(data_dir, TAG)
    testset = QADataset(test_data.examples, "test", tokenizer=tokenizer)
    testloader = DataLoader(testset, batch_size=4, collate_fn=collate_fn)

    model = model.to(device)
    model.load_state_dict(torch.load(pretrain_model))

    prediction = predict(model, testloader, device, tokenizer)
    prediction = func(data_dir, prediction)
    print('finish loading and predicting from {}!'.format(pretrain_model))
    return prediction  #prediction dictionary
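A hedged usage sketch for load_and_predict; the paths are placeholders, and TAG, TestData, QADataset, collate_fn, predict, func and device are assumed to be defined elsewhere in the original project.

predictions = load_and_predict(
    data_dir='data/',                          # hypothetical data directory
    model_type='bert_japanese',
    pretrain_model='weights/best_model.pt')    # hypothetical checkpoint path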
Example #2
def main():
    # Set seed
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Setup CUDA, GPU & distributed training
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    config = BertConfig.from_pretrained('bert-base-cased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=True)
    # The BertForQuestionAnswering module adds a single untrained head, qa_outputs: Linear(hidden_size, 2), on top of the pretrained BERT-base encoder.
    model = BertForQuestionAnswering.from_pretrained('bert-base-cased', config=config)

    model.to(device)

    max_seq_length=384

    train_dataset = load_and_cache_examples(tokenizer, is_training=True)[0]

    # Training
    global_step, ave_loss = train(train_dataset, model, tokenizer)
    print(" global_step = %s, average loss = %s", global_step, tr_loss / global_step)

    # Save the trained model and the tokenizer
    output_dir = 'output/'

    # Create output directory if needed
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving model checkpoint to %s", output_dir)
    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Load a trained model and vocabulary that you have fine-tuned
    model = BertForQuestionAnswering.from_pretrained(output_dir)
    tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=True)
    model.to(device)

    # Evaluate
    results = evaluate(model, tokenizer)
    print("Results: {}".format(results))

    return results
Example #3
def load_qa_model():
    model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    #Tokenizer
    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    return model, tokenizer
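A minimal inference sketch built on the loader above; the question and context strings are illustrative, and it assumes a transformers version whose model call returns an output object with start_logits/end_logits.

import torch

model, tokenizer = load_qa_model()
model.eval()

question = "Who developed BERT?"    # illustrative only
context = "BERT was developed by researchers at Google and released in 2018."

inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

start = torch.argmax(outputs.start_logits)
end = torch.argmax(outputs.end_logits) + 1
print(tokenizer.decode(inputs["input_ids"][0][start:end]))
# expected to print something like: "researchers at google"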
Example #4
def download_model():
    if (not Path("model_downloaded").is_file()
            or not Path("usecase_indicator.h5").is_file()):

        url = "https://b0ykepubbucket.s3-eu-west-1.amazonaws.com/usecase_indicator.h5"
        r = requests.get(url, stream=True)
        chunk_progress = 0
        with open("usecase_indicator.h5", "wb") as modelfile:
            for chunk in r.iter_content(chunk_size=8388608):
                if chunk:
                    modelfile.write(chunk)
                    chunk_progress += 1
                    print(
                        f"Downloading model 1/2 in background: {chunk_progress*8}MB"
                    )
                    sys.stdout.flush()
            else:
                # for/else: runs only when the download loop finishes without break,
                # i.e. after all chunks have been written
                open("model_downloaded", "w").close()

    if (not Path("modelqna_downloaded").is_file()
            or not Path("./BertLSquad/pytorch_model.bin").is_file()):
        print(f"Started model 2/2 download in background")
        sys.stdout.flush()
        model = BertForQuestionAnswering.from_pretrained(
            'bert-large-uncased-whole-word-masking-finetuned-squad')
        model.save_pretrained("./BertLSquad")
        open("modelqna_downloaded", "w").close()
        print("Model 2/2 download completed")
        sys.stdout.flush()

    return
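download_model() saves the QA weights with save_pretrained("./BertLSquad") but not the tokenizer; a minimal sketch of loading both afterwards (fetching the tokenizer from the Hub here is an assumption, not something the original code does).

from transformers import BertForQuestionAnswering, BertTokenizer

qa_model = BertForQuestionAnswering.from_pretrained("./BertLSquad")
qa_tokenizer = BertTokenizer.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad")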
Example #5
def train():
    with msg.loading("   Loading BERT"):
        TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')
        MODEL = BertForQuestionAnswering.from_pretrained(
            'bert-large-uncased-whole-word-masking-finetuned-squad')
    msg.good("   BERT loaded")

    articles_dir = os.path.join(SCRIPT_PATH,
                                '../data/raw/CORD-19-research-challenge/')
    articles_folders = [
        'biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/',
        'comm_use_subset/comm_use_subset/pdf_json/',
        'comm_use_subset/comm_use_subset/pmc_json/',
        'noncomm_use_subset/noncomm_use_subset/pdf_json/',
        'noncomm_use_subset/noncomm_use_subset/pmc_json/',
        'custom_license/custom_license/pdf_json/',
        'custom_license/custom_license/pmc_json/'
    ]
    meta_path = os.path.join(articles_dir, 'metadata.csv')

    with msg.loading("   Loading publications"):
        start = time.time()
        data_text, index2paperID, index2paperPath = get_data_texts(
            articles_dir, articles_folders, meta_path)
    msg.good("   Publications loaded - Took {:.2f}s".format(time.time() -
                                                            start))

    covid_q = QuestionCovid(TOKENIZER, MODEL, index2paperID, index2paperPath)
    covid_q.fit(data_text)
    return covid_q
Example #6
 def _get_question_answering(self):
     """
     Initializes the BertForQuestionAnswering transformer.
     NOTE: This uses the bert-large-uncased-whole-word-masking-finetuned-squad checkpoint (fine-tuned on SQuAD) for best results.
     """
     self.qa = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
     self.qa.eval()
Example #7
 def __init__(self):
     self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     self.model_name = "nyust-eb210/braslab-bert-drcd-384"
     self.tokenizer = BertTokenizerFast.from_pretrained(self.model_name)
     self.model = BertForQuestionAnswering.from_pretrained(self.model_name).to(
         self.device
     )
Example #8
def answergen_bert(context, question):
    tokenizer = BertTokenizer.from_pretrained(
        'csarron/bert-base-uncased-squad-v1')
    model = BertForQuestionAnswering.from_pretrained(
        'csarron/bert-base-uncased-squad-v1')
    #tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad',return_token_type_ids = True)
    #model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

    encoding = tokenizer.encode_plus(question, context)

    input_ids, attention_mask = encoding["input_ids"], encoding[
        "attention_mask"]

    # return_dict=False makes the model return a plain (start_logits, end_logits) tuple
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     attention_mask=torch.tensor([attention_mask]),
                                     return_dict=False)

    # argmax over positions 1: skips the [CLS] logit; add 1 to map back to input_ids indices
    answer_start = torch.argmax(start_scores[0, 1:]) + 1
    answer_end = torch.argmax(end_scores[0, 1:]) + 1
    ans_tokens = input_ids[answer_start:answer_end + 1]
    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens)

    print("\nQuestion ", question)
    #print ("\nAnswer Tokens: ")
    #print (answer_tokens)

    answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)
    print("\nAnswer : ", answer_tokens_to_string)
    return answer_tokens_to_string
Example #9
def get_answer_using_bert(question, reference_text):
    # Load fine-tuned model for QA
    bert_model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')

    # Load Vocab as well
    bert_tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')

    # Apply bert_tokenizer on input text
    input_ids = bert_tokenizer.encode(question, reference_text)
    input_tokens = bert_tokenizer.convert_ids_to_tokens(input_ids)

    # Search index of first [SEP] token
    sep_location = input_ids.index(bert_tokenizer.sep_token_id)
    first_seg_len, second_seg_len = sep_location + 1, len(input_ids) - (
        sep_location + 1)
    seg_embedding = [0] * first_seg_len + [1] * second_seg_len

    # Run our example on model
    model_scores = bert_model(torch.tensor([input_ids]),
                              token_type_ids=torch.tensor([seg_embedding]))
    ans_start_loc, ans_end_loc = torch.argmax(model_scores[0]), torch.argmax(
        model_scores[1])
    result = ' '.join(input_tokens[ans_start_loc:ans_end_loc + 1])

    # Return final result
    result = result.replace(' ##', '')
    return result
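An illustrative call to get_answer_using_bert; the strings are made up and the large SQuAD-fine-tuned checkpoint is downloaded on first use.

reference = "The Eiffel Tower was completed in 1889 and stands in Paris, France."
print(get_answer_using_bert("When was the Eiffel Tower completed?", reference))
# expected output: "1889"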
Example #10
    def __init__(self,
                 qa_path,
                 relations_filepath,
                 data_directory,
                 batch_size,
                 must_choose_answer,
                 device,
                 trained_to_reject,
                 calculate_single_error=True):
        self.trained_to_reject = trained_to_reject
        self.qa_path = qa_path  # path to qa weights
        self.relations_filepath = relations_filepath  # path to relations file
        self.data_directory = data_directory  # data directory path
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-large-cased')  # tokenizer
        self.model = BertForQuestionAnswering.from_pretrained(
            qa_path)  # Load the model
        self.model.to(device)
        self.device = device

        self.batch_size = batch_size
        # For datasets where there is always an answer, setting this to True ensures that QA
        # models that can return "answer doesn't exist" still return a span from the context.
        self.must_choose_answer = must_choose_answer
        self.total_samples = 0
        if calculate_single_error:
            self.se_list = []
        else:
            self.se_list = None
Example #11
def load_model(model_path):
    model = BertForQuestionAnswering.from_pretrained(model_path)
    model.to(device)
    model.eval()
    model.zero_grad()

    return model
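load_model relies on a module-level device; a minimal sketch of defining it and calling the loader, where the checkpoint directory is a placeholder.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load_model("output/")  # hypothetical path to a fine-tuned checkpoint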
Example #12
 def __init__(self, model_configs):
     self.model_configs = model_configs
     self.pretrained_model = BertForQuestionAnswering.from_pretrained(
         self.model_configs['pretrained_model_name'],
         cache_dir=self.model_configs['cache_dir'],
         output_attentions=True)
     self.tokenizer = BertTokenizer.from_pretrained(self.model_configs['tokenizer_name'])
Example #13
async def main(message: types.Message):
    import torch
    model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    Question = 'The purpose of the NewsQA dataset'
    paragrah = 'With massive volumes of written text being produced every second, how do we make sure that we have the most recent and relevant information available to us? Microsoft research Montreal is tackling this problem by building AI systems that can read and comprehend large volumes of complex text in real-time. The purpose of the NewsQA dataset is to help the research community build algorithms that are capable of answering questions requiring human-level comprehension and reasoning skills.'
    encoding = tokenizer.encode_plus(text=Question,
                                     text_pair=paragrah,
                                     add_special_tokens=True)
    # token ids
    inputs = encoding['input_ids']
    # segment (token type) ids
    sentence_embed = encoding['token_type_ids']
    # input tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs)
    start_scores, end_scores = model(input_ids=torch.tensor([inputs]),
                                     token_type_ids=torch.tensor(
                                         [sentence_embed]),
                                     return_dict=False)
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)

    answer = ' '.join(tokens[start_index:end_index + 1])

    await message.reply(text=answer)
Example #14
def train(args):
    model = BertForQuestionAnswering.from_pretrained(PRETRAINED_MODEL_PATH)
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_PATH)
    model.resize_token_embeddings(len(tokenizer))
    datasets = CMRC2018(args=args, tokenizer=tokenizer)()

    training_args = TrainingArguments(
        output_dir=args.model_path,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        warmup_steps=args.warmup_steps,
        remove_unused_columns=False,
        logging_dir=args.log_path,
        num_train_epochs=args.n_epochs,
        dataloader_num_workers=args.num_workers,
        evaluation_strategy='epoch')
    print(
        f"Train dataset size: {len(datasets['train'])}, Validation dataset size: {len(datasets['validation'])}"
    )
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=datasets['train'],
                      eval_dataset=datasets['validation'])
    trainer.train()
    trainer.save_model()
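train(args) reads a handful of attributes from args; a sketch of building a matching argparse namespace, inferred from the attribute accesses above (the default values are assumptions).

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_path', default='output/cmrc2018')  # also used as output_dir
parser.add_argument('--log_path', default='logs/')
parser.add_argument('--batch_size', type=int, default=8)
parser.add_argument('--warmup_steps', type=int, default=0)
parser.add_argument('--n_epochs', type=int, default=2)
parser.add_argument('--num_workers', type=int, default=2)

train(parser.parse_args())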
Example #15
 def load_model(self, model_path: str, do_lower_case=False):
     config = BertConfig.from_pretrained(model_path + "/bert_config.json")
     tokenizer = BertTokenizer.from_pretrained(
         model_path, do_lower_case=do_lower_case)
     model = BertForQuestionAnswering.from_pretrained(
         model_path, from_tf=False, config=config)
     return model, tokenizer
Example #16
def nlpQuestion(question):
    print("nlptriggered")
    text = """
    Coronaviruses are a large family of viruses that can cause illness in animals or humans. In humans, several coronaviruses are known to cause respiratory infections ranging from the common cold to more severe diseases such as Middle East Respiratory Syndrome (MERS) and Severe Acute Respiratory Syndrome (SARS). COVID-19 is a virus of the same family with a first recorded outbreak in Wuhan, China, in December 2019. The most common symptoms of COVID-19 are fever, tiredness, and dry cough. Other symptoms include aches and pains, nasal congestion, runny nose, sore throat or diarrhea. These symptoms are usually mild and begin gradually. Some people become infected but don’t develop any symptoms and don't feel unwell. Most people (about 80%) recover from the disease without needing special treatment. Around 1 out of every 6 people who gets COVID-19 becomes seriously ill and develops difficulty breathing. Older people, and those with underlying medical problems like high blood pressure, heart problems or diabetes, are more likely to develop serious illness. People with fever, cough and difficulty breathing should seek medical attention. People can catch COVID-19 from others who have the virus. The disease can spread from person to person through small droplets from the nose or mouth which are spread when a person with COVID-19 coughs or exhales. These droplets land on objects and surfaces around the person. Other people then catch COVID-19 by touching these objects or surfaces, then touching their eyes, nose or mouth. People can also catch COVID-19 if they breathe in droplets from a person with COVID-19 who coughs out or exhales droplets. This is why it is important to stay more than 1 meter (3 feet) away from a person who is sick.
    Studies to date suggest that the virus that causes COVID-19 is mainly transmitted through contact with respiratory droplets rather than through the air. 
    There have been 105000 confirmed cases of coronovirus in the world, with 3100 deaths. There are only 32 confirmed cases in Lebanon. If you are experiencing symptoms, call MOPH on 1214 or 76592699.
    """

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    print("nlptriggered")

    input_ids = tokenizer.encode(question, text)
    # 0 for the question segment (up to and including the first [SEP]), 1 for the context
    token_type_ids = [
        0 if i <= input_ids.index(tokenizer.sep_token_id) else 1
        for i in range(len(input_ids))
    ]
    # return_dict=False makes the model return a plain (start_logits, end_logits) tuple
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     token_type_ids=torch.tensor([token_type_ids]),
                                     return_dict=False)
    print("nlptriggered")

    all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer = ' '.join(
        all_tokens[torch.argmax(start_scores):torch.argmax(end_scores) +
                   1]).replace(' ##', '')

    print(answer)
    return answer
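An illustrative call; nlpQuestion only takes the question, since the COVID-19 reference text is hard-coded inside the function.

print(nlpQuestion("What are the most common symptoms of COVID-19?"))
# expected to print something like: "fever , tiredness , and dry cough"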
Example #17
def configure_tokenizer_model_bert(args, logger, is_preprocess=False):
    logger.info("***** Loading tokenizer *****")
    tokenizer = BertTokenizer.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case)

    # logger.info("Loading configuration from {}".format(args.cache_dir))
    logger.info("***** Loading configuration from {} ******".format(
        args.init_dir))
    config = BertConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=args.init_dir)
    config.vocab_size = len(tokenizer.vocab)

    logger.info("***** Loading pretrained model from {} *****".format(
        args.init_dir))
    if is_preprocess:
        model = AutoModel.from_pretrained(args.model_name_or_path,
                                          config=config,
                                          cache_dir=args.init_dir)
    else:
        model = BertForQuestionAnswering.from_pretrained(
            args.init_dir, config=config, cache_dir=args.init_dir)

    return tokenizer, model
Example #18
    def create_graphics(self, url_base, model_card_path):
        pruned_heads = self.checkpoint_info["config"].get("pruned_heads")
        ret = {}
        if pruned_heads is not None:
            pruning_info_plotter = PruningInfoBokehPlotter(
                "pruning_info", self.JS_PATH)
            fig, js, html = pruning_info_plotter.run(layer_count=12,
                                                     pruned_heads=pruned_heads,
                                                     heads_count=12)
            ret["pruning_info"] = dict(js=js, html=html)

        density_plotter = DensityBokehPlotter("density", self.JS_PATH)

        model = BertForQuestionAnswering.from_pretrained(self.git_path)

        fig, js, html = density_plotter.run(model=model,
                                            dest_path=model_card_path /
                                            "images",
                                            url_base=url_base + "/images")
        ret["density_info"] = dict(js=js, html=html)

        from bokeh.io import export_png

        export_png(fig, filename="/tmp/plot.png")

        return ret
Example #19
 def load_model(self):
     config = BertConfig.from_pretrained(self.c_path)
     self.model = BertForQuestionAnswering.from_pretrained(
         'bert-base-uncased', config=config)
     self.model.to(self.device)
     self.model.eval()
     return self.model
Example #20
def answer_question(question, answer_text, model_name=None, tokenizer_name=None):
    """
    Takes a `question` string and an `answer_text` string (which contains the
    answer), and identifies the words within the `answer_text` that are the
    answer.

    Parameters
    ----------
    question : str
    answer_text : str
    model_name : str, optional
    tokenizer_name : str, optional

    Returns
    -------
    answer : str
    """
    # ======== Model & Tokenizer (default: bert-large finetuned squad ver.1)========
    if model_name is None:
        model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    if tokenizer_name is None:
        tokenizer_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    
    model = BertForQuestionAnswering.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

    # ======== Tokenize ========
    input_ids = tokenizer.encode(question, answer_text)

    # Report how long the input sequence is.
    # print(f"Query has {len(input_ids):,} tokens.\n")

    # ======== Set Segment IDs ========
    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    # return_dict=False makes the model return a plain (start_logits, end_logits) tuple
    start_scores, end_scores = model(
        torch.tensor([input_ids]),
        token_type_ids=torch.tensor([segment_ids]),
        return_dict=False,
        )

    # ======== Reconstruct Answer ========
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)

    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer = tokens[answer_start]
    for i in range(answer_start + 1, answer_end + 1):
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]
        else:
            answer += ' ' + tokens[i]

    return answer
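A hedged usage example for answer_question; the strings are illustrative, and the default SQuAD-fine-tuned checkpoint is downloaded on the first call.

context = ("Hugging Face Transformers provides pretrained models for question "
           "answering, and BERT checkpoints fine-tuned on SQuAD are a common choice.")
print(answer_question("What are BERT checkpoints fine-tuned on?", context))
# expected to print something like: "squad"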
Example #21
 def __init__(self, model_dir, cache_dir):
     # assumes the config file and pretrained model already exist in model_dir
     self.config = BertConfig.from_pretrained(model_dir,
                                              cache_dir=cache_dir)
     self.tokenizer = BertTokenizer.from_pretrained(model_dir,
                                                    cache_dir=cache_dir)
     self.model = BertForQuestionAnswering.from_pretrained(
         model_dir, cache_dir=cache_dir)
Example #22
    def __init__(self,
                 model: str = None,
                 lowercase=True,
                 tokenizer=BertTokenizer):

        self.lowercase = lowercase
        self.tokenizer = tokenizer.from_pretrained(model)
        self.model = BertForQuestionAnswering.from_pretrained(model)
Example #23
 def __init__(
     self,
     pre_trained_name='bert-large-uncased-whole-word-masking-finetuned-squad'
 ):
     self.pre_trained_name = pre_trained_name
     self.model = BertForQuestionAnswering.from_pretrained(
         self.pre_trained_name)
     self.tokenizer = BertTokenizer.from_pretrained(self.pre_trained_name)
Example #24
	def __init__(self):

		# BERT Finetuned on SQUAD

		self.bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
		self.squad_finetuned_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
		self.squad_finetuned_model = self.squad_finetuned_model.eval()
		self.squad_finetuned_model = self.squad_finetuned_model.to(device)
Example #25
def init_bert():
    global bert_model
    global bert_tokenizer
    from transformers import BertForQuestionAnswering
    from transformers import BertTokenizer
    bert_model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    bert_tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
Example #26
def main():
    parser = get_parser()
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Set device
    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    logging.getLogger("transformers.generation_utils").setLevel(logging.ERROR)

    # Load pretrained question generation model and tokenizer
    GPT2_tokenizer = GPT2Tokenizer.from_pretrained(
        args.question_generation_model, do_lower_case=args.do_lower_case)
    GPT2_model = GPT2LMHeadModel.from_pretrained(
        args.question_generation_model)
    GPT2_model.prepare_inputs_for_generation = prepare_inputs_for_generation
    GPT2_model.eval()
    GPT2_model.to(args.device)

    BERT_tokenizer = BertTokenizer.from_pretrained(
        args.answering_model, do_lower_case=args.do_lower_case)
    BERT_model = BertForQuestionAnswering.from_pretrained(args.answering_model)
    BERT_model.eval()
    BERT_model.to(args.device)

    logging.info("Parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            from apex import amp
            amp.register_half_function(torch, "einsum")
            GPT2_model = amp.initialize(GPT2_model,
                                        opt_level=args.fp16_opt_level)
            BERT_model = amp.initialize(BERT_model,
                                        opt_level=args.fp16_opt_level)
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    generate(args, GPT2_tokenizer, GPT2_model, BERT_tokenizer, BERT_model)
Example #27
def model_pick(id):
  if (id == 0):
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
  if (id == 1):
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
    model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

  return tokenizer, model
Example #28
 def __init__(
         self,
         pretrained='bert-large-uncased-whole-word-masking-finetuned-squad'
 ):
     self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
     self.QA_MODEL = Bert4QA.from_pretrained(pretrained)
     self.QA_MODEL.to(self.torch_device)
     self.QA_MODEL.eval()
     self.QA_TOKENIZER = BertTokenizer.from_pretrained(pretrained)
Example #29
 def __init__(self, args):
     self.args = args
     self.model = BertForQuestionAnswering.from_pretrained(self.args.model_path).to(self.args.device)
     self.tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
     self.dataset = CMRC2018(args=args, tokenizer=self.tokenizer)()
     self.validation_dataloader = DataLoader(self.dataset['validation'],
                                             batch_size=self.args.batch_size,
                                             collate_fn=custom_collate,
                                             num_workers=self.args.num_workers)
Example #30
    def load_model(self):
        # Load a pretrained model that has been fine-tuned
        config = BertConfig.from_pretrained(self.model_type, output_hidden_states=True, cache_dir=self.cache_dir)

        pretrained_weights = torch.load(self.model_path, map_location=torch.device(self.device))
        model = BertForQuestionAnswering.from_pretrained(self.model_type,
                                                         state_dict=pretrained_weights,
                                                         config=config,
                                                         cache_dir=self.cache_dir)
        return model