Example #1
def load_and_predict(data_dir, model_type, pretrain_model):
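    # Select the QA model and matching tokenizer for the requested model_type,
    # run inference over the test data, and return the prediction dictionary.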
    if model_type == 'bert_japanese':
        model = BertForQuestionAnswering.from_pretrained(
            'cl-tohoku/bert-base-japanese')
        tokenizer = BertJapaneseTokenizer.from_pretrained(
            'cl-tohoku/bert-base-japanese')

    elif model_type == 'bert_multilingual':
        model = BertForQuestionAnswering.from_pretrained(
            'bert-base-multilingual-cased')
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', tokenize_chinese_chars=False)

    elif model_type == 'albert':
        model = AlbertForQuestionAnswering.from_pretrained(
            'ALINEAR/albert-japanese-v2')
        tokenizer = AlbertTokenizer.from_pretrained(
            'ALINEAR/albert-japanese-v2')

    else:
        # fail fast instead of hitting an UnboundLocalError below
        raise ValueError('unknown model_type: {}'.format(model_type))

    test_data = TestData(data_dir, TAG)
    testset = QADataset(test_data.examples, "test", tokenizer=tokenizer)
    testloader = DataLoader(testset, batch_size=4, collate_fn=collate_fn)

    model = model.to(device)
    model.load_state_dict(torch.load(pretrain_model))

    prediction = predict(model, testloader, device, tokenizer)
    prediction = func(data_dir, prediction)
    print('finish loading and predicting from {}!'.format(pretrain_model))
    return prediction  #prediction dictionary
Example #2
def task_3():
    # Task 3: sentence prediction task (question answering)
    question, text = "里昂是谁", "里昂是一个杀手"
    sample = (question, text)

    tokenizer = BertTokenizer.from_pretrained(bert_path)
    sen_code = tokenizer.batch_encode_plus(
        [sample])  # a sentence pair can be passed this way: List[Tuple[str, str]]
    tokens_tensor = torch.tensor(sen_code["input_ids"])
    segments_tensor = torch.tensor(sen_code["token_type_ids"])

    model_config = BertConfig.from_pretrained(bert_path)
    # model_config.num_labels = 2  # two outputs in the end: the start position and the end position
    # model = BertForQuestionAnswering.from_pretrained(bert_path)  # one way to load the model
    model = BertForQuestionAnswering(model_config)  # another way to load it (weights randomly initialized, not fine-tuned)

    model.eval()
    outputs = model(tokens_tensor, token_type_ids=segments_tensor)
    start_pos, end_pos = outputs.start_logits, outputs.end_logits

    for idx, (start, end) in enumerate(
            zip(start_pos.argmax(axis=1), end_pos.argmax(axis=1))):
        all_tokens = tokenizer.convert_ids_to_tokens(
            sen_code["input_ids"][idx])  # 进行逆编码,得到原始的token
        print(
            all_tokens
        )  # ['[CLS]', '里', '昂', '是', '谁', '[SEP]', '里', '昂', '是', '一', '个', '杀', '手', '[SEP]']
        if start <= end:
            answer = " ".join(all_tokens[start:end + 1])  # 对输出的答案进行解码的过程
            # 每次执行的结果不一致,这里因为没有经过微调,所以效果不是很好,输出结果不佳,下面的输出是其中的一种。
            print(answer)  # 一 个 杀 手 [SEP]
        else:
            print("预测的有问题哦!")
Example #3
    def __init__(self, args):
        print("Loading BERT configs...")
        with open("bert_config.json") as f:
            config_json = json.load(f)

        config = BertConfig(
            attention_probs_dropout_prob=config_json[
                "attention_probs_dropout_prob"],
            hidden_act=config_json["hidden_act"],
            hidden_dropout_prob=config_json["hidden_dropout_prob"],
            hidden_size=config_json["hidden_size"],
            initializer_range=config_json["initializer_range"],
            intermediate_size=config_json["intermediate_size"],
            max_position_embeddings=config_json["max_position_embeddings"],
            num_attention_heads=config_json["num_attention_heads"],
            num_hidden_layers=config_json["num_hidden_layers"],
            type_vocab_size=config_json["type_vocab_size"],
            vocab_size=config_json["vocab_size"])

        print("Loading PyTorch model...")
        self.model = BertForQuestionAnswering(config)
        self.model.eval()
        self.model.cuda()
        self.model.load_state_dict(
            torch.load(
                "build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch"))

        print("Constructing SUT...")
        self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries,
                                   self.process_latencies)
        print("Finished constructing SUT.")

        self.qsl = get_squad_QSL(args.max_examples)
Example #4
def load_from_tf(config, tf_path):
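    # Build a randomly initialized BertForQuestionAnswering from `config` and copy
    # weights into it from the TensorFlow checkpoint at `tf_path`.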
    model = BertForQuestionAnswering(config)
    model.classifier = model.qa_outputs

    # This part is copied from HuggingFace Transformers with a fix to bypass an error
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        # print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(
                    pointer, "classifier")  # This line is causing the issue
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    print("Skipping {}".format("/".join(name)))
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    model.qa_outputs = model.classifier
    del model.classifier
    return model
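
# A minimal usage sketch for load_from_tf; the config file and checkpoint prefix below are placeholder paths:
# config = BertConfig.from_json_file("bert_config.json")
# model = load_from_tf(config, "bert_model.ckpt")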
Example #5
    def __init__(self, bert_dir, args):
        super(BERTPretrainedMRC, self).__init__()
        if args.load_pretrainedBERT:
            self.bert = BertForQuestionAnswering.from_pretrained(bert_dir)
        else:
            self.bert_config = BertQueryNerConfig.from_pretrained(
                bert_dir,
                hidden_dropout_prob=args.bert_dropout,
                attention_probs_dropout_prob=args.bert_dropout,
                mrc_dropout=args.mrc_dropout)
            self.bert = BertForQuestionAnswering(self.bert_config)
Example #6
    def load(self, fname=None):
        if fname is not None:
            self.load_path = fname

        if self.pretrained_bert and not Path(self.pretrained_bert).is_file():
            self.model = BertForQuestionAnswering.from_pretrained(
                self.pretrained_bert,
                output_attentions=False,
                output_hidden_states=False)
        elif self.bert_config_file and Path(self.bert_config_file).is_file():
            self.bert_config = BertConfig.from_json_file(
                str(expand_path(self.bert_config_file)))

            if self.attention_probs_keep_prob is not None:
                self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
            if self.hidden_keep_prob is not None:
                self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
            self.model = BertForQuestionAnswering(config=self.bert_config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")

        self.model.to(self.device)
        self.optimizer = getattr(torch.optim, self.optimizer_name)(
            self.model.parameters(), **self.optimizer_parameters)
        if self.lr_scheduler_name is not None:
            self.lr_scheduler = getattr(torch.optim.lr_scheduler,
                                        self.lr_scheduler_name)(
                                            self.optimizer,
                                            **self.lr_scheduler_parameters)

        if self.load_path:
            logger.info(f"Load path {self.load_path} is given.")
            if isinstance(self.load_path,
                          Path) and not self.load_path.parent.is_dir():
                raise ConfigError("Provided load path is incorrect!")

            weights_path = Path(self.load_path.resolve())
            weights_path = weights_path.with_suffix(f".pth.tar")
            if weights_path.exists():
                logger.info(f"Load path {weights_path} exists.")
                logger.info(
                    f"Initializing `{self.__class__.__name__}` from saved.")

                # now load the weights, optimizer from saved
                logger.info(f"Loading weights from {weights_path}.")
                checkpoint = torch.load(weights_path, map_location=self.device)
                self.model.load_state_dict(checkpoint["model_state_dict"])
                self.optimizer.load_state_dict(
                    checkpoint["optimizer_state_dict"])
                self.epochs_done = checkpoint.get("epochs_done", 0)
            else:
                logger.info(
                    f"Init from scratch. Load path {weights_path} does not exist."
                )
Example #7
def construct_qa_transformer(options: KaggleEvaluationOptions) -> Reranker:
    # We load a sequence classification model first -- again, as a workaround. Refactor.
    try:
        model = AutoModelForSequenceClassification.from_pretrained(options.model_name)
    except OSError:
        model = AutoModelForSequenceClassification.from_pretrained(options.model_name, from_tf=True)
    fixed_model = BertForQuestionAnswering(model.config)
    fixed_model.qa_outputs = model.classifier
    fixed_model.bert = model.bert
    device = torch.device(options.device)
    model = fixed_model.to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(options.tokenizer_name, do_lower_case=options.do_lower_case)
    return QuestionAnsweringTransformerReranker(model, tokenizer)
Example #8
def model_fn(model_dir):
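    # Rebuild the model from the saved config and weights stored under model_dir.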
    config_path = model_dir + '/config_file.json'
    model_path = model_dir + '/pytorch_model.bin'

    config = BertConfig.from_json_file(config_path)
    model = BertForQuestionAnswering(config)

    # Map the saved weights onto GPU if available, otherwise CPU
    model.load_state_dict(
        torch.load(model_path,
                   map_location=torch.device(
                       'cuda' if torch.cuda.is_available() else 'cpu')))
    return model
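
# Usage sketch: the directory below is a placeholder; it must contain config_file.json and pytorch_model.bin.
# qa_model = model_fn("/path/to/model_dir")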
Example #9
def main():
    # Set seed
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Setup CUDA, GPU & distributed training
    device = torch.device("cuda")

    config = BertConfig.from_pretrained('bert-base-cased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',do_lower_case=True,)
    # The nn.Module BertForQuestionAnswering has a single untrained layer qa_outputs: Linear(hidden_size, 2) on top of the pretrained BERT base.
    model = BertForQuestionAnswering.from_pretrained('bert-base-cased',config=config,)

    model.to(device)

    max_seq_length=384

    train_dataset = load_and_cache_examples(tokenizer, is_training=True)[0]

    # Training
    global_step, ave_loss = train(train_dataset, model, tokenizer)
    print(" global_step = %s, average loss = %s", global_step, tr_loss / global_step)

    # Save the trained model and the tokenizer
    output_dir = 'output/'

    # Create output directory if needed
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving model checkpoint to %s", output_dir)
    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Load a trained model and vocabulary that you have fine-tuned
    model = BertForQuestionAnswering.from_pretrained(output_dir)
    tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=True)
    model.to(device)

    # Evaluate
    results = evaluate(model, tokenizer)
    print("Results: {}".format(results))

    return results
Example #10
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path: str, bert_config_file: str, pytorch_dump_path: str) -> None:
    """
    Convert a TensorFlow checkpoint into a compatible PyTorch model and save it.
    """
    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    
    model = BertForQuestionAnswering(config)
    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
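
# Usage sketch (all three paths below are placeholders):
# convert_tf_checkpoint_to_pytorch("bert_model.ckpt", "bert_config.json", "pytorch_model.bin")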
Example #11
def nlpQuestion(question):
    print("nlptriggered")
    text = """
    Coronaviruses are a large family of viruses that can cause illness in animals or humans. In humans, several coronaviruses are known to cause respiratory infections ranging from the common cold to more severe diseases such as Middle East Respiratory Syndrome (MERS) and Severe Acute Respiratory Syndrome (SARS). COVID-19 is a virus of the same family with a first recorded outbreak in Wuhan, China, in December 2019. The most common symptoms of COVID-19 are fever, tiredness, and dry cough. Other symptoms include aches and pains, nasal congestion, runny nose, sore throat or diarrhea. These symptoms are usually mild and begin gradually. Some people become infected but don’t develop any symptoms and don't feel unwell. Most people (about 80%) recover from the disease without needing special treatment. Around 1 out of every 6 people who gets COVID-19 becomes seriously ill and develops difficulty breathing. Older people, and those with underlying medical problems like high blood pressure, heart problems or diabetes, are more likely to develop serious illness. People with fever, cough and difficulty breathing should seek medical attention. People can catch COVID-19 from others who have the virus. The disease can spread from person to person through small droplets from the nose or mouth which are spread when a person with COVID-19 coughs or exhales. These droplets land on objects and surfaces around the person. Other people then catch COVID-19 by touching these objects or surfaces, then touching their eyes, nose or mouth. People can also catch COVID-19 if they breathe in droplets from a person with COVID-19 who coughs out or exhales droplets. This is why it is important to stay more than 1 meter (3 feet) away from a person who is sick.
    Studies to date suggest that the virus that causes COVID-19 is mainly transmitted through contact with respiratory droplets rather than through the air. 
    There have been 105000 confirmed cases of coronovirus in the world, with 3100 deaths. There are only 32 confirmed cases in Lebanon. If you are experiencing symptoms, call MOPH on 1214 or 76592699.
    """

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    print("nlptriggered")

    input_ids = tokenizer.encode(question, text)
    token_type_ids = [
        0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))
    ]
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     token_type_ids=torch.tensor(
                                         [token_type_ids]),
                                     return_dict=False)  # return a plain tuple (transformers v4 defaults to a dict-like output)
    print("nlptriggered")

    all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer = ' '.join(
        all_tokens[torch.argmax(start_scores):torch.argmax(end_scores) +
                   1]).replace(' ##', '')

    print(answer)
    return answer
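
# Usage sketch, asking a hypothetical question about the hard-coded COVID-19 passage:
# nlpQuestion("What are the most common symptoms of COVID-19?")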
Example #12
def download_model():
    if (not Path("model_downloaded").is_file()
            or not Path("usecase_indicator.h5").is_file()):

        url = "https://b0ykepubbucket.s3-eu-west-1.amazonaws.com/usecase_indicator.h5"
        r = requests.get(url, stream=True)
        chunk_progress = 0
        with open("usecase_indicator.h5", "wb") as modelfile:
            for chunk in r.iter_content(chunk_size=8388608):
                if chunk:
                    modelfile.write(chunk)
                    chunk_progress += 1
                    print(
                        f"Downloading model 1/2 in background: {chunk_progress*8}MB"
                    )
                    sys.stdout.flush()
            else:
                open("model_downloaded", "w").close()

    if (not Path("modelqna_downloaded").is_file()
            or not Path("./BertLSquad/pytorch_model.bin").is_file()):
        print(f"Started model 2/2 download in background")
        sys.stdout.flush()
        model = BertForQuestionAnswering.from_pretrained(
            'bert-large-uncased-whole-word-masking-finetuned-squad')
        model.save_pretrained("./BertLSquad")
        open("modelqna_downloaded", "w").close()
        print("Model 2/2 download completed")
        sys.stdout.flush()

    return
Example #13
def configure_tokenizer_model_bert(args, logger, is_preprocess=False):
    logger.info("***** Loading tokenizer *****")
    tokenizer = BertTokenizer.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case)

    # logger.info("Loading configuration from {}".format(args.cache_dir))
    logger.info("***** Loading configuration from {} ******".format(
        args.init_dir))
    config = BertConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=args.init_dir)
    config.vocab_size = len(tokenizer.vocab)

    logger.info("***** Loading pretrained model from {} *****".format(
        args.init_dir))
    if is_preprocess:
        model = AutoModel.from_pretrained(args.model_name_or_path,
                                          config=config,
                                          cache_dir=args.init_dir)
    else:
        model = BertForQuestionAnswering.from_pretrained(
            args.init_dir, config=config, cache_dir=args.init_dir)

    return tokenizer, model
Example #14
def answergen_bert(context, question):
    tokenizer = BertTokenizer.from_pretrained(
        'csarron/bert-base-uncased-squad-v1')
    model = BertForQuestionAnswering.from_pretrained(
        'csarron/bert-base-uncased-squad-v1')
    #tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad',return_token_type_ids = True)
    #model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

    encoding = tokenizer.encode_plus(question, context)

    input_ids, attention_mask = encoding["input_ids"], encoding[
        "attention_mask"]

    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     attention_mask=torch.tensor(
                                         [attention_mask]),
                                     return_dict=False)  # return a plain tuple (transformers v4 defaults to a dict-like output)

    ans_tokens = input_ids[torch.argmax(start_scores[
        0, 1:]):torch.argmax(end_scores[0, 1:]) + 1]
    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens)

    print("\nQuestion ", question)
    #print ("\nAnswer Tokens: ")
    #print (answer_tokens)

    answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)
    print("\nAnswer : ", answer_tokens_to_string)
    return answer_tokens_to_string
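
# Usage sketch with a made-up context/question pair:
# answergen_bert("The Eiffel Tower was completed in 1889.", "When was the Eiffel Tower completed?")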
Example #15
    def __init__(self, model_configs):
        self.model_configs = model_configs
        self.pretrained_model = BertForQuestionAnswering.from_pretrained(
            self.model_configs['pretrained_model_name'],
            cache_dir=self.model_configs['cache_dir'],
            output_attentions=True)

        self.tokenizer = BertTokenizer.from_pretrained(
            self.model_configs['tokenizer_name'])
Example #16
    def create_graphics(self, url_base, model_card_path):
        pruned_heads = self.checkpoint_info["config"].get("pruned_heads")
        ret = {}
        if pruned_heads is not None:
            pruning_info_plotter = PruningInfoBokehPlotter(
                "pruning_info", self.JS_PATH)
            fig, js, html = pruning_info_plotter.run(layer_count=12,
                                                     pruned_heads=pruned_heads,
                                                     heads_count=12)
            ret["pruning_info"] = dict(js=js, html=html)

        density_plotter = DensityBokehPlotter("density", self.JS_PATH)

        model = BertForQuestionAnswering.from_pretrained(self.git_path)

        fig, js, html = density_plotter.run(model=model,
                                            dest_path=model_card_path /
                                            "images",
                                            url_base=url_base + "/images")
        ret["density_info"] = dict(js=js, html=html)

        from bokeh.io import export_png

        export_png(fig, filename="/tmp/plot.png")

        return ret
Example #17
def load_model(model_path):
    model = BertForQuestionAnswering.from_pretrained(model_path)
    model.to(device)
    model.eval()
    model.zero_grad()

    return model
Example #18
    def __init__(self):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model_name = "nyust-eb210/braslab-bert-drcd-384"
        self.tokenizer = BertTokenizerFast.from_pretrained(self.model_name)
        self.model = BertForQuestionAnswering.from_pretrained(self.model_name).to(
            self.device
        )
Example #19
def load_qa_model():
    model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    #Tokenizer
    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    return model, tokenizer
Example #20
def train():
    with msg.loading("   Loading BERT"):
        TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')
        MODEL = BertForQuestionAnswering.from_pretrained(
            'bert-large-uncased-whole-word-masking-finetuned-squad')
    msg.good("   BERT loaded")

    articles_dir = os.path.join(SCRIPT_PATH,
                                '../data/raw/CORD-19-research-challenge/')
    articles_folders = [
        'biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/',
        'comm_use_subset/comm_use_subset/pdf_json/',
        'comm_use_subset/comm_use_subset/pmc_json/',
        'noncomm_use_subset/noncomm_use_subset/pdf_json/',
        'noncomm_use_subset/noncomm_use_subset/pmc_json/',
        'custom_license/custom_license/pdf_json/',
        'custom_license/custom_license/pmc_json/'
    ]
    meta_path = articles_dir + 'metadata.csv'

    with msg.loading("   Loading publications"):
        start = time.time()
        data_text, index2paperID, index2paperPath = get_data_texts(
            articles_dir, articles_folders, meta_path)
    msg.good("   Publications loaded - Took {:.2f}s".format(time.time() -
                                                            start))

    covid_q = QuestionCovid(TOKENIZER, MODEL, index2paperID, index2paperPath)
    covid_q.fit(data_text)
    return covid_q
Example #21
    def _get_question_answering(self):
        """
        Initializes the BertForQuestionAnswering transformer.
        NOTE: This uses the bert-large-uncased-whole-word-masking-finetuned-squad pretraining for best results.
        """
        self.qa = BertForQuestionAnswering.from_pretrained(
            'bert-large-uncased-whole-word-masking-finetuned-squad')
        self.qa.eval()
Example #22
    def test_patch_module_ampere(self):
        config = BertConfig.from_pretrained("bert-base-uncased")
        model = BertForQuestionAnswering(config)

        parameters = LinearPruningArgs(
            method="topK",
            submethod="default",
            ampere_method="annealing",
            block_rows=32,
            block_cols=32,
            min_elements=0.005,
        )

        context = PatcherContext()

        p = LinearPruningModulePatcher(context, parameters, self.MODEL_STRUCTURE)

        module_patchers = dict(query=p, key=p, value=p, att_dense=p, interm_dense=p, output_dense=p)

        patcher = LinearModelPatcher(module_patchers, self.MODEL_STRUCTURE)
        patcher.patch(model)

        self.assertEqual(patcher.stats["patched"], 72)
        key_sizes = {k: len(v) for k, v in context.context_modules.items()}

        self.assertEqual(key_sizes, {"ampere_mask": 72, "mask": 72})
Example #23
    def test_patch_module_tied_attention(self):
        config = BertConfig.from_pretrained("bert-base-uncased")
        model = BertForQuestionAnswering(config)

        parameters = LinearPruningParameters(
            method="topK",
            submethod="default",
            ampere_method="annealing",
            block_rows=32,
            block_cols=32,
        )

        context = PatcherContext()

        p_attention = JointPruningModulePatcher(context, parameters, "attention")
        p_dense = LinearPruningModulePatcher(context, parameters)

        module_patchers = dict(
            query=p_attention,
            key=p_attention,
            value=p_attention,
            att_dense=p_dense,
            interm_dense=p_dense,
            output_dense=p_dense,
        )

        patcher = BertLinearModelPatcher(module_patchers)
        patcher.patch(model)

        self.assertEqual(patcher.stats["patched"], 72)
        key_sizes = {k: len(v) for k, v in context.context_modules.items()}

        self.assertEqual(key_sizes, {"ampere_mask": 72, "mask": 48})
Example #24
    def __init__(self,
                 qa_path,
                 relations_filepath,
                 data_directory,
                 batch_size,
                 must_choose_answer,
                 device,
                 trained_to_reject,
                 calculate_single_error=True):
        self.trained_to_reject = trained_to_reject
        self.qa_path = qa_path  # path to qa weights
        self.relations_filepath = relations_filepath  # path to relations file
        self.data_directory = data_directory  # data directory path
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-large-cased')  # tokenizer
        self.model = BertForQuestionAnswering.from_pretrained(
            qa_path)  # Load the model
        self.model.to(device)
        self.device = device

        self.batch_size = batch_size
        self.must_choose_answer = must_choose_answer  # For datasets where there is always an answer, setting this to true will ensure that QA models that can return "answer doesn't exist" will always return a span in the context
        self.total_samples = 0
        if calculate_single_error:
            self.se_list = []
        else:
            self.se_list = None
Example #25
    def load_model(self, model_path: str, do_lower_case=False):
        config = BertConfig.from_pretrained(model_path + "/bert_config.json")
        tokenizer = BertTokenizer.from_pretrained(
            model_path, do_lower_case=do_lower_case)
        model = BertForQuestionAnswering.from_pretrained(
            model_path, from_tf=False, config=config)
        return model, tokenizer
Example #26
def get_answer_using_bert(question, reference_text):
    # Load fine-tuned model for QA
    bert_model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')

    # Load Vocab as well
    bert_tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')

    # Apply bert_tokenizer on input text
    input_ids = bert_tokenizer.encode(question, reference_text)
    input_tokens = bert_tokenizer.convert_ids_to_tokens(input_ids)

    # Search index of first [SEP] token
    sep_location = input_ids.index(bert_tokenizer.sep_token_id)
    first_seg_len, second_seg_len = sep_location + 1, len(input_ids) - (
        sep_location + 1)
    seg_embedding = [0] * first_seg_len + [1] * second_seg_len

    # Run our example on model
    model_scores = bert_model(torch.tensor([input_ids]),
                              token_type_ids=torch.tensor([seg_embedding]))
    ans_start_loc, ans_end_loc = torch.argmax(model_scores[0]), torch.argmax(
        model_scores[1])
    result = ' '.join(input_tokens[ans_start_loc:ans_end_loc + 1])

    # Return final result
    result = result.replace(' ##', '')
    return result
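
# Usage sketch with a made-up question and passage:
# get_answer_using_bert("Who wrote Hamlet?", "Hamlet is a tragedy written by William Shakespeare.")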
Example #27
    def load_model(self):
        config = BertConfig.from_pretrained(self.c_path)
        self.model = BertForQuestionAnswering.from_pretrained(
            'bert-base-uncased', config=config)
        self.model.to(self.device)
        self.model.eval()
        return self.model
Example #28
async def main(message: types.Message):
    import torch
    model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    Question = 'The purpose of the NewsQA dataset'
    paragrah = 'With massive volumes of written text being produced every second, how do we make sure that we have the most recent and relevant information available to us? Microsoft research Montreal is tackling this problem by building AI systems that can read and comprehend large volumes of complex text in real-time. The purpose of the NewsQA dataset is to help the research community build algorithms that are capable of answering questions requiring human-level comprehension and reasoning skills.'
    encoding = tokenizer.encode_plus(text=Question,
                                     text_pair=paragrah,
                                     add_special_tokens=True)
    # token embedding
    inputs = encoding['input_ids']
    # segment embedding
    sentence_embed = encoding['token_type_ids']
    # input tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs)
    start_scores, end_scores = model(input_ids=torch.tensor([inputs]),
                                     token_type_ids=torch.tensor(
                                         [sentence_embed]),
                                     return_dict=False)
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)

    answer = ' '.join(tokens[start_index:end_index + 1])

    await message.reply(text=answer)
Example #29
def train(args):
    model = BertForQuestionAnswering.from_pretrained(PRETRAINED_MODEL_PATH)
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_PATH)
    model.resize_token_embeddings(len(tokenizer))
    datasets = CMRC2018(args=args, tokenizer=tokenizer)()

    training_args = TrainingArguments(
        output_dir=args.model_path,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        warmup_steps=args.warmup_steps,
        remove_unused_columns=False,
        logging_dir=args.log_path,
        num_train_epochs=args.n_epochs,
        dataloader_num_workers=args.num_workers,
        evaluation_strategy='epoch')
    print(
        f"Train dataset size: {len(datasets['train'])}, Validation dataset size: {len(datasets['validation'])}"
    )
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=datasets['train'],
                      eval_dataset=datasets['validation'])
    trainer.train()
    trainer.save_model()
    pass
Example #30
def answer_question(question, answer_text, model_name=None, tokenizer_name=None):
    """
    Takes a `question` string and an `answer_text` string (which contains the
    answer), and identifies the words within the `answer_text` that are the
    answer.

    Parameters
    ----------
    question : str
    answer_text : str
    model_name : str, optional
    tokenizer_name : str, optional

    Returns
    -------
    answer : str
    """
    # ======== Model & Tokenizer (default: bert-large finetuned squad ver.1)========
    if model_name is None:
        model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    if tokenizer_name is None:
        tokenizer_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    
    model = BertForQuestionAnswering.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

    # ======== Tokenize ========
    input_ids = tokenizer.encode(question, answer_text)

    # Report how long the input sequence is.
    # print(f"Query has {len(input_ids):,} tokens.\n")

    # ======== Set Segment IDs ========
    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    start_scores, end_scores = model(
        torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]),
        return_dict=False  # return a plain tuple (transformers v4 defaults to a dict-like output)
        )

    # ======== Reconstruct Answer ========
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)

    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer = tokens[answer_start]
    for i in range(answer_start + 1, answer_end + 1):
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]
        else:
            answer += ' ' + tokens[i]

    return answer
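
# Usage sketch with a made-up passage; model_name/tokenizer_name default to the SQuAD-finetuned checkpoints above:
# answer_question("Who discovered penicillin?", "Penicillin was discovered by Alexander Fleming in 1928.")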