Example #1
    def __init__(self, path: str, device: str = 'cpu'):
        """ Init the QA Albert """
        if not os.path.exists(path):
            raise NotADirectoryError(
                f"{os.path.abspath(path)} must be a directory containing the model files: config, tokenizer, weights.")

        files = os.listdir(path)
        if CONFIG_JSON_FILE not in files:
            raise FileNotFoundError(f"{CONFIG_JSON_FILE} must be in {path}.")
        if WEIGHTS_FILE not in files:
            raise FileNotFoundError(f"{WEIGHTS_FILE} must be in {path}.")

        with open(os.path.join(path, CONFIG_JSON_FILE), "r") as f:
            config = json.load(f)
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        weights = torch.load(os.path.join(path, WEIGHTS_FILE),
                             map_location=lambda storage, loc: storage)
        # Load pretrained model/tokenizer
        config = AlbertConfig.from_dict(config)
        self.model = AlbertForQuestionAnswering(config)
        self.model.load_state_dict(weights)
        self.model = self.model.eval()
        self.args = albert_args_squad
        if device == "cuda":
            logger.debug("Setting model with CUDA")
            self.args['device'] = 'cuda'
            self.model.to('cuda')
Example #2
class AlbertQA:
    """ Class to use Albert to answer questions.
    TODO: Update model and checkpoints to work with the latest versions of transformers """

    def __init__(self, path: str, device: str = 'cpu'):
        """ Init the QA Albert """
        if not os.path.exists(path):
            raise NotADirectoryError(
                f"{os.path.abspath(path)} must be a directory containing the model files: config, tokenizer, weights.")

        files = os.listdir(path)
        if CONFIG_JSON_FILE not in files:
            raise FileNotFoundError(f"{CONFIG_JSON_FILE} must be in {path}.")
        if WEIGHTS_FILE not in files:
            raise FileNotFoundError(f"{WEIGHTS_FILE} must be in {path}.")

        with open(os.path.join(path, CONFIG_JSON_FILE), "r") as f:
            config = json.load(f)
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        weights = torch.load(os.path.join(path, WEIGHTS_FILE),
                             map_location=lambda storage, loc: storage)
        # Load pretrained model/tokenizer
        config = AlbertConfig.from_dict(config)
        self.model = AlbertForQuestionAnswering(config)
        self.model.load_state_dict(weights)
        self.model = self.model.eval()
        self.args = albert_args_squad
        if device == "cuda":
            logger.debug("Setting model with CUDA")
            self.args['device'] = 'cuda'
            self.model.to('cuda')

    def answer(self, question: str, context: str, **kwargs: dict) -> str:
        """ Look the answer to question in context

        Keyword Arguments:
             :param question: Question to answer
             :param context: Context to look for the answer into
             :return: Answer to question
        """
        for key in kwargs:
            if key in self.args:
                self.args[key] = kwargs[key]
        inputs = self.tokenizer.encode_plus(question, context, **self.args)
        for key in inputs.keys():
            inputs[key] = inputs[key].to(self.args['device'])
        input_ids = inputs["input_ids"].tolist()[0]

        answer_start_scores, answer_end_scores = self.model(**inputs)

        answer_start = torch.argmax(answer_start_scores)  # Get the most likely beginning of answer
        answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer

        answer = self.tokenizer.convert_tokens_to_string(
            self.tokenizer.convert_ids_to_tokens(
                input_ids[answer_start:answer_end]
            )
        )
        answer = answer.replace("[CLS]", "").replace("[SEP]", " ").replace("<s>", "").replace("</s>", "")
        return answer
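
A minimal usage sketch for the AlbertQA class above; the model directory is a made-up path assumed to contain the config, tokenizer, and weights files the constructor checks for, and the question and context strings are illustrative only:

qa = AlbertQA('./albert_qa_model', device='cpu')  # placeholder model directory
print(qa.answer("Who wrote the report?", "The report was written by Alice in 2019."))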
Example #3
    def __init__(self):
        ''' PRE-LOAD NECESSARY DATA '''
        # print(os.path.join('models', 'sbert.net_models_distilbert-base-nli-stsb-mean-tokens'))
        # print(os.path.join('models', 'albert_t'))
        # print(os.path.join('models', 'albert_m'))
        # print(os.getcwd())
        self.__sentence_model = SentenceTransformer(
            os.path.join(
                'models',
                'sbert.net_models_distilbert-base-nli-stsb-mean-tokens'))
        self.__tokenizer = AlbertTokenizer.from_pretrained(
            os.path.join('models', 'albert_t'))
        self.__model = AlbertForQuestionAnswering.from_pretrained(
            os.path.join('models', 'albert_m'))

        # Read the URL and title files
        with open(os.path.join('data', 'urls.txt'), 'r') as file:
            self.urls = file.read().splitlines()
        with open(os.path.join('data', 'titles.txt'), 'r') as file:
            self.titles = file.read().splitlines()

        # Load pickle files into variables
        names = [
            os.path.join('data', 'punctuated.pkl'),
            os.path.join('data', 'punctuated_embed.pkl'),
            os.path.join('data', 'subs.pkl')
        ]
        self.__punctuateds, self.__sentence_embeddings_p, self.__subs = tuple(
            map(loadPickle, names))
        ''' END OF PRE-LOAD NECESSARY DATA '''
Example #4
def load_and_predict(data_dir, model_type, pretrain_model):
    if model_type == 'bert_japanese':
        model = BertForQuestionAnswering.from_pretrained(
            'cl-tohoku/bert-base-japanese')
        tokenizer = BertJapaneseTokenizer.from_pretrained(
            'cl-tohoku/bert-base-japanese')
    elif model_type == 'bert_multilingual':
        model = BertForQuestionAnswering.from_pretrained(
            'bert-base-multilingual-cased')
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', tokenize_chinese_chars=False)
    elif model_type == 'albert':
        model = AlbertForQuestionAnswering.from_pretrained(
            'ALINEAR/albert-japanese-v2')
        tokenizer = AlbertTokenizer.from_pretrained(
            'ALINEAR/albert-japanese-v2')
    else:
        raise ValueError('Unknown model_type: {}'.format(model_type))

    test_data = TestData(data_dir, TAG)
    testset = QADataset(test_data.examples, "test", tokenizer=tokenizer)
    testloader = DataLoader(testset, batch_size=4, collate_fn=collate_fn)

    model = model.to(device)
    model.load_state_dict(torch.load(pretrain_model))

    prediction = predict(model, testloader, device, tokenizer)
    prediction = func(data_dir, prediction)
    print('finish loading and predicting from {}!'.format(pretrain_model))
    return prediction  #prediction dictionary
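
A hypothetical call to load_and_predict above; the data directory and the checkpoint path are placeholders, and 'albert' selects the ALINEAR/albert-japanese-v2 branch:

prediction = load_and_predict('release/test/ca_data',       # placeholder data directory
                              'albert',
                              'albert_japanese_qa.ckpt')    # placeholder fine-tuned state_dict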
Example #5
    def __init__(self, config):
        super(AlbertNCPQA, self).__init__(config)

        self.mrc = AlbertForQuestionAnswering(config)
        self.content = nn.Linear(config.hidden_size, 1)

        self.init_weights()
Example #6
 def __init__(self, name, path: str, gpu=False):
     self.tokenizer = AlbertTokenizer.from_pretrained(path)
     pretrained_albert_model = AlbertForQuestionAnswering.from_pretrained(
         path)
     super().__init__(name, pretrained_albert_model, gpu)
     if self.gpu:
         self.predictor.cuda()
Example #7
def build(args):
    TAG = create_tags()
    XLSX_PATH = {'train': 'release/train/ca_data', 'dev': 'release/dev/ca_data', 'test': 'release/test/ca_data'}
    
    PRETRAINED_MODEL_NAME = 'ALINEAR/albert-japanese-v2'
    tokenizer = AlbertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
    
    train_data = TrainData(XLSX_PATH['train'], TAG, only_positive=args.only_positive)

    trainset = QADataset(train_data.examples, "train", tokenizer=tokenizer)
    trainloader = DataLoader(trainset, batch_size=args.batch_size, collate_fn=collate_fn)
    
    dev_data = TrainData(XLSX_PATH['dev'], TAG, only_positive=args.only_positive)
    
    devset = QADataset(dev_data.examples, "train", tokenizer=tokenizer)
    devloader = DataLoader(devset, batch_size=args.batch_size, collate_fn=collate_fn)

    logger.info(f"[train data] {train_data.summary()}")
    logger.info(f"[dev data] {dev_data.summary()}")
    
    test_data = TestData(XLSX_PATH['dev'], TAG)
    testset = QADataset(test_data.examples, "test", tokenizer=tokenizer)
    testloader = DataLoader(testset, batch_size=args.batch_size, collate_fn=collate_fn)
    
    model = AlbertForQuestionAnswering.from_pretrained(PRETRAINED_MODEL_NAME)
    model = model.to(args.device)

    if args.load_pretrained_model:
        model.load_state_dict(torch.load(args.pretrained_model_path))
    
    return model, trainloader, devloader, testloader, tokenizer
Example #8
 def load_model(self, model_path: str, do_lower_case=True):
     config = AlbertConfig.from_pretrained(model_path + "/config.json")
     tokenizer = AlbertTokenizer.from_pretrained(model_path)
     #tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2', do_lower_case=do_lower_case)
     model = AlbertForQuestionAnswering.from_pretrained(model_path,
                                                        from_tf=False,
                                                        config=config)
     return model, tokenizer
Example #9
def load_model(pretrained_model):
    # Other models to try: albert-large-v2, albert-xlarge-v2
    # https://huggingface.co/transformers/pretrained_models.html
    tokenizer = AlbertTokenizer.from_pretrained(pretrained_model,
                                                do_lower_case=True)
    model = AlbertForQuestionAnswering.from_pretrained(pretrained_model,
                                                       cache_dir="/usr/cache")
    return model, tokenizer
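
A minimal usage sketch for load_model above, picking one of the checkpoints suggested in its comment; the question and context passed to encode_plus are made-up:

model, tokenizer = load_model('albert-large-v2')
inputs = tokenizer.encode_plus("Who discovered penicillin?",
                               "Penicillin was discovered by Alexander Fleming in 1928.",
                               return_tensors='pt')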
Example #10
 def create_and_check_albert_for_question_answering(
         self, config, input_ids, token_type_ids, input_mask,
         sequence_labels, token_labels, choice_labels):
     model = AlbertForQuestionAnswering(config=config)
     model.eval()
     loss, start_logits, end_logits = model(
         input_ids,
         attention_mask=input_mask,
         token_type_ids=token_type_ids,
         start_positions=sequence_labels,
         end_positions=sequence_labels)
     result = {
         "loss": loss,
         "start_logits": start_logits,
         "end_logits": end_logits,
     }
     self.parent.assertListEqual(list(result["start_logits"].size()),
                                 [self.batch_size, self.seq_length])
     self.parent.assertListEqual(list(result["end_logits"].size()),
                                 [self.batch_size, self.seq_length])
     self.check_loss_output(result)

 def create_and_check_for_question_answering(
     self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 ):
     model = AlbertForQuestionAnswering(config=config)
     model.to(torch_device)
     model.eval()
     result = model(
         input_ids,
         attention_mask=input_mask,
         token_type_ids=token_type_ids,
         start_positions=sequence_labels,
         end_positions=sequence_labels,
     )
     self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
     self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
Example #12
def load_models():
    # Download files locally if not exist from S3 bucket
    s3_bucket = boto3.resource('s3').Bucket('albert-model-files')
    for object in s3_bucket.objects.all():
        if object.key in ["config.json", "vocab.txt", "pytorch_model.bin"]:
            if not os.path.exists('model_data/{}'.format(object.key)):
                s3_bucket.download_file(object.key,
                                        'model_data/{}'.format(object.key))

    for object in s3_bucket.objects.all():
        if object.key in [
                "special_tokens_map.json", "spiece.model",
                "tokenizer_config.json"
        ]:
            if not os.path.exists('tokenizer_albert/{}'.format(object.key)):
                s3_bucket.download_file(
                    object.key, 'tokenizer_albert/{}'.format(object.key))

    # Load pretrained models
    tokenizer = AlbertTokenizer.from_pretrained('./tokenizer_albert')
    model = AlbertForQuestionAnswering.from_pretrained('./model_data')
    return model, tokenizer
Example #13
                        else:
                            write_in_result_file("Bert")

                    write_in_result_file("k = " + str(k))

                    if use_ir_score:
                        write_in_result_file('Using IR score with mu = ' +
                                             str(mu_bench))
                    else:
                        write_in_result_file('Not using IR score')

                    if use_albert:
                        if not use_dil:
                            tokenizer = AlbertTokenizer.from_pretrained(
                                args.albert_path, do_lower_case=True)
                            model = AlbertForQuestionAnswering.from_pretrained(
                                args.albert_path)
                        else:
                            tokenizer = AlbertTokenizer.from_pretrained(
                                args.dilalbert_path, do_lower_case=True)
                            model = DilAlbert.from_pretrained(
                                args.dilalbert_path)
                    else:
                        if not use_dil:
                            tokenizer = BertTokenizer.from_pretrained(
                                args.bert_path, do_lower_case=True)
                            model = BertForQuestionAnswering.from_pretrained(
                                args.bert_path)
                        else:
                            tokenizer = BertTokenizer.from_pretrained(
                                args.dilbert_path, do_lower_case=True)
                            model = DilBert.from_pretrained(args.dilbert_path)
Example #14
def do_prediction(model_dir, model_name, questions_dir):
    # 1. Load a trained model

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AlbertForQuestionAnswering.from_pretrained(model_dir)
    model.to(device)
    model.eval()

    # 2. Load and pre-process the test set

    dev_file = questions_dir  #"data/sfu.json"
    predict_batch_size = 2
    max_seq_length = 384

    eval_examples = read_squad_examples(input_file=dev_file,
                                        is_training=False,
                                        version_2_with_negative=False)

    tokenizer = AlbertTokenizer.from_pretrained(model_dir)
    eval_features = convert_examples_to_features(examples=eval_examples,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=max_seq_length,
                                                 doc_stride=128,
                                                 max_query_length=64,
                                                 is_training=False)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_example_index)

    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=predict_batch_size)

    # 3. Run inference on the test set

    all_results = []
    for input_ids, input_mask, segment_ids, example_indices in tqdm(
            eval_dataloader):

        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(
                input_ids, input_mask, segment_ids)

        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

    output_prediction_file = os.path.join(
        model_dir, model_name + "_albert_predictions_sfu.json")
    output_nbest_file = os.path.join(
        model_dir, model_name + "_albert_nbest_predictions_sfu.json")
    output_null_log_odds_file = os.path.join(
        model_dir, model_name + "_null_odds_sfu.json")

    preds = write_predictions(eval_examples, eval_features, all_results, 20,
                              30, True, output_prediction_file,
                              output_nbest_file, output_null_log_odds_file,
                              True, False, 0.0)
Example #15
                "input_ids": torch.cat((input_ids,inputs_par[i]["input_ids"]),1),
                "attention_mask": torch.cat((attention_mask,inputs_par[i]["attention_mask"]),1),
                "token_type_ids": torch.cat((token_type_ids,inputs_par[i]["token_type_ids"]),1),
            }
            start = time.time()
            outputs = model.process_B(preprocessed_question, preprocessed_paragraph,**inputs)
            total_time_questions_paragraphs_pairs = total_time_questions_paragraphs_pairs + time.time() - start


print("NI Q DilAlbert :", total_time_questions)
print("I Q-P DilAlbert :", total_time_questions_paragraphs_pairs)
total_dilalbert = total_time_passages+total_time_questions+total_time_questions_paragraphs_pairs
print("Total DilAlbert : ", total_dilalbert)

tokenizer = AlbertTokenizer.from_pretrained(ALBERT_PATH, do_lower_case=True)
model = AlbertForQuestionAnswering.from_pretrained(ALBERT_PATH)
model.to(torch.device(device))

total_time_questions_paragraphs_pairs_albert = 0
eval_dataloader = DataLoader(dataset, batch_size=1)
for question in squad1_for_orqa["questions"][:n_questions]:
    input_ids = torch.tensor([tokenizer.encode(question)], device=device)
    attention_mask = torch.tensor([[1]*input_ids.shape[1]], device=device)
    token_type_ids = torch.tensor([[0]*input_ids.shape[1]], device=device)
    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": torch.cat((input_ids,batch[0][:,3:]),1),
                "attention_mask": torch.cat((attention_mask,batch[1][:,3:]),1),
                "token_type_ids": torch.cat((token_type_ids,batch[2][:,3:]),1),
Example #16
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

from transformers import AlbertForQuestionAnswering
model = AlbertForQuestionAnswering.from_pretrained("albert-base-v2")

from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
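        # The original snippet is truncated here. What follows is a minimal
        # sketch of one possible loop body; it assumes the encodings contain
        # 'start_positions' and 'end_positions' keys, which is not shown above.
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs[0]  # loss is the first element when positions are given
        loss.backward()
        optim.step()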
Example #17
def main(args):
    if args.large:
        args.train_record_file += '_large'
        args.dev_eval_file += '_large'
        model_name = "albert-xlarge-v2"
    else:
        model_name = "albert-base-v2"
    if args.xxlarge:
        args.train_record_file += '_xxlarge'
        args.dev_eval_file += '_xxlarge'
        model_name = "albert-xxlarge-v2"
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get model
    log.info('Building model...')
    if args.bidaf:
        char_vectors = util.torch_from_json(args.char_emb_file)

    if args.model_name == 'albert_highway':
        model = models.albert_highway(model_name)
    elif args.model_name == 'albert_lstm_highway':
        model = models.LSTM_highway(model_name, hidden_size=args.hidden_size)
    elif args.model_name == 'albert_bidaf':
        model = models.BiDAF(char_vectors=char_vectors,
                             hidden_size=args.hidden_size,
                             drop_prob=args.drop_prob)
    elif args.model_name == 'albert_bidaf2':
        model = models.BiDAF2(model_name=model_name,
                              char_vectors=char_vectors,
                              hidden_size=args.hidden_size,
                              drop_prob=args.drop_prob)
    else:
        model = AlbertForQuestionAnswering.from_pretrained(args.model_name)

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2,
                          args.bidaf)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers)
    dev_dataset = SQuAD(args.dev_eval_file, args.use_squad_v2, args.bidaf)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers)

    with open(args.dev_gold_file) as f:
        gold_dict = json.load(f)

    tokenizer = AlbertTokenizer.from_pretrained(model_name)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for batch in train_loader:
                batch = tuple(t.to(device) for t in batch)
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    'start_positions': batch[3],
                    'end_positions': batch[4],
                }
                if args.bidaf:
                    inputs['char_ids'] = batch[6]
                y1 = batch[3]
                y2 = batch[4]
                # Setup for forward
                batch_size = inputs["input_ids"].size(0)
                optimizer.zero_grad()

                # Forward
                # log_p1, log_p2 = model(**inputs)
                y1, y2 = y1.to(device), y2.to(device)
                outputs = model(**inputs)
                loss = outputs[0]
                loss = loss.mean()
                # loss_fct = nn.CrossEntropyLoss()
                # loss = loss_fct(log_p1, y1) + loss_fct(log_p2, y2)
                # loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(args, model, dev_dataset,
                                                  dev_loader, gold_dict,
                                                  tokenizer, device,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
Example #18
def document_retriever(question):
    query_bow = dictionary.doc2bow(jieba.cut(question,cut_all=False))
    tfidfvect = tfidf[query_bow]
    simstfidf = indexTfidf[tfidfvect]
    return [context[i] for i in (-simstfidf).argsort()[0:1]]  # return the top-ranked context



model_path = 'voidful/albert_chinese_base'
tokenizer_kwards = {'do_lower_case': False,'max_len': 512}
tokenizer = BertTokenizer.from_pretrained(model_path, **tokenizer_kwards)
from transformers import AlbertForQuestionAnswering,AutoConfig
model_path = 'voidful/albert_chinese_base'
bert_config = AutoConfig.from_pretrained(model_path)
model = AlbertForQuestionAnswering.from_pretrained(r"checkpoint_score_f1-86.233_em-66.853.pth",
                                                   **{'config':bert_config}).to('cuda')
import torch
from torch.utils.data import TensorDataset, DataLoader


SPIECE_UNDERLINE = '▁'
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""

    # The SQuAD annotations are character based. We first project them to
    # whitespace-tokenized words. But then after WordPiece tokenization, we can
    # often find a "better match". For example:
    #
    #   Question: What year was John Smith born?
    #   Context: The leader was John Smith (1895-1943).
Example #19
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        required=True,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        required=True,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")

    parser.add_argument(
        '--version_2_with_negative',
        action='store_true',
        help=
        'If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument(
        '--null_score_diff_threshold',
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict null."
    )

    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json output file."
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=500,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    #config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
    #                                      cache_dir=args.cache_dir if args.cache_dir else None)
    #tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
    #                                            do_lower_case=args.do_lower_case,
    #                                            cache_dir=args.cache_dir if args.cache_dir else None)
    #model = model_class.from_pretrained(args.model_name_or_path,
    #                                    from_tf=bool('.ckpt' in args.model_name_or_path),
    #                                    config=config,
    #                                    cache_dir=args.cache_dir if args.cache_dir else None)
    config = AlbertConfig.from_pretrained(args.model_name_or_path +
                                          "/config.json")
    tokenizer = AlbertTokenizer.from_pretrained(
        'albert-large-v2', do_lower_case=args.do_lower_case)
    model = AlbertForQuestionAnswering.from_pretrained(args.model_name_or_path,
                                                       from_tf=False,
                                                       config=config)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, 'einsum')
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir,
                                            force_download=True)
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    """
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint, force_download=True)
            model.to(args.device)

            # Evaluate
            result = evaluate(args, model, tokenizer, prefix=global_step)

            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
            results.update(result)
    """
    results = evaluate(args, model, tokenizer)
    print(results)
    logger.info("Results: {}".format(results))

    return results
Example #20
# ner_reljson=tkitFile.Json("../tdata/onlyner/dev.json")
# i=0
# all=0
# # ner_list=ner_plus(text)
# for item in ner_reljson.auto_load():

# The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
# examples/run_squad.py example to see how to fine-tune a model to a question answering task.

from transformers import AlbertTokenizer, AlbertForQuestionAnswering, BertTokenizer, AlbertConfig
import torch

# tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
tokenizer = BertTokenizer.from_pretrained('tkitfiles/qa/model/')
# config=AlbertConfig.from_pretrained('tkitfiles/qa/model/config.json')
model = AlbertForQuestionAnswering.from_pretrained('tkitfiles/qa/model/')

data = tkitFile.Json("../tdata/SQuAD/dev.json")
i = 0
all = 0
f = 0
for item in data.auto_load():
    for one in item['data']:
        all = all + 1
        # print(one['paragraphs'][0])
        # print(one['paragraphs'][0]['context'])
        question, text = one['paragraphs'][0]['qas'][0]['question'], one[
            'paragraphs'][0]['context']

        # question, text = "利比里亚共和国", "利比里亚共和国(英语:') 通称赖比瑞亚,是位于西非,北接几内亚,西北界塞拉利昂,东邻象牙海岸,西南濒大西洋的总统制共和国家"
        input_dict = tokenizer.encode_plus(question, text, return_tensors='pt')
Example #21
import torch
from transformers import AlbertTokenizer, AlbertForQuestionAnswering
tokenizer = AlbertTokenizer.from_pretrained(
    'ahotrod/albert_xxlargev1_squad2_512')
model = AlbertForQuestionAnswering.from_pretrained(
    'ahotrod/albert_xxlargev1_squad2_512')


def answer(question, text):
    input_dict = tokenizer.encode_plus(question,
                                       text,
                                       return_tensors='pt',
                                       max_length=512)
    input_ids = input_dict["input_ids"].tolist()
    start_scores, end_scores = model(**input_dict)

    start = torch.argmax(start_scores)
    end = torch.argmax(end_scores)

    all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    answer = ''.join(all_tokens[start:end + 1]).replace('▁', ' ').strip()
    answer = answer.replace('[SEP]', '')
    return answer if answer != '[CLS]' and len(
        answer) != 0 else 'could not find an answer'
import torch
from transformers import AlbertTokenizer, AlbertForQuestionAnswering
tokenizer = AlbertTokenizer.from_pretrained('twmkn9/albert-base-v2-squad2')
model = AlbertForQuestionAnswering.from_pretrained(
    'twmkn9/albert-base-v2-squad2')


def answer(question, text):
    input_dict = tokenizer.encode_plus(question,
                                       text,
                                       return_tensors='pt',
                                       max_length=512)
    input_ids = input_dict["input_ids"].tolist()
    start_scores, end_scores = model(**input_dict)

    start = torch.argmax(start_scores)
    end = torch.argmax(end_scores)

    all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    answer = ''.join(all_tokens[start:end + 1]).replace('▁', ' ').strip()
    answer = answer.replace('[SEP]', '')
    return answer if answer != '[CLS]' and len(
        answer) != 0 else 'could not find an answer'
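
A minimal usage sketch for the answer helper above; the question and context strings are made-up:

print(answer("Where was the meeting held?",
             "The annual shareholder meeting was held in Geneva in March 2019."))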