Example #1
def main():
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased")
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        # Set the device before it is used to place a resumed checkpoint.
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        if args.resume_training:
            checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
            model = DistilBertForQuestionAnswering.from_pretrained(
                checkpoint_path)
            model.to(args.device)
        else:
            args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets,
                                       args.train_dir, tokenizer, 'train',
                                       args.outdomain_data_repeat)
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets,
                                            args.val_dir, tokenizer, 'val',
                                            args.outdomain_data_repeat)
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=RandomSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
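        # DistilBERT's hidden size is 768; with args.full_adv the discriminator
        # presumably sees the flattened full sequence (384 positions x 768 dims).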
        discriminator_input_size = 768
        if args.full_adv:
            discriminator_input_size = 384 * 768
        discriminator = DomainDiscriminator(
            input_size=discriminator_input_size)
        # discriminator.load_state_dict(torch.load(checkpoint_path + '/discriminator'))
        model.to(args.device)
        discriminator.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                              args.eval_dir, tokenizer,
                                              split_name,
                                              args.outdomain_data_repeat)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   discriminator,
                                                   eval_loader,
                                                   eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
Example #2
def main():
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)

    #### Change Made By Xuran Wang: Comment out original lines #######

    # model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    # tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    #### Change End #######

    #### Change Made By Xuran Wang: Add custom lines #######

    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')
    finetuned_model_path = 'save/baseline-01/'

    #### Change End #######

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")

        #### Change Made By Xuran Wang: Add custom lines #######

        checkpoint_path = os.path.join(finetuned_model_path, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)

        #### Change End #######
        # if args.reinit_pooler:
        #     encoder_temp = getattr(model, "distilbert")  # Equivalent to model.distilbert
        #     encoder_temp.pooler.dense.weight.data.normal_(mean=0.0, std=encoder_temp.config.initializer_range)
        #     encoder_temp.pooler.dense.bias.data.zero_()  # The change of encoder_temp would affect the model
        #     for p in encoder_temp.pooler.parameters():
        #         p.requires_grad = True

        if args.reinit_layers > 0:
            import torch.nn as nn
            from transformers.models.distilbert.modeling_distilbert import MultiHeadSelfAttention, FFN
            # model_distilbert = getattr(model, "distilbert")  # model.distilbert; change of model_distilbert affects model!
            # Reinitialization for the last few layers
            for layer in model.distilbert.transformer.layer[-args.reinit_layers:]:
                for module in layer.modules():
                    # print(module)
                    # Equivalent to the manual re-initialization commented out below:
                    model.distilbert._init_weights(module)
                    # if isinstance(module, nn.modules.linear.Linear):  # Original form for nn.Linear
                    #     # model.config.initializer_range == model.distilbert.config.initializer_range => True
                    #     module.weight.data.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                    #     if module.bias is not None:
                    #         module.bias.data.zero_()
                    # elif isinstance(module, nn.modules.normalization.LayerNorm):
                    #     module.weight.data.fill_(1.0)
                    #     module.bias.data.zero_()
                    # elif isinstance(module, FFN):
                    #     for param in [module.lin1, module.lin2]:
                    #         param.weight.data.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                    #         if param.bias is not None:
                    #             param.bias.data.zero_()
                    # elif isinstance(module, MultiHeadSelfAttention):
                    #     for param in [module.q_lin, module.k_lin, module.v_lin, module.out_lin]:
                    #         param.weight.data.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                    #         if param.bias is not None:
                    #             param.bias.data.zero_()

        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        model.to(args.device)

        trainer = Trainer(args, log)

        #### Change Made By Xuran Wang: Add custom lines, comment out original line #######

        # train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')

        # NOTE: train_fraction is assumed to be defined elsewhere in this
        # module; it is never set inside main().
        train_dataset, _ = get_dataset_eda_revised(args, args.train_datasets,
                                                   args.train_dir, tokenizer,
                                                   'train', train_fraction)

        #### Change End #######

        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets,
                                            args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                              args.eval_dir, tokenizer,
                                              split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   eval_loader,
                                                   eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
Example #3
def main():
    """
    Main function
    """

    # Parse cmd line arguments
    args = nlp_parser.parse_arguments()

    source = ""
    subject = ""
    context = ""
    question = ""
    answer = ""

    # Setup the question, either from a specified SQuAD record
    # or from cmd line arguments.
    # If no question details are provided, a random
    # SQuAD example will be chosen.
    if args["question"] is not None:
        question = args["question"]
        if args["text"] is not None:
            source = args["text"]
            with open(source, "r") as text_file_handle:
                context = text_file_handle.read()

        else:
            print("No text provided, searching SQuAD dev-2.0 dataset")
            squad_data = nlp.import_squad_data()
            squad_records = squad_data.loc[squad_data["question"] == question]
            if squad_records.empty:
                sys.exit(
                    "Question not found in SQuAD data, please provide context using `--text`."
                )
            subject = squad_records["subject"].iloc[0]
            context = squad_records["context"].iloc[0]
            question = squad_records["question"].iloc[0]
            answer = squad_records["answer"]

    else:
        squad_data = nlp.import_squad_data()

        if args["squadid"] is not None:
            source = args["squadid"]
            squad_records = squad_data.loc[squad_data["id"] == source]
            i_record = 0
        else:
            if args["subject"] is not None:
                print(
                    "Picking a question at random on the subject: ",
                    args["subject"],
                )
                squad_records = squad_data.loc[
                    squad_data["subject"] == args["subject"]
                ]
            else:
                print(
                    "No SQuAD ID or question provided, picking one at random!"
                )
                squad_records = squad_data

            n_records = len(squad_records.index)
            i_record = random.randint(0, max(0, n_records - 1))

        if squad_records.empty:
            sys.exit(
                "No questions found in SQuAD data, please provide valid ID or subject."
            )

        source = squad_records["id"].iloc[i_record]
        subject = squad_records["subject"].iloc[i_record]
        context = squad_records["context"].iloc[i_record]
        question = squad_records["question"].iloc[i_record]
        answer = squad_records["answer"].iloc[i_record]

    # DistilBERT question answering using pre-trained model.
    token = DistilBertTokenizer.from_pretrained(
        "distilbert-base-uncased", return_token_type_ids=True
    )

    model = DistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased-distilled-squad"
    )

    encoding = token.encode_plus(question, context)

    input_ids, attention_mask = (
        encoding["input_ids"],
        encoding["attention_mask"],
    )
    start_scores, end_scores = model(
        torch.tensor([input_ids]),
        attention_mask=torch.tensor([attention_mask]),
        return_dict=False,
    )

    answer_ids = input_ids[
        torch.argmax(start_scores) : torch.argmax(end_scores) + 1
    ]
    answer_tokens = token.convert_ids_to_tokens(
        answer_ids, skip_special_tokens=True
    )
    answer_tokens_to_string = token.convert_tokens_to_string(answer_tokens)

    # Display results
    print("\nDistilBERT question answering example.")
    print("======================================")
    print("Reading from: ", subject, source)
    print("\nContext: ", context)
    print("--")
    print("Question: ", question)
    print("Answer: ", answer_tokens_to_string)
    print("Reference Answers: ", answer)
Example #4
    return contexts, questions, ids


device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    torch.cuda.set_device(DEVICE_ID)  # use an unoccupied GPU
'''
load data
'''
val_contexts, val_questions, val_ids = read_squad('data/dev-v2.0.json')
'''
tokenizers and models
'''
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained(
    'distilbert-base-uncased').to(device)
model.load_state_dict(
    torch.load(os.path.join('model_weights',
                            f'distilBERT_epoch_{NUM_EPOCH}.pt'),
               map_location=device))

model.eval()

res = dict()
with torch.no_grad():
    for i, (context, question,
            id) in tqdm(enumerate(zip(val_contexts, val_questions, val_ids))):
        encoding = tokenizer(context,
                             question,
                             return_tensors='pt',
                             truncation=True)
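        # A minimal, assumed continuation: decode the argmax answer span for
        # each record and store it under the record id.
        outputs = model(**encoding.to(device))
        start_idx = torch.argmax(outputs.start_logits, dim=1).item()
        end_idx = torch.argmax(outputs.end_logits, dim=1).item() + 1
        answer_ids = encoding['input_ids'][0][start_idx:end_idx]
        res[id] = tokenizer.decode(answer_ids, skip_special_tokens=True)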
Example #5
# argument parsing
app = Flask(__name__)
api = Api(app)
parser = reqparse.RequestParser()
parser.add_argument('question')

N_HITS = 10
# TODO: Analyse the hard-coded keywords and assess if anything needs to change here.
KEYWORDS = ''
# LUCENE_DATABASE_DIR = '/mnt/lucene-database'
LUCENE_DATABASE_PATH = 'lucene-index-covid-2020-04-10'

# Load these models locally - distilbert-base-uncased-distilled-squad
DISTILBERT_MODEL_PATH = 'distilbert-base-uncased-distilled-squad'
model = DistilBertForQuestionAnswering.from_pretrained(DISTILBERT_MODEL_PATH)
tokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_MODEL_PATH)

# document = "Victoria has a written constitution enacted in 1975, but based on the 1855 colonial constitution, passed by the United Kingdom Parliament as the Victoria Constitution Act 1855, which establishes the Parliament as the state's law-making body for matters coming under state responsibility. The Victorian Constitution can be amended by the Parliament of Victoria, except for certain 'entrenched' provisions that require either an absolute majority in both houses, a three-fifths majority in both houses, or the approval of the Victorian people in a referendum, depending on the provision."
# input_ids = tokenizer.encode('Why is this strange thing here?')
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
# start_scores, end_scores = model(torch.tensor([input_ids[:512]]))


def makeBERTSQuADPrediction(model, document, question):
    input_ids = tokenizer.encode(question, document)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
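    # A minimal, assumed completion: DistilBERT takes no token_type_ids, so
    # num_seg_a / num_seg_b above go unused; decode the argmax span directly.
    outputs = model(torch.tensor([input_ids]))
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.convert_tokens_to_string(tokens[answer_start:answer_end])
    return answer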
Example #6
    def __init__(self):
        self.tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased', return_token_type_ids=True)
        self.model = DistilBertForQuestionAnswering.from_pretrained(
            'distilbert-base-uncased-distilled-squad')
Example #7
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = DistilBertForQuestionAnswering.from_pretrained(
            self.model_dir)
        self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_dir)
Example #8
class BERTQA:

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                    return_token_type_ids=True)
    model = DistilBertForQuestionAnswering.from_pretrained(
        'distilbert-base-uncased-distilled-squad')
    MAX_TOKENS = 512
    MAX_TOKENS_QUESTION = 30
    MAX_TOKENS_DOCUMENT = MAX_TOKENS - MAX_TOKENS_QUESTION - 2  # [SEP] and [CLS]

    def __init__(self):
        pass

    def get_token_length(self, string):
        tokens = self.tokenizer.encode(string)
        return len(tokens)

    def chunk_document(self, document, re_consolidate=True):
        '''Chunks up a long document into optimally large pieces so that they
        can be passed to BERT. Activating `re_consolidate` will put the chunks
        back together to make them as large as possible for improved
        performance.
        '''
        document_length = self.get_token_length(document)
        if document_length > self.MAX_TOKENS_DOCUMENT:
            approved_chunks = []
            paragraphs = document.split('\n')
            paragraphs = [par for par in paragraphs if par]
            for paragraph in paragraphs:
                paragraph_length = self.get_token_length(paragraph)
                if paragraph_length > self.MAX_TOKENS_DOCUMENT:
                    sentences = paragraph.split('.')
                    sentences = [sen for sen in sentences if sen]
                    for sentence in sentences:
                        sentence_length = self.get_token_length(sentence)
                        if sentence_length > self.MAX_TOKENS_DOCUMENT:
                            print("Ignoring overlong sentence.")
                        else:
                            approved_chunks.append(sentence)
                else:
                    approved_chunks.append(paragraph)
            if re_consolidate:
                lengths = [
                    self.get_token_length(chunk) for chunk in approved_chunks
                ]
                consolidated_chunks = []
                running_length = 0
                current_chunk = ''
                for chunk, length in zip(approved_chunks, lengths):
                    if (running_length + length) < self.MAX_TOKENS_DOCUMENT:
                        current_chunk += chunk
                        running_length += length
                    else:
                        consolidated_chunks.append(current_chunk)
                        current_chunk = chunk
                        running_length = length
                if current_chunk:
                    # Without this, the final chunk would be dropped.
                    consolidated_chunks.append(current_chunk)
                return consolidated_chunks
            else:
                return approved_chunks
        else:
            return [document]

    def answer_question(self, question, document):
        '''Takes a `question` string and an `document` string (which contains
        the answer), and identifies the words within the `document` that are
        the answer.
        '''
        question_length = self.get_token_length(question)
        document_length = self.get_token_length(document)
        if question_length > self.MAX_TOKENS_QUESTION:
            msg = f'Question exceeds max token length ({str(question_length)}).'
            raise ValueError(msg)
        if document_length > self.MAX_TOKENS_DOCUMENT:
            msg = f'Document exceeds max token length ({str(document_length)}).'
            raise ValueError(msg)
        encoding = self.tokenizer.encode_plus(question, document)
        input_ids, attention_mask = encoding["input_ids"], encoding[
            "attention_mask"]
        # return_dict=False keeps the (start_scores, end_scores) tuple output
        # on transformers >= 4.x.
        start_scores, end_scores = self.model(torch.tensor([input_ids]),
                                              attention_mask=torch.tensor(
                                                  [attention_mask]),
                                              return_dict=False)
        confidence = float(max(torch.max(start_scores), torch.max(end_scores)))

        start_token = torch.argmax(start_scores)
        end_token = torch.argmax(end_scores)
        ans_tokens = input_ids[start_token:end_token + 1]
        answer_tokens = self.tokenizer.convert_ids_to_tokens(
            ans_tokens, skip_special_tokens=True)
        if not answer_tokens:  # TODO Understand this bug
            return '<NO ANSWER>', -10
        else:
            answer = answer_tokens[0]
            for token in answer_tokens[1:]:
                if token[0:2] == '##':
                    answer += token[2:]
                else:
                    answer += ' ' + token
            return answer, confidence

    def answer_question_chunked(self, question, document, re_consolidate=True):
        chunks = self.chunk_document(document, re_consolidate=re_consolidate)
        responses = []
        for chunk in tqdm(chunks):
            answer, confidence = self.answer_question(question, chunk)
            response = {
                'answer': answer,
                'confidence': confidence,
                'chunk': chunk
            }
            responses.append(response)
        responses.sort(key=lambda x: -x['confidence'])
        return responses
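
# A short, hypothetical usage sketch for the BERTQA class above (document and
# question strings are illustrative):
qa = BERTQA()
doc = ("Victoria has a written constitution enacted in 1975, based on the "
       "1855 colonial constitution.")
responses = qa.answer_question_chunked('When was the constitution enacted?',
                                       doc)
print(responses[0]['answer'], responses[0]['confidence'])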
Example #9
def do_prediction(model_dir):
    # 1. Load a trained model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = DistilBertForQuestionAnswering.from_pretrained(model_dir)
    model.to(device)
    model.eval()

    # 2. Load and pre-process the test set

    dev_file = "data/sfu.json"
    predict_batch_size = 2
    max_seq_length = 384

    eval_examples = read_squad_examples(input_file=dev_file,
                                        is_training=False,
                                        version_2_with_negative=False)

    tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
    eval_features = convert_examples_to_features(examples=eval_examples,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=max_seq_length,
                                                 doc_stride=128,
                                                 max_query_length=64,
                                                 is_training=False)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_example_index)

    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=predict_batch_size)

    # 3. Run inference on the test set

    all_results = []
    for input_ids, input_mask, segment_ids, example_indices in tqdm(
            eval_dataloader):

        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(input_ids, input_mask)

        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

    output_prediction_file = os.path.join(model_dir, "predictions_sfu.json")
    output_nbest_file = os.path.join(model_dir, "nbest_predictions_sfu.json")
    output_null_log_odds_file = os.path.join(model_dir, "null_odds_sfu.json")

    # The positional arguments follow the classic SQuAD utilities'
    # write_predictions signature: n_best_size=20, max_answer_length=30,
    # do_lower_case=True, the three output paths, verbose_logging=True,
    # version_2_with_negative=False, null_score_diff_threshold=0.0.
    preds = write_predictions(eval_examples, eval_features, all_results, 20,
                              30, True, output_prediction_file,
                              output_nbest_file, output_null_log_odds_file,
                              True, False, 0.0)
    return preds
Example #10
def main():
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained(
        args.model_checkpoint)
    tokenizer = DistilBertTokenizerFast.from_pretrained(args.model_checkpoint)
    with wandb.init(project="qa-system", config=args) as run:
        run.name = args.run_name
        wandb.watch(model)
        if args.do_train:
            if not os.path.exists(args.save_dir):
                os.makedirs(args.save_dir)
            args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
            log = util.get_logger(args.save_dir, 'log_train')
            log.info(
                f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
            log.info("Preparing Training Data...")
            args.device = torch.device(
                'cuda') if torch.cuda.is_available() else torch.device('cpu')
            trainer = Trainer(args, log)
            train_dataset, _ = get_dataset(args, args.train_datasets,
                                           args.train_dir, tokenizer, 'train')
            log.info("Preparing Validation Data...")
            val_dataset, val_dict = get_dataset(args, args.val_datasets,
                                                args.val_dir, tokenizer, 'val')
            train_loader = DataLoader(train_dataset,
                                      batch_size=args.batch_size,
                                      sampler=RandomSampler(train_dataset))
            val_loader = DataLoader(val_dataset,
                                    batch_size=args.batch_size,
                                    sampler=SequentialSampler(val_dataset))
            best_scores = trainer.train(model, train_loader, val_loader,
                                        val_dict)
            model_artifact = wandb.Artifact(
                args.run_name,
                type="model",
            )
            model_artifact.add_dir(os.path.join(args.save_dir, 'checkpoint'))
            run.log_artifact(model_artifact)

        if args.do_eval:
            args.device = torch.device(
                'cuda') if torch.cuda.is_available() else torch.device('cpu')
            split_name = 'test' if 'test' in args.eval_dir else 'validation'
            log = util.get_logger(args.save_dir, f'log_{split_name}')
            trainer = Trainer(args, log)
            if args.checkpoint_path != "":
                model = DistilBertForQuestionAnswering.from_pretrained(
                    args.checkpoint_path)
            else:
                checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
                model = DistilBertForQuestionAnswering.from_pretrained(
                    checkpoint_path)
            model.to(args.device)
            eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                                  args.eval_dir, tokenizer,
                                                  split_name)
            eval_loader = DataLoader(eval_dataset,
                                     batch_size=args.batch_size,
                                     sampler=SequentialSampler(eval_dataset))
            eval_preds, eval_scores = trainer.evaluate(model,
                                                       eval_loader,
                                                       eval_dict,
                                                       return_preds=True,
                                                       split=split_name)
            results_str = ', '.join(f'{k}: {v:05.2f}'
                                    for k, v in eval_scores.items())
            log.info(f'Eval {results_str}')
            # Write submission file
            sub_path = os.path.join(args.save_dir,
                                    split_name + '_' + args.sub_file)
            log.info(f'Writing submission file to {sub_path}...')
            with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
                csv_writer = csv.writer(csv_fh, delimiter=',')
                csv_writer.writerow(['Id', 'Predicted'])
                for uuid in sorted(eval_preds):
                    csv_writer.writerow([uuid, eval_preds[uuid]])
Example #11
def main():
    parser = ArgumentParser('SQuAD evaluation example')
    parser.add_argument(
        '--squad_dir',
        type=str,
        metavar='PATH',
        required=True,
        help='Path to directory containing the SQuAD data (JSON-files).')
    parser.add_argument(
        '--output_dir',
        type=str,
        metavar='PATH',
        required=True,
        help=
        'Path to the output directory (for logs, checkpoints, parameters, etc.).'
    )
    parser.add_argument('-f',
                        '--force',
                        action='store_true',
                        help='Overwrite `output_dir` if it already exists.')
    parser.add_argument('--do_train',
                        action='store_true',
                        help='Whether to run training.')
    parser.add_argument('--do_eval',
                        action='store_true',
                        help='Whether to run eval (on the dev set).')
    parser.add_argument('--config_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the model configuration.')
    parser.add_argument('--weights_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the model initialization weights.')
    parser.add_argument('--tokenizer_vocab_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the tokenizer vocabulary.')
    parser.add_argument('--overwrite_cache',
                        action='store_true',
                        help='Overwrite the cache if it already exists.')
    parser.add_argument('--max_sequence_len',
                        type=int,
                        default=384,
                        metavar='N',
                        help='The maximum length of a sequence.')
    parser.add_argument('--max_query_len',
                        type=int,
                        default=64,
                        help='The maximum length of a question.')
    parser.add_argument('--max_answer_len',
                        type=int,
                        default=30,
                        help='The maximum length of an answer.')
    parser.add_argument(
        '--doc_stride',
        type=int,
        default=128,
        help=
        'The stride to take between chunks when splitting a large document.')
    parser.add_argument('--do_lower_case',
                        action='store_true',
                        help='Whether to lowercase the input when tokenizing.')
    parser.add_argument('-n',
                        '--num_epochs',
                        type=int,
                        default=3,
                        metavar='N',
                        help='The number of distillation epochs.')
    parser.add_argument('--per_gpu_train_batch_size',
                        type=int,
                        default=8,
                        metavar='N',
                        help='The batch size per GPU used during training.')
    parser.add_argument('--per_gpu_eval_batch_size',
                        type=int,
                        default=8,
                        metavar='N',
                        help='The batch size per GPU used during evaluation.')
    parser.add_argument('--learning_rate',
                        default=3e-5,
                        type=float,
                        help='The initial learning rate for Adam.')
    parser.add_argument('--epsilon',
                        default=1e-8,
                        type=float,
                        help="Adam's epsilon.")
    parser.add_argument('--num_warmup_steps',
                        default=0,
                        type=int,
                        help='Linear warmup over `warmup_steps`.')
    parser.add_argument(
        '--num_gradient_accumulation_steps',
        type=int,
        default=1,
        metavar='N',
        help=
        'The number of gradient accumulation steps (for larger batch sizes).')
    parser.add_argument('--max_gradient_norm',
                        type=float,
                        default=1.0,
                        metavar='F',
                        help='The maximum gradient norm.')
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        metavar='N',
                        help='Random seed.')
    parser.add_argument('-c',
                        '--use_cuda',
                        action='store_true',
                        help='Whether to use cuda or not.')
    parser.add_argument(
        '-d',
        '--use_distributed',
        action='store_true',
        help='Whether to use distributed training (distillation) or not.')
    parser.add_argument('--local_rank',
                        type=int,
                        default=-1,
                        metavar='N',
                        help='Local process rank.')
    params = parser.parse_args()

    if params.doc_stride >= params.max_sequence_len - params.max_query_len:
        logger.warning(
            "WARNING - You've set a doc stride which may be larger than the "
            'document length in some examples. This could result in errors when '
            'building features from the examples. Please reduce the doc stride '
            'or increase the maximum length to ensure the features are '
            'correctly built.')

    if not params.use_distributed:
        params.local_rank = 0
        params.train_batch_size = params.per_gpu_train_batch_size
        params.eval_batch_size = params.per_gpu_eval_batch_size
    else:
        params.num_gpus = torch.cuda.device_count()
        params.train_batch_size = params.per_gpu_train_batch_size * params.num_gpus
        params.eval_batch_size = params.per_gpu_eval_batch_size * params.num_gpus
    params.is_master = params.local_rank == 0

    if params.use_cuda:
        device = torch.device('cuda', params.local_rank)
    else:
        device = torch.device('cpu')

    if Path(params.output_dir).is_dir() and not params.force:
        raise ValueError(
            f'Output directory {params.output_dir} already exists. Use `--force` if you want to overwrite it.'
        )
    if params.is_master:
        Path(params.output_dir).mkdir(parents=True, exist_ok=params.force)

        # dump params
        json.dump(vars(params),
                  open(Path(params.output_dir) / 'params.json', 'w'),
                  indent=4,
                  sort_keys=True)
    params.squad_dir = Path(params.squad_dir)
    params.output_dir = Path(params.output_dir)
    params.device = device

    # initialize multi-GPU
    if params.use_distributed:
        if params.is_master:
            logger.info('Initializing PyTorch distributed')
        torch.cuda.set_device(params.local_rank)
        dist.init_process_group(backend='nccl', init_method='env://')

    # set seed(s)
    if params.is_master:
        logger.info('Setting random seed(s)')
    random.seed(params.seed)
    np.random.seed(params.seed)
    torch.manual_seed(params.seed)
    if params.use_distributed:
        torch.cuda.manual_seed_all(params.seed)

    # initialize the tokenizer
    if params.is_master:
        logger.info('Initializing the tokenizer')
    tokenizer = BertTokenizer.from_pretrained(
        params.tokenizer_vocab_file, do_lower_case=params.do_lower_case)

    # initialize the model
    if params.is_master:
        logger.info('Initializing the model')
    config = DistilBertConfig.from_pretrained(params.config_file)
    model = DistilBertForQuestionAnswering.from_pretrained(params.weights_file,
                                                           config=config)

    # send model to device
    model = model.to(params.device)

    # perform the training
    if params.do_train:
        # initialize the training dataset
        if params.is_master:
            logger.info('Initializing the training dataset')
        train_dataset = load_and_cache_examples(
            squad_dir=params.squad_dir,
            split='train',
            tokenizer=tokenizer,
            max_sequence_len=params.max_sequence_len,
            max_query_len=params.max_query_len,
            doc_stride=params.doc_stride,
            output_examples=False,
            overwrite_cache=params.overwrite_cache,
            is_master=params.is_master)

        # initialize the sampler
        if params.is_master:
            logger.info('Initializing the training sampler')
        train_sampler = DistributedSampler(
            train_dataset) if params.use_distributed else RandomSampler(
                train_dataset)

        # initialize the dataloader
        if params.is_master:
            logger.info('Initializing the training dataloader')
        train_dataloader = DataLoader(dataset=train_dataset,
                                      sampler=train_sampler,
                                      batch_size=params.train_batch_size)

        # initialize the optimizer
        if params.is_master:
            logger.info('Initializing the optimizer')
        optimizer = optim.AdamW(
            model.parameters(),
            lr=params.learning_rate,
            eps=params.epsilon,
        )

        # initialize the learning rate scheduler
        if params.is_master:
            logger.info('Initializing the learning rate scheduler')
        num_steps_epoch = len(train_dataloader)
        num_train_steps = math.ceil(num_steps_epoch /
                                    params.num_gradient_accumulation_steps *
                                    params.num_epochs)
        num_warmup_steps = params.num_warmup_steps

        def lr_lambda(current_step):
            if current_step < num_warmup_steps:
                return float(current_step) / float(max(1, num_warmup_steps))
            return max(
                0.0,
                float(num_train_steps - current_step) /
                float(max(1, num_train_steps - num_warmup_steps)))

        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                                   lr_lambda=lr_lambda,
                                                   last_epoch=-1)

        # initialize distributed data parallel (DDP)
        if params.use_distributed:
            if params.is_master:
                logger.info('Initializing DDP')
            model = DDP(model,
                        device_ids=[params.local_rank],
                        output_device=params.local_rank,
                        find_unused_parameters=True)

        # start training
        if params.is_master:
            logger.info('Starting the training')
        train(model=model,
              num_epochs=params.num_epochs,
              dataloader=train_dataloader,
              optimizer=optimizer,
              lr_scheduler=lr_scheduler,
              num_gradient_accumulation_steps=params.
              num_gradient_accumulation_steps,
              max_gradient_norm=params.max_gradient_norm,
              device=params.device,
              local_rank=params.local_rank,
              use_distributed=params.use_distributed,
              is_master=params.is_master,
              use_tqdm=True,
              logger=logger)

        # save the finetuned model
        if params.is_master:
            # take care of distributed training
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.config.architectures = [
                model_to_save.__class__.__name__
            ]

            logger.info('Saving the finetuned model config')
            json.dump(vars(model_to_save.config),
                      open(params.output_dir /
                           TRAINED_CONFIG_FILE_TEMPLATE.format(
                               model_name=model_to_save.__class__.__name__),
                           mode='w'),
                      indent=4,
                      sort_keys=True)

            logger.info('Saving the finetuned model weights')
            torch.save(
                model_to_save.state_dict(),
                params.output_dir / TRAINED_WEIGHTS_FILE_TEMPLATE.format(
                    model_name=model_to_save.__class__.__name__))

            # reload the model
            if params.do_eval:
                if params.is_master:
                    logger.info('Reloading the model')
                config = DistilBertConfig.from_pretrained(
                    str(params.output_dir /
                        TRAINED_CONFIG_FILE_TEMPLATE.format(
                            model_name=model_to_save.__class__.__name__)))
                model = DistilBertForQuestionAnswering.from_pretrained(
                    str(params.output_dir /
                        TRAINED_WEIGHTS_FILE_TEMPLATE.format(
                            model_name=model_to_save.__class__.__name__)),
                    config=config)
                model = model.to(params.device)

    # perform the evaluation
    if params.do_eval and params.is_master:
        # initialize the training dataset
        logger.info('Initializing the evaluation dataset')
        eval_dataset, examples, features = load_and_cache_examples(
            squad_dir=params.squad_dir,
            split='dev',
            tokenizer=tokenizer,
            max_sequence_len=params.max_sequence_len,
            max_query_len=params.max_query_len,
            doc_stride=params.doc_stride,
            output_examples=True,
            overwrite_cache=params.overwrite_cache,
            is_master=params.is_master)

        # initialize the sampler
        logger.info('Initializing the evaluation sampler')
        eval_sampler = SequentialSampler(eval_dataset)

        # initialize the dataloader
        logger.info('Initializing the evaluation dataloader')
        eval_dataloader = DataLoader(dataset=eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=params.eval_batch_size)

        # start evaluating
        logger.info('Starting the evaluation')
        results = evaluate(output_dir=params.output_dir,
                           model=model,
                           tokenizer=tokenizer,
                           max_answer_len=params.max_answer_len,
                           do_lower_case=params.do_lower_case,
                           dataloader=eval_dataloader,
                           examples=examples,
                           features=features,
                           device=params.device,
                           local_rank=params.local_rank,
                           use_tqdm=True)

        # log results
        logger.info('Evaluation results:')
        for key, result in results.items():
            logger.info(f' {key}: {result}')

        # dump results
        json.dump(results,
                  open(
                      params.output_dir / RESULTS_FILE_TEMPLATE.format(
                          model_name=model.__class__.__name__), 'w'),
                  indent=4)

    if params.is_master:
        logger.info('Done')
Example #12
def main():
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased")
    if args.do_finetune:
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        for name, param in model.named_parameters():
            if name.startswith("distilbert.embeddings."):
                param.requires_grad = False
            for i in range(args.freeze_layer):
                if name.startswith("distilbert.transformer.layer.%s." % i):
                    param.requires_grad = False
        # NOTE: this return exits main() immediately, so the training and
        # evaluation blocks below never run in finetune mode.
        return
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets,
                                       args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets,
                                            args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                              args.eval_dir, tokenizer,
                                              split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   eval_loader,
                                                   eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
Example #13
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering

checkpoint_path = "/Users/minhdang/Desktop/SEPM-Team24/robustqa/save/tapt_distilBert-01/checkpoint"
model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

model.save_pretrained(
    "/Users/minhdang/Desktop/SEPM-Team24/robustqa/robustqa-tapt")
tokenizer.save_pretrained(
    "/Users/minhdang/Desktop/SEPM-Team24/robustqa/robustqa-tapt")
Example #14
from app_models import MODEL_PATH, ensure_models
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer
from numpy import inf as INFINITY
from torch import tensor, argmax

ensure_models()

model = DistilBertForQuestionAnswering.from_pretrained(MODEL_PATH)
'''
# # Separate question and context
# def separate_question_and_context(input_ids: list, tokenizer: AutoTokenizer):
# 	# The context begins right after this index
# 	sep_index = input_ids.index(tokenizer.sep_token_id)

# 	# Boolean mask for differentiating context and question
# 	segment_ids = (
# 		'0' * (sep_index + 1)
# 		+ '1' * (len(input_ids) - sep_index - 1)
# 	).split('')

# 	return segment_ids

# # Preprocessing
# def preprocessing(context: str, question: str):
# 	# Load tokenizer
# 	tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# 	# Encode input
# 	input_ids = tokenizer.encode(question, context)

# 	# Get mask
'''

Example #15
dev_dataset = SquadDataset(dev_encodings)

from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts,
                            train_questions,
                            truncation=True,
                            padding=True)
dev_encodings = tokenizer(dev_contexts,
                          dev_questions,
                          truncation=True,
                          padding=True)

from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained(
    "distilbert-base-uncased")

from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',  # output directory
    num_train_epochs=1,  # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,  # batch size for evaluation
    warmup_steps=100,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./logs',  # directory for storing logs
    logging_steps=10,
    load_best_model_at_end=True,
    save_strategy="steps",
    logging_strategy="steps",
Example #16
def main():
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)

    #### Change Made By Xuran Wang: Comment out original lines #######

    # model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    # tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    #### Change End #######



    #### Change Made By Xuran Wang: Add custom lines #######

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    finetuned_model_path = 'save/baseline-01/'

    #### Change End #######


    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")


        #### Change Made By Xuran Wang: Add custom lines #######

        checkpoint_path = os.path.join(finetuned_model_path, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)

        #### Change End #######

        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        model.to(args.device)

        trainer = Trainer(args, log)

        #### Change Made By Xuran Wang: Add custom lines, comment out original line #######

        # train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')

        # NOTE: train_fraction is assumed to be defined elsewhere in this module.
        train_dataset, _ = get_dataset_eda_revised(args, args.train_datasets, args.train_dir, tokenizer, 'train', train_fraction)

        #### Change End #######

        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                batch_size=args.batch_size,
                                sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model, eval_loader,
                                                   eval_dict, return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
Example #17
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
from torch.utils.mobile_optimizer import optimize_for_mobile

tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased-distilled-squad')
model = DistilBertForQuestionAnswering.from_pretrained(
    'distilbert-base-uncased-distilled-squad')
model.eval()

question, text = "When will support for GPU be available?!", "There is a growing need to execute ML models on edge devices to reduce latency, preserve privacy and enable new interactive use cases. In the past, engineers used to train models separately. They would then go through a multi-step, error prone and often complex process to transform the models for execution on a mobile device. The mobile runtime was often significantly different from the operations available during training leading to inconsistent developer and eventually user experience. PyTorch Mobile removes these friction surfaces by allowing a seamless process to go from training to deployment by staying entirely within the PyTorch ecosystem. It provides an end-to-end workflow that simplifies the research to production environment for mobile devices. In addition, it paves the way for privacy-preserving features via Federated Learning techniques. PyTorch Mobile is in beta stage right now and in wide scale production use. It will soon be available as a stable release once the APIs are locked down. Key features of PyTorch Mobile: Available for iOS, Android and Linux; Provides APIs that cover common preprocessing and integration tasks needed for incorporating ML in mobile applications; Support for tracing and scripting via TorchScript IR; Support for XNNPACK floating point kernel libraries for Arm CPUs; Integration of QNNPACK for 8-bit quantized kernels. Includes support for per-channel quantization, dynamic quantization and more; Build level optimization and selective compilation depending on the operators needed for user applications, i.e., the final binary size of the app is determined by the actual operators the app needs; Support for hardware backends like GPU, DSP, NPU will be available soon."
# inputs['input_ids'].size() is 360, the maximum size of the input tokens generated from the user question and text
# on mobile apps, if the size of the input tokens of the text and question is less than 360, padding will be needed to make the model work correctly.

inputs = tokenizer(question, text, return_tensors='pt')
model_dynamic_quantized = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)
traced_model = torch.jit.trace(model_dynamic_quantized,
                               inputs['input_ids'],
                               strict=False)
optimized_traced_model = optimize_for_mobile(traced_model)
torch.jit.save(optimized_traced_model, "qa360_quantized.pt")
# 360 is the length of model input, i.e. the length of the tokenized ids of question+text
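
# A hypothetical smoke test that the saved mobile model loads and runs (the
# output structure depends on the transformers version used for tracing):
loaded = torch.jit.load("qa360_quantized.pt")
with torch.no_grad():
    out = loaded(inputs['input_ids'])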
Example #18
"""
SMALL / MEDIUM / DISTIL BASE
"""

from transformers import BertForQuestionAnswering, BertTokenizer, DistilBertForQuestionAnswering, DistilBertTokenizer

bert_small_model = BertForQuestionAnswering.from_pretrained('mrm8488/bert-small-finetuned-squadv2')
bert_small_tokenizer = BertTokenizer.from_pretrained('mrm8488/bert-small-finetuned-squadv2')
print("Bert Small loaded...")

bert_med_model = BertForQuestionAnswering.from_pretrained('mrm8488/bert-medium-finetuned-squadv2')
bert_med_tokenizer = BertTokenizer.from_pretrained('mrm8488/bert-medium-finetuned-squadv2')
print("Bert Medium loaded...")

distil_bert_model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
distil_bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
print("DistilBert loaded...")

"""
ALBERT
"""

from transformers import AlbertTokenizer, AlbertForQuestionAnswering

albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
albert_model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')

"""#### Answer function"""

def answer_question(question, text, alpha=.5):
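    # A minimal, assumed body: answer with the DistilBERT model loaded above;
    # `alpha` is accepted but unused in this sketch.
    import torch  # torch is not imported earlier in this snippet
    inputs = distil_bert_tokenizer(question, text, return_tensors='pt',
                                   truncation=True)
    outputs = distil_bert_model(**inputs)
    start = torch.argmax(outputs.start_logits)
    end = torch.argmax(outputs.end_logits) + 1
    return distil_bert_tokenizer.decode(inputs['input_ids'][0][start:end],
                                        skip_special_tokens=True)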