def main():
    """Train and/or evaluate a DistilBERT QA model together with a domain
    discriminator, writing a CSV submission file after evaluation."""
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)

    model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        # Fix: resolve the device BEFORE the resume branch; previously
        # `model.to(args.device)` executed before `args.device` was assigned
        # (the assignment sat further down the function).
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        if args.resume_training:
            # Resume from the checkpoint left by a previous run in save_dir.
            checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
            model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
            model.to(args.device)
        else:
            # Fresh run: allocate a new, uniquely named save directory.
            args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir,
                                       tokenizer, 'train', args.outdomain_data_repeat)
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir,
                                            tokenizer, 'val', args.outdomain_data_repeat)
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        # NOTE(review): validation uses RandomSampler; order should be
        # irrelevant as long as trainer.evaluate keys predictions by example
        # id (val_dict) — confirm against Trainer.
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=RandomSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)

    if args.do_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        # Discriminator input is one 768-dim hidden vector, or the flattened
        # 384-position sequence when full adversarial training is enabled.
        discriminator_input_size = 768
        if args.full_adv:
            discriminator_input_size = 384 * 768
        discriminator = DomainDiscriminator(input_size=discriminator_input_size)
        # Use os.path.join instead of string concatenation for the weights path.
        discriminator.load_state_dict(
            torch.load(os.path.join(checkpoint_path, 'discriminator')))
        model.to(args.device)
        discriminator.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir,
                                              tokenizer, split_name,
                                              args.outdomain_data_repeat)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model, discriminator, eval_loader,
                                                   eval_dict, return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
def main():
    """Fine-tune a DistilBERT QA model starting from a previously saved
    baseline checkpoint, optionally re-initializing the last transformer
    layers, then evaluate and write a CSV submission file."""
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)

    #### Change Made By Xuran Wang: Comment out original lines #######
    # model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    # tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    #### Change End #######

    #### Change Made By Xuran Wang: Add custom lines #######
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    # Directory of the already-finetuned baseline whose checkpoint seeds this run.
    finetuned_model_path = 'save/baseline-01/'
    #### Change End #######

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")

        #### Change Made By Xuran Wang: Add custom lines #######
        # Warm-start from the baseline checkpoint instead of the hub weights.
        checkpoint_path = os.path.join(finetuned_model_path, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        #### Change End #######

        '''###'''
        # if args.reinit_pooler:
        #     encoder_temp = getattr(model, "distilbert")  # Equivalent to model.distilbert
        #     encoder_temp.pooler.dense.weight.data.normal_(mean=0.0, std=encoder_temp.config.initializer_range)
        #     encoder_temp.pooler.dense.bias.data.zero_()  # The change of encoder_temp would affect the model
        #     for p in encoder_temp.pooler.parameters():
        #         p.requires_grad = True

        if args.reinit_layers > 0:
            import torch.nn as nn
            from transformers.models.distilbert.modeling_distilbert import MultiHeadSelfAttention, FFN
            # model_distilbert = getattr(model, "distilbert")  # model.distilbert; change of model_distilbert affects model!
            # Reinitialization for the last few layers
            for layer in model.distilbert.transformer.layer[-args.reinit_layers:]:
                for module in layer.modules():
                    # print(module)
                    model.distilbert._init_weights(module)  # It's the line equivalent to below approach
                    # if isinstance(module, nn.modules.linear.Linear):  # Original form for nn.Linear
                    #     # model.config.initializer_range == model.distilbert.config.initializer_range => True
                    #     module.weight.data.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                    #     if module.bias is not None:
                    #         module.bias.data.zero_()
                    # elif isinstance(module, nn.modules.normalization.LayerNorm):
                    #     module.weight.data.fill_(1.0)
                    #     module.bias.data.zero_()
                    # elif isinstance(module, FFN):
                    #     for param in [module.lin1, module.lin2]:
                    #         param.weight.data.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                    #         if param.bias is not None:
                    #             param.bias.data.zero_()
                    # elif isinstance(module, MultiHeadSelfAttention):
                    #     for param in [module.q_lin, module.k_lin, module.v_lin, module.out_lin]:
                    #         param.data.weight.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                    #         if param.bias is not None:
                    #             param.bias.data.zero_()

        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        model.to(args.device)
        trainer = Trainer(args, log)

        #### Change Made By Xuran Wang: Add custom lines, comment out original line #######
        # train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')
        # NOTE(review): `train_fraction` is not defined anywhere in this
        # function — presumably a module-level constant; verify it exists.
        train_dataset, _ = get_dataset_eda_revised(args, args.train_datasets, args.train_dir,
                                                   tokenizer, 'train', train_fraction)
        #### Change End #######

        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir,
                                            tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)

    if args.do_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        # Evaluate the checkpoint written by trainer.train above.
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir,
                                              tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model, eval_loader, eval_dict,
                                                   return_preds=True, split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
def main():
    """ Main function

    Answers a question with a pre-trained DistilBERT SQuAD model. The
    question/context pair comes from the command line, or — when absent —
    from a (possibly random) record of the SQuAD dev-2.0 dataset.
    """
    # Parse cmd line arguments
    args = nlp_parser.parse_arguments()

    source = ""
    subject = ""
    context = ""
    question = ""
    answer = ""

    # Setup the question, either from a specified SQuAD record
    # or from cmd line arguments.
    # If no question details are provided, a random
    # SQuAD example will be chosen.
    if args["question"] is not None:
        question = args["question"]
        if args["text"] is not None:
            # Free-text context supplied as a file path.
            source = args["text"]
            with open(source, "r") as text_file_handle:
                context = text_file_handle.read()
        else:
            # Look the question up in SQuAD to recover its context.
            print("No text provided, searching SQuAD dev-2.0 dataset")
            squad_data = nlp.import_squad_data()
            squad_records = squad_data.loc[squad_data["question"] == question]
            if squad_records.empty:
                sys.exit(
                    "Question not found in SQuAD data, please provide context using `--text`."
                )
            subject = squad_records["subject"].iloc[0]
            context = squad_records["context"].iloc[0]
            question = squad_records["question"].iloc[0]
            answer = squad_records["answer"]
    else:
        squad_data = nlp.import_squad_data()
        if args["squadid"] is not None:
            source = args["squadid"]
            squad_records = squad_data.loc[squad_data["id"] == source]
            i_record = 0
        else:
            if args["subject"] is not None:
                print(
                    "Picking a question at random on the subject: ",
                    args["subject"],
                )
                squad_records = squad_data.loc[
                    squad_data["subject"] == args["subject"]
                ]
            else:
                print(
                    "No SQuAD ID or question provided, picking one at random!"
                )
                squad_records = squad_data
            n_records = len(squad_records.index)
            i_record = random.randint(0, max(0, n_records - 1))
        if squad_records.empty:
            sys.exit(
                "No questions found in SQuAD data, please provide valid ID or subject."
            )
        # NOTE(review): this re-randomization overwrites the i_record chosen
        # above (including the i_record = 0 of the --squadid branch, harmless
        # only when the ID matches a single row) — confirm it is intentional.
        n_records = len(squad_records.index)
        i_record = random.randint(0, n_records - 1)
        source = squad_records["id"].iloc[i_record]
        subject = squad_records["subject"].iloc[i_record]
        context = squad_records["context"].iloc[i_record]
        question = squad_records["question"].iloc[i_record]
        answer = squad_records["answer"].iloc[i_record]

    # DistilBERT question answering using pre-trained model.
    token = DistilBertTokenizer.from_pretrained(
        "distilbert-base-uncased", return_token_type_ids=True
    )
    model = DistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased-distilled-squad"
    )
    encoding = token.encode_plus(question, context)
    input_ids, attention_mask = (
        encoding["input_ids"],
        encoding["attention_mask"],
    )
    # return_dict=False yields the legacy (start_logits, end_logits) tuple.
    start_scores, end_scores = model(
        torch.tensor([input_ids]),
        attention_mask=torch.tensor([attention_mask]),
        return_dict=False,
    )
    # Best span = argmax of start logits through argmax of end logits.
    answer_ids = input_ids[
        torch.argmax(start_scores) : torch.argmax(end_scores) + 1
    ]
    answer_tokens = token.convert_ids_to_tokens(
        answer_ids, skip_special_tokens=True
    )
    answer_tokens_to_string = token.convert_tokens_to_string(answer_tokens)

    # Display results
    print("\nDistilBERT question answering example.")
    print("======================================")
    print("Reading from: ", subject, source)
    print("\nContext: ", context)
    print("--")
    print("Question: ", question)
    print("Answer: ", answer_tokens_to_string)
    print("Reference Answers: ", answer)
return contexts, questions, ids  # tail of read_squad(); its `def` lies above this excerpt

# Select the compute device; pin a specific GPU index when CUDA is present.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    torch.cuda.set_device(DEVICE_ID)  # use an unoccupied GPU

''' load data '''
val_contexts, val_questions, val_ids = read_squad('data/dev-v2.0.json')

''' tokenizers and models '''
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained(
    'distilbert-base-uncased').to(device)
# Restore locally fine-tuned weights; map_location keeps CPU-only hosts working.
model.load_state_dict(
    torch.load(os.path.join('model_weights', f'distilBERT_epoch_{NUM_EPOCH}.pt'),
               map_location=device))
model.eval()

# Predictions keyed by example id (populated in the loop below).
res = dict()
with torch.no_grad():
    for i, (context, question, id) in tqdm(enumerate(zip(val_contexts, val_questions, val_ids))):
        # NOTE(review): loop body continues beyond this excerpt.
        encoding = tokenizer(context, question, return_tensors='pt', truncation=True)
# argument parsing app = Flask(__name__) api = Api(app) parser = reqparse.RequestParser() parser.add_argument('question') N_HITS = 10 # TODO: Analyse the hard-coded keywords and assess if anything needs to change here. KEYWORDS = '' # LUCENE_DATABASE_DIR = '/mnt/lucene-database' LUCENE_DATABASE_PATH = 'lucene-index-covid-2020-04-10' # Load these models locally - distilbert-base-uncased-distilled-squad DISTILBERT_MODEL_PATH = 'distilbert-base-uncased-distilled-squad' model = DistilBertForQuestionAnswering.from_pretrained(DISTILBERT_MODEL_PATH) tokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_MODEL_PATH) # document = "Victoria has a written constitution enacted in 1975, but based on the 1855 colonial constitution, passed by the United Kingdom Parliament as the Victoria Constitution Act 1855, which establishes the Parliament as the state's law-making body for matters coming under state responsibility. The Victorian Constitution can be amended by the Parliament of Victoria, except for certain 'entrenched' provisions that require either an absolute majority in both houses, a three-fifths majority in both houses, or the approval of the Victorian people in a referendum, depending on the provision." # input_ids = tokenizer.encode('Why is this strange thing here?') start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) # start_scores, end_scores = model(torch.tensor([input_ids[:512]])) def makeBERTSQuADPrediction(model, document, question): input_ids = tokenizer.encode(question, document) tokens = tokenizer.convert_ids_to_tokens(input_ids) sep_index = input_ids.index(tokenizer.sep_token_id) num_seg_a = sep_index + 1 num_seg_b = len(input_ids) - num_seg_a
def __init__(self):
    """Load the pre-trained DistilBERT tokenizer and SQuAD-distilled QA model."""
    tokenizer_name = 'distilbert-base-uncased'
    model_name = 'distilbert-base-uncased-distilled-squad'
    self.tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_name, return_token_type_ids=True)
    self.model = DistilBertForQuestionAnswering.from_pretrained(model_name)
def __init__(self, *args, **kwargs):
    """Initialize the parent, then load the QA model and tokenizer from the
    directory configured in ``self.model_dir``."""
    super().__init__(*args, **kwargs)
    weights_dir = self.model_dir
    self.model = DistilBertForQuestionAnswering.from_pretrained(weights_dir)
    self.tokenizer = DistilBertTokenizer.from_pretrained(weights_dir)
class BERTQA:
    """Extractive question answering with DistilBERT distilled on SQuAD.

    Long documents are chunked to fit the 512-token window; each chunk is
    scored independently and answers are ranked by model confidence.
    """

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                    return_token_type_ids=True)
    model = DistilBertForQuestionAnswering.from_pretrained(
        'distilbert-base-uncased-distilled-squad')

    MAX_TOKENS = 512
    MAX_TOKENS_QUESTION = 30
    MAX_TOKENS_DOCUMENT = MAX_TOKENS - MAX_TOKENS_QUESTION - 2  # [SEP] and [CLS]

    def __init__(self):
        pass

    def get_token_length(self, string):
        """Return the number of wordpiece tokens (special tokens included)."""
        tokens = self.tokenizer.encode(string)
        return len(tokens)

    def chunk_document(self, document, re_consolidate=True):
        '''Chunks up a long document into optimally large pieces so that they
        can be passed to BERT. Activating `re_consolidate` will put the chunks
        back together to make them as large as possible for improved
        performance.
        '''
        document_length = self.get_token_length(document)
        if document_length <= self.MAX_TOKENS_DOCUMENT:
            return [document]
        # Split on paragraphs first, falling back to sentences for overlong
        # paragraphs; sentences that are still too long are skipped.
        approved_chunks = []
        paragraphs = [par for par in document.split('\n') if par]
        for paragraph in paragraphs:
            paragraph_length = self.get_token_length(paragraph)
            if paragraph_length > self.MAX_TOKENS_DOCUMENT:
                sentences = [sen for sen in paragraph.split('.') if sen]
                for sentence in sentences:
                    sentence_length = self.get_token_length(sentence)
                    if sentence_length > self.MAX_TOKENS_DOCUMENT:
                        print("Ignoring overlong sentence.")
                    else:
                        approved_chunks.append(sentence)
            else:
                approved_chunks.append(paragraph)
        if not re_consolidate:
            return approved_chunks
        # Greedily merge consecutive chunks up to the token budget.
        lengths = [self.get_token_length(chunk) for chunk in approved_chunks]
        consolidated_chunks = []
        running_length = 0
        current_chunk = ''
        for chunk, length in zip(approved_chunks, lengths):
            if (running_length + length) < self.MAX_TOKENS_DOCUMENT:
                current_chunk += chunk
                running_length += length
            else:
                consolidated_chunks.append(current_chunk)
                current_chunk = chunk
                running_length = length
        # Fix: the final buffered chunk was previously dropped, losing the
        # tail of every consolidated document.
        if current_chunk:
            consolidated_chunks.append(current_chunk)
        return consolidated_chunks

    def answer_question(self, question, document):
        '''Takes a `question` string and an `document` string (which contains
        the answer), and identifies the words within the `document` that are
        the answer.

        Returns ``(answer, confidence)``; ``('<NO ANSWER>', -10)`` when the
        model selects no span.

        Raises ValueError when either input exceeds its token budget.
        '''
        question_length = self.get_token_length(question)
        document_length = self.get_token_length(document)
        if question_length > self.MAX_TOKENS_QUESTION:
            msg = f'Question exceeds max token length ({str(question_length)}).'
            raise ValueError(msg)
        if document_length > self.MAX_TOKENS_DOCUMENT:
            msg = f'Document exceeds max token length ({str(document_length)}).'
            raise ValueError(msg)
        encoding = self.tokenizer.encode_plus(question, document)
        input_ids, attention_mask = encoding["input_ids"], encoding[
            "attention_mask"]
        start_scores, end_scores = self.model(torch.tensor([input_ids]),
                                              attention_mask=torch.tensor(
                                                  [attention_mask]))
        # Confidence is the larger of the best start/end logits.
        confidence = float(max(torch.max(start_scores), torch.max(end_scores)))
        ans_tokens = input_ids[torch.argmax(start_scores
                                            ):torch.argmax(end_scores) + 1]
        answer_tokens = self.tokenizer.convert_ids_to_tokens(
            ans_tokens, skip_special_tokens=True)
        if not answer_tokens:
            # TODO Understand this bug
            return '<NO ANSWER>', -10
        # Stitch wordpieces ('##'-prefixed continuations) back into words.
        answer = answer_tokens[0]
        for token in answer_tokens[1:]:
            if token[0:2] == '##':
                answer += token[2:]
            else:
                answer += ' ' + token
        return answer, confidence

    def answer_question_chunked(self, question, document, re_consolidate=True):
        """Answer `question` over a document of any length by chunking it,
        scoring each chunk, and returning responses sorted by confidence
        (descending)."""
        # Fix: forward the caller's re_consolidate instead of hard-coding True.
        chunks = self.chunk_document(document, re_consolidate=re_consolidate)
        responses = []
        for chunk in tqdm(chunks):
            answer, confidence = self.answer_question(question, chunk)
            response = {
                'answer': answer,
                'confidence': confidence,
                'chunk': chunk
            }
            responses.append(response)
        responses.sort(key=lambda x: -x['confidence'])
        return responses
def do_prediction(model_dir):
    """Run SQuAD-style inference with the fine-tuned model in `model_dir`
    over data/sfu.json and write prediction/n-best/null-odds JSON files
    back into `model_dir`."""
    # 1. Load a trained model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = DistilBertForQuestionAnswering.from_pretrained(model_dir)
    model.to(device)
    model.eval()

    # 2. Load and pre-process the test set
    dev_file = "data/sfu.json"
    predict_batch_size = 2
    max_seq_length = 384
    eval_examples = read_squad_examples(input_file=dev_file,
                                        is_training=False,
                                        version_2_with_negative=False)
    tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
    eval_features = convert_examples_to_features(examples=eval_examples,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=max_seq_length,
                                                 doc_stride=128,
                                                 max_query_length=64,
                                                 is_training=False)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    # Feature index, used to map batched logits back to their features.
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_example_index)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=predict_batch_size)

    # 3. Run inference on the test set
    all_results = []
    for input_ids, input_mask, segment_ids, example_indices in tqdm(
            eval_dataloader):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            # NOTE(review): tuple-unpacking the model output assumes the
            # legacy (start_logits, end_logits) return format of older
            # transformers versions — confirm against the pinned version.
            batch_start_logits, batch_end_logits = model(input_ids, input_mask)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))
    output_prediction_file = os.path.join(model_dir, "predictions_sfu.json")
    output_nbest_file = os.path.join(model_dir, "nbest_predictions_sfu.json")
    output_null_log_odds_file = os.path.join(model_dir, "null_odds_sfu.json")
    preds = write_predictions(eval_examples, eval_features, all_results, 20, 30,
                              True, output_prediction_file, output_nbest_file,
                              output_null_log_odds_file, True, False, 0.0)
def main():
    """Train and/or evaluate a DistilBERT QA model, logging the run and the
    resulting checkpoint to Weights & Biases, then write a CSV submission."""
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained(args.model_checkpoint)
    tokenizer = DistilBertTokenizerFast.from_pretrained(args.model_checkpoint)

    with wandb.init(project="qa-system", config=args) as run:
        run.name = args.run_name
        wandb.watch(model)
        if args.do_train:
            if not os.path.exists(args.save_dir):
                os.makedirs(args.save_dir)
            args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
            log = util.get_logger(args.save_dir, 'log_train')
            log.info(
                f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
            log.info("Preparing Training Data...")
            args.device = torch.device(
                'cuda') if torch.cuda.is_available() else torch.device('cpu')
            trainer = Trainer(args, log)
            train_dataset, _ = get_dataset(args, args.train_datasets,
                                           args.train_dir, tokenizer, 'train')
            log.info("Preparing Validation Data...")
            val_dataset, val_dict = get_dataset(args, args.val_datasets,
                                                args.val_dir, tokenizer, 'val')
            train_loader = DataLoader(train_dataset,
                                      batch_size=args.batch_size,
                                      sampler=RandomSampler(train_dataset))
            val_loader = DataLoader(val_dataset,
                                    batch_size=args.batch_size,
                                    sampler=SequentialSampler(val_dataset))
            best_scores = trainer.train(model, train_loader, val_loader, val_dict)
            # Publish the best checkpoint as a W&B model artifact.
            model_artifact = wandb.Artifact(
                args.run_name,
                type="model",
            )
            model_artifact.add_dir(os.path.join(args.save_dir, 'checkpoint'))
            run.log_artifact(model_artifact)

        if args.do_eval:
            args.device = torch.device(
                'cuda') if torch.cuda.is_available() else torch.device('cpu')
            split_name = 'test' if 'test' in args.eval_dir else 'validation'
            log = util.get_logger(args.save_dir, f'log_{split_name}')
            trainer = Trainer(args, log)
            # An explicit --checkpoint_path overrides the run's own checkpoint.
            if args.checkpoint_path != "":
                model = DistilBertForQuestionAnswering.from_pretrained(
                    args.checkpoint_path)
            else:
                checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
                model = DistilBertForQuestionAnswering.from_pretrained(
                    checkpoint_path)
            model.to(args.device)
            eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                                  args.eval_dir, tokenizer,
                                                  split_name)
            eval_loader = DataLoader(eval_dataset,
                                     batch_size=args.batch_size,
                                     sampler=SequentialSampler(eval_dataset))
            eval_preds, eval_scores = trainer.evaluate(model, eval_loader,
                                                       eval_dict,
                                                       return_preds=True,
                                                       split=split_name)
            results_str = ', '.join(f'{k}: {v:05.2f}'
                                    for k, v in eval_scores.items())
            log.info(f'Eval {results_str}')
            # Write submission file
            sub_path = os.path.join(args.save_dir,
                                    split_name + '_' + args.sub_file)
            log.info(f'Writing submission file to {sub_path}...')
            with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
                csv_writer = csv.writer(csv_fh, delimiter=',')
                csv_writer.writerow(['Id', 'Predicted'])
                for uuid in sorted(eval_preds):
                    csv_writer.writerow([uuid, eval_preds[uuid]])
def main():
    """Fine-tune (and/or evaluate) a DistilBERT SQuAD model, with optional
    multi-GPU distributed training via NCCL/DDP."""
    parser = ArgumentParser('SQuAD evaluation example')
    parser.add_argument(
        '--squad_dir',
        type=str,
        metavar='PATH',
        required=True,
        help='Path to directory containing the SQuAD data (JSON-files).')
    parser.add_argument(
        '--output_dir',
        type=str,
        metavar='PATH',
        required=True,
        help=
        'Path to the output directory (for logs, checkpoints, parameters, etc.).'
    )
    parser.add_argument('-f',
                        '--force',
                        action='store_true',
                        help='Overwrite `output_dir` if it already exists.')
    parser.add_argument('--do_train',
                        action='store_true',
                        help='Whether to run training.')
    parser.add_argument('--do_eval',
                        action='store_true',
                        help='Whether to run eval (on the dev set).')
    parser.add_argument('--config_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the model configuration.')
    parser.add_argument('--weights_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the model initialization weights.')
    parser.add_argument('--tokenizer_vocab_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the tokenizer vocabulary.')
    parser.add_argument('--overwrite_cache',
                        action='store_true',
                        help='Overwrite the cache if it already exists.')
    parser.add_argument('--max_sequence_len',
                        type=int,
                        default=384,
                        metavar='N',
                        help='The maximum length of a sequence.')
    parser.add_argument('--max_query_len',
                        type=int,
                        default=64,
                        help='The maximum length of a question.')
    parser.add_argument('--max_answer_len',
                        type=int,
                        default=30,
                        help='The maximum length of an answer.')
    parser.add_argument(
        '--doc_stride',
        type=int,
        default=128,
        help=
        'The stride to take between chunks when splitting a large document.')
    parser.add_argument('--do_lower_case',
                        action='store_true',
                        help='Whether to lowercase the input when tokenizing.')
    parser.add_argument('-n',
                        '--num_epochs',
                        type=int,
                        default=3,
                        metavar='N',
                        help='The number of distillation epochs.')
    parser.add_argument('--per_gpu_train_batch_size',
                        type=int,
                        default=8,
                        metavar='N',
                        help='The batch size per GPU used during training.')
    parser.add_argument('--per_gpu_eval_batch_size',
                        type=int,
                        default=8,
                        metavar='N',
                        help='The batch size per GPU used during evaluation.')
    parser.add_argument('--learning_rate',
                        default=3e-5,
                        type=float,
                        help='The initial learning rate for Adam.')
    parser.add_argument('--epsilon',
                        default=1e-8,
                        type=float,
                        help="Adam's epsilon.")
    parser.add_argument('--num_warmup_steps',
                        default=0,
                        type=int,
                        help='Linear warmup over `warmup_steps`.')
    parser.add_argument(
        '--num_gradient_accumulation_steps',
        type=int,
        default=1,
        metavar='N',
        help=
        'The number of gradient accumulation steps (for larger batch sizes).')
    parser.add_argument('--max_gradient_norm',
                        type=float,
                        default=1.0,
                        metavar='F',
                        help='The maximum gradient norm.')
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        metavar='N',
                        help='Random seed.')
    parser.add_argument('-c',
                        '--use_cuda',
                        action='store_true',
                        help='Whether to use cuda or not.')
    parser.add_argument(
        '-d',
        '--use_distributed',
        action='store_true',
        help='Whether to use distributed training (distillation) or not.')
    parser.add_argument('--local_rank',
                        type=int,
                        default=-1,
                        metavar='N',
                        help='Local process rank.')
    params = parser.parse_args()

    # A doc stride close to the usable document length risks broken features.
    if params.doc_stride >= params.max_sequence_len - params.max_query_len:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            'examples. This could result in errors when building features from the examples. Please reduce the doc '
            'stride or increase the maximum length to ensure the features are correctly built.'
        )

    # Derive effective batch sizes; in distributed mode scale by GPU count.
    if not params.use_distributed:
        params.local_rank = 0
        params.train_batch_size = params.per_gpu_train_batch_size
        params.eval_batch_size = params.per_gpu_eval_batch_size
    else:
        params.num_gpus = torch.cuda.device_count()
        params.train_batch_size = params.per_gpu_train_batch_size * params.num_gpus
        params.eval_batch_size = params.per_gpu_eval_batch_size * params.num_gpus
    # Rank 0 is the master process (does logging, saving, evaluation).
    params.is_master = params.local_rank == 0

    if params.use_cuda:
        device = torch.device('cuda', params.local_rank)
    else:
        device = torch.device('cpu')

    if Path(params.output_dir).is_dir() and not params.force:
        raise ValueError(
            f'Output directory {params.output_dir} already exists. Use `--force` if you want to overwrite it.'
        )
    if params.is_master:
        Path(params.output_dir).mkdir(parents=True, exist_ok=params.force)
        # dump params
        json.dump(vars(params),
                  open(Path(params.output_dir) / 'params.json', 'w'),
                  indent=4,
                  sort_keys=True)
    params.squad_dir = Path(params.squad_dir)
    params.output_dir = Path(params.output_dir)
    params.device = device

    # initialize multi-GPU
    if params.use_distributed:
        if params.is_master:
            logger.info('Initializing PyTorch distributed')
        torch.cuda.set_device(params.local_rank)
        dist.init_process_group(backend='nccl', init_method='env://')

    # set seed(s)
    if params.is_master:
        logger.info('Setting random seed(s)')
    random.seed(params.seed)
    np.random.seed(params.seed)
    torch.manual_seed(params.seed)
    if params.use_distributed:
        torch.cuda.manual_seed_all(params.seed)

    # initialize the tokenizer
    if params.is_master:
        logger.info('Initializing the tokenizer')
    tokenizer = BertTokenizer.from_pretrained(
        params.tokenizer_vocab_file, do_lower_case=params.do_lower_case)

    # initialize the model
    if params.is_master:
        logger.info('Initializing the model')
    config = DistilBertConfig.from_pretrained(params.config_file)
    model = DistilBertForQuestionAnswering.from_pretrained(params.weights_file,
                                                           config=config)

    # send model to device
    model = model.to(params.device)

    # perform the training
    if params.do_train:
        # initialize the training dataset
        if params.is_master:
            logger.info('Initializing the training dataset')
        train_dataset = load_and_cache_examples(
            squad_dir=params.squad_dir,
            split='train',
            tokenizer=tokenizer,
            max_sequence_len=params.max_sequence_len,
            max_query_len=params.max_query_len,
            doc_stride=params.doc_stride,
            output_examples=False,
            overwrite_cache=params.overwrite_cache,
            is_master=params.is_master)
        # initialize the sampler
        if params.is_master:
            logger.info('Initializing the training sampler')
        train_sampler = DistributedSampler(
            train_dataset) if params.use_distributed else RandomSampler(
                train_dataset)
        # initialize the dataloader
        if params.is_master:
            logger.info('Initializing the training dataloader')
        train_dataloader = DataLoader(dataset=train_dataset,
                                      sampler=train_sampler,
                                      batch_size=params.train_batch_size)
        # initialize the optimizer
        if params.is_master:
            logger.info('Initializing the optimizer')
        optimizer = optim.AdamW(
            model.parameters(),
            lr=params.learning_rate,
            eps=params.epsilon,
        )
        # initialize the learning rate scheduler
        if params.is_master:
            logger.info('Initializing the learning rate scheduler')
        num_steps_epoch = len(train_dataloader)
        num_train_steps = math.ceil(num_steps_epoch /
                                    params.num_gradient_accumulation_steps *
                                    params.num_epochs)
        num_warmup_steps = params.num_warmup_steps

        def lr_lambda(current_step):
            # Linear warmup followed by linear decay to zero.
            if current_step < num_warmup_steps:
                return float(current_step) / float(max(1, num_warmup_steps))
            return max(
                0.0,
                float(num_train_steps - current_step) /
                float(max(1, num_train_steps - num_warmup_steps)))

        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                                   lr_lambda=lr_lambda,
                                                   last_epoch=-1)
        # initialize distributed data parallel (DDP)
        if params.use_distributed:
            if params.is_master:
                logger.info('Initializing DDP')
            model = DDP(model,
                        device_ids=[params.local_rank],
                        output_device=params.local_rank,
                        find_unused_parameters=True)
        # start training
        if params.is_master:
            logger.info('Starting the training')
        train(model=model,
              num_epochs=params.num_epochs,
              dataloader=train_dataloader,
              optimizer=optimizer,
              lr_scheduler=lr_scheduler,
              num_gradient_accumulation_steps=params.
              num_gradient_accumulation_steps,
              max_gradient_norm=params.max_gradient_norm,
              device=params.device,
              local_rank=params.local_rank,
              use_distributed=params.use_distributed,
              is_master=params.is_master,
              use_tqdm=True,
              logger=logger)
        # save the finetuned model
        if params.is_master:
            # take care of distributed training
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.config.architectures = [
                model_to_save.__class__.__name__
            ]
            logger.info('Saving the finetuned model config')
            json.dump(vars(model_to_save.config),
                      open(params.output_dir /
                           TRAINED_CONFIG_FILE_TEMPLATE.format(
                               model_name=model_to_save.__class__.__name__),
                           mode='w'),
                      indent=4,
                      sort_keys=True)
            logger.info('Saving the finetuned model weights')
            torch.save(
                model_to_save.state_dict(),
                params.output_dir / TRAINED_WEIGHTS_FILE_TEMPLATE.format(
                    model_name=model_to_save.__class__.__name__))
        # reload the model
        # NOTE(review): `model_to_save` below is only bound on the master
        # process (inside the is_master save block above) — in distributed
        # mode non-master workers would hit a NameError here; confirm this
        # path only runs single-process or on the master.
        if params.do_eval:
            if params.is_master:
                logger.info('Reloading the model')
            config = DistilBertConfig.from_pretrained(
                str(params.output_dir / TRAINED_CONFIG_FILE_TEMPLATE.format(
                    model_name=model_to_save.__class__.__name__)))
            model = DistilBertForQuestionAnswering.from_pretrained(
                str(params.output_dir / TRAINED_WEIGHTS_FILE_TEMPLATE.format(
                    model_name=model_to_save.__class__.__name__)),
                config=config)
            model = model.to(params.device)

    # perform the evaluation
    if params.do_eval and params.is_master:
        # initialize the training dataset
        logger.info('Initializing the evaluation dataset')
        eval_dataset, examples, features = load_and_cache_examples(
            squad_dir=params.squad_dir,
            split='dev',
            tokenizer=tokenizer,
            max_sequence_len=params.max_sequence_len,
            max_query_len=params.max_query_len,
            doc_stride=params.doc_stride,
            output_examples=True,
            overwrite_cache=params.overwrite_cache,
            is_master=params.is_master)
        # initialize the sampler
        logger.info('Initializing the evaluation sampler')
        eval_sampler = SequentialSampler(eval_dataset)
        # initialize the dataloader
        logger.info('Initializing the evaluation dataloader')
        eval_dataloader = DataLoader(dataset=eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=params.eval_batch_size)
        # start evaluating
        logger.info('Starting the evaluation')
        results = evaluate(output_dir=params.output_dir,
                           model=model,
                           tokenizer=tokenizer,
                           max_answer_len=params.max_answer_len,
                           do_lower_case=params.do_lower_case,
                           dataloader=eval_dataloader,
                           examples=examples,
                           features=features,
                           device=params.device,
                           local_rank=params.local_rank,
                           use_tqdm=True)
        # log results
        logger.info('Evaluation results:')
        for key, result in results.items():
            logger.info(f' {key}: {result}')
        # dump results
        json.dump(results,
                  open(
                      params.output_dir / RESULTS_FILE_TEMPLATE.format(
                          model_name=model.__class__.__name__), 'w'),
                  indent=4)

    if params.is_master:
        logger.info('Done')
def main():
    """Entry point: parse args, then optionally freeze layers for finetuning,
    train, and/or evaluate a DistilBERT question-answering model.

    Side effects: creates/loads checkpoints under ``args.save_dir``, writes
    logs, and (in eval mode) writes a CSV submission file.
    """
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)
    # Default model; replaced below when finetuning or evaluating from a checkpoint.
    model = DistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased")
    if args.do_finetune:
        # Reload an existing checkpoint and freeze the embedding layers plus the
        # first `args.freeze_layer` transformer layers (their grads are disabled).
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        for name, param in model.named_parameters():
            if name.startswith("distilbert.embeddings."):
                param.requires_grad = False
            for i in range(args.freeze_layer):
                if name.startswith("distilbert.transformer.layer.%s." % i):
                    param.requires_grad = False
        # NOTE(review): this `return` exits main() immediately after freezing,
        # so no training/eval runs in finetune mode — confirm this is intended
        # (it may be leftover debugging or the freeze step may be meant to fall
        # through into the do_train branch below).
        return
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')
    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets,
                                       args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets,
                                            args.val_dir, tokenizer, 'val')
        # Training shuffles batches; validation keeps dataset order so that
        # predictions can be matched back against `val_dict`.
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        # Presumably eval_dir path names encode the split; verify against caller.
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        # Evaluation always reloads the best checkpoint saved during training.
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                              args.eval_dir, tokenizer,
                                              split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model, eval_loader,
                                                   eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
# Convert a locally-trained TAPT DistilBERT checkpoint into a standard
# HuggingFace model directory (config + weights + tokenizer files) so it can
# be shared/loaded with `from_pretrained` directly.
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering

# Where the exported model/tokenizer files are written (both go to one dir).
export_dir = "/Users/minhdang/Desktop/SEPM-Team24/robustqa/robustqa-tapt"

# The base tokenizer was never modified during training, so pull it from the hub.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Load the finetuned QA weights from the local checkpoint directory.
checkpoint_path = "/Users/minhdang/Desktop/SEPM-Team24/robustqa/save/tapt_distilBert-01/checkpoint"
model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)

model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)
from app_models import MODEL_PATH, ensure_models from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer from numpy import inf as INFINITY from torch import tensor, argmax ensure_models() model = DistilBertForQuestionAnswering.from_pretrained(MODEL_PATH) ''' # # Separate question and context # def separate_question_and_context(input_ids: list, tokenizer: AutoTokenizer): # # The context begins right after this index # sep_index = input_ids.index(tokenizer.sep_token_id) # # Boolean mask for differentiating context and question # segment_ids = ( # '0' * (sep_index + 1) # + '1' * (len(input_ids) - sep_index - 1) # ).split('') # return segment_ids # # Preprocessing # def preprocessing(context: str, question: str): # # Load tokenizer # tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH) # # Encode input # input_ids = tokenizer.encode(question, context) # # Get mask
dev_dataset = SquadDataset(dev_encodings) from transformers import DistilBertTokenizerFast tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased') train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True) dev_encodings = tokenizer(dev_contexts, dev_questions, truncation=True, padding=True) from transformers import DistilBertForQuestionAnswering model = DistilBertForQuestionAnswering.from_pretrained( "distilbert-base-uncased") from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments training_args = TrainingArguments( output_dir='./results', # output directory num_train_epochs=1, # total number of training epochs per_device_train_batch_size=16, # batch size per device during training per_device_eval_batch_size=64, # batch size for evaluation warmup_steps=100, # number of warmup steps for learning rate scheduler weight_decay=0.01, # strength of weight decay logging_dir='./logs', # directory for storing logs logging_steps=10, load_best_model_at_end=True, save_strategy="steps", logging_strategy="steps",
def main():
    """Entry point: resume from a finetuned baseline checkpoint, train on
    EDA-augmented data, and/or evaluate, writing a CSV submission file.

    This is a modified copy of the baseline driver; the `#### Change ...`
    markers delimit the author's edits relative to the original.
    """
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)
    #### Change Made By Xuran Wang: Comment out original lines #######

    # model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    # tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    #### Change End #######

    #### Change Made By Xuran Wang: Add custom lines #######
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    # Directory containing the previously-finetuned baseline run to resume from.
    finetuned_model_path = 'save/baseline-01/'
    #### Change End #######

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")

        #### Change Made By Xuran Wang: Add custom lines #######
        # Training starts from the baseline checkpoint rather than the hub model.
        checkpoint_path = os.path.join(finetuned_model_path, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        #### Change End #######

        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        model.to(args.device)
        trainer = Trainer(args, log)

        #### Change Made By Xuran Wang: Add custom lines, comment out original line #######

        # train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')

        # NOTE(review): `train_fraction` is not defined anywhere in this
        # function — it must be a module-level global or this line raises
        # NameError at runtime. Confirm where it is set.
        train_dataset, _ = get_dataset_eda_revised(args, args.train_datasets, args.train_dir, tokenizer, 'train', train_fraction)

        #### Change End #######

        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val')
        # Shuffle training batches; keep validation in dataset order so
        # predictions line up with `val_dict`.
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        # Presumably the eval dir path encodes the split name; verify against caller.
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        # Evaluation reloads the checkpoint saved under this run's save_dir.
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model, eval_loader,
                                                   eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
# Export a dynamically-quantized, mobile-optimized TorchScript version of a
# SQuAD-distilled DistilBERT QA model for use with PyTorch Mobile.
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
from torch.utils.mobile_optimizer import optimize_for_mobile

tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased-distilled-squad')
model = DistilBertForQuestionAnswering.from_pretrained(
    'distilbert-base-uncased-distilled-squad')
# Inference mode: disables dropout etc. before tracing.
model.eval()

# Example (question, context) pair used solely to produce a representative
# input for torch.jit.trace below.
question, text = "When will support for GPU be available?!", "There is a growing need to execute ML models on edge devices to reduce latency, preserve privacy and enable new interactive use cases. In the past, engineers used to train models separately. They would then go through a multi-step, error prone and often complex process to transform the models for execution on a mobile device. The mobile runtime was often significantly different from the operations available during training leading to inconsistent developer and eventually user experience. PyTorch Mobile removes these friction surfaces by allowing a seamless process to go from training to deployment by staying entirely within the PyTorch ecosystem. It provides an end-to-end workflow that simplifies the research to production environment for mobile devices. In addition, it paves the way for privacy-preserving features via Federated Learning techniques. PyTorch Mobile is in beta stage right now and in wide scale production use. It will soon be available as a stable release once the APIs are locked down. Key features of PyTorch Mobile: Available for iOS, Android and Linux; Provides APIs that cover common preprocessing and integration tasks needed for incorporating ML in mobile applications; Support for tracing and scripting via TorchScript IR; Support for XNNPACK floating point kernel libraries for Arm CPUs; Integration of QNNPACK for 8-bit quantized kernels. Includes support for per-channel quantization, dynamic quantization and more; Build level optimization and selective compilation depending on the operators needed for user applications, i.e., the final binary size of the app is determined by the actual operators the app needs; Support for hardware backends like GPU, DSP, NPU will be available soon."

# inputs['input_ids'].size() is 360, the maximum size of the input tokens generated from the user question and text
# on mobile apps, if the size of the input tokens of the text and question is less than 360, padding will be needed to make the model work correctly.
inputs = tokenizer(question, text, return_tensors='pt')

# Quantize only the Linear layers to int8 (dynamic quantization: weights
# stored as qint8, activations quantized on the fly).
model_dynamic_quantized = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)
# strict=False because the model returns a dict-like output, which strict
# tracing would reject.
traced_model = torch.jit.trace(model_dynamic_quantized, inputs['input_ids'], strict=False)
optimized_traced_model = optimize_for_mobile(traced_model)
torch.jit.save(optimized_traced_model, "qa360_quantized.pt")
# 360 is the length of model input, i.e. the length of the tokenized ids of question+text
""" SMALL / MEDIUM / DISTIL BASE """ from transformers import BertForQuestionAnswering, BertTokenizer, DistilBertForQuestionAnswering, DistilBertTokenizer bert_small_model = BertForQuestionAnswering.from_pretrained('mrm8488/bert-small-finetuned-squadv2') bert_small_tokenizer = BertTokenizer.from_pretrained('mrm8488/bert-small-finetuned-squadv2') print("Bert Small loaded...") bert_med_model = BertForQuestionAnswering.from_pretrained('mrm8488/bert-medium-finetuned-squadv2') bert_med_tokenizer = BertTokenizer.from_pretrained('mrm8488/bert-medium-finetuned-squadv2') print("Bert Medium loaded...") distil_bert_model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased') distil_bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') print("DistilBert loaded...") """ ALBERT """ from transformers import AlbertTokenizer, AlbertForQuestionAnswering albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') albert_model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2') """#### Answer function""" def answer_question(question, text, alpha=.5):