def main(args):
    """Entry point: set up logging/seed, build the three data splits,
    then run training and/or test-set evaluation via Trainer."""
    init_logger()
    set_seed(args)
    tok = load_tokenizer(args)
    # One dataset per split, all built with the same tokenizer.
    splits = {
        name: load_and_cache_examples(args, tok, mode=name)
        for name in ("train", "dev", "test")
    }
    trainer = Trainer(args, splits["train"], splits["dev"], splits["test"])
    if args.do_train:
        trainer.train()
    if args.do_eval:
        # Reload the best checkpoint before scoring the test split.
        trainer.load_model()
        print('Test Result:')
        trainer.eval("test")
def main():
    """Train (or evaluate) a single BERT model on a GLUE task.

    Uses BasicTrainer, i.e. plain supervised training rather than
    distillation; this entry point can be used to produce a teacher
    checkpoint (e.g. on MNLI).
    """
    # Parse arguments from the project-level config module.
    config.parse()
    args = config.args
    for k, v in vars(args).items():
        logger.info(f"{k}:{v}")
    # Set seeds for reproducibility across torch / numpy / random.
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    # Arguments check: resolve device and GPU count.
    device, n_gpu = args_check(args)
    os.makedirs(args.output_dir, exist_ok=True)
    # Per-forward-pass batch size after gradient accumulation.
    forward_batch_size = int(args.train_batch_size /
                             args.gradient_accumulation_steps)
    args.forward_batch_size = forward_batch_size
    # Load the (student) BERT config.
    bert_config_S = BertConfig.from_json_file(args.bert_config_file_S)
    assert args.max_seq_length <= bert_config_S.max_position_embeddings
    # Prepare the GLUE task: processor, output mode, label list.
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    # e.g. MNLI labels: ['contradiction', 'entailment', 'neutral']
    label_list = processor.get_labels()
    num_labels = len(label_list)
    # Read data.
    train_dataset = None
    eval_datasets = None
    num_train_steps = None
    tokenizer = BertTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)
    # Load datasets and compute the number of training steps.
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name,
                                                tokenizer, evaluate=False)
        if args.aux_task_name:
            # Optionally concatenate an auxiliary task's training data.
            aux_train_dataset = load_and_cache_examples(args,
                                                        args.aux_task_name,
                                                        tokenizer,
                                                        evaluate=False,
                                                        is_aux=True)
            train_dataset = torch.utils.data.ConcatDataset(
                [train_dataset, aux_train_dataset])
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size) * args.num_train_epochs
    if args.do_predict:
        eval_datasets = []
        # MNLI is evaluated on both matched and mismatched dev sets.
        eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
            args.task_name, )
        for eval_task in eval_task_names:
            eval_datasets.append(
                load_and_cache_examples(args, eval_task, tokenizer,
                                        evaluate=True))
    logger.info("数据集已加载")
    # Build and initialize the model. Only the "student" model is used;
    # this effectively trains a single model in a supervised fashion.
    model_S = BertForGLUESimple(bert_config_S, num_labels=num_labels,
                                args=args)
    # Initialize the student model weights.
    if args.load_model_type == 'bert':
        assert args.init_checkpoint_S is not None
        state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu')
        if args.only_load_embedding:
            # Load only the embedding weights; k[5:] strips the 'bert.' prefix.
            state_weight = {
                k[5:]: v
                for k, v in state_dict_S.items()
                if k.startswith('bert.embeddings')
            }
            missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                           strict=False)
            logger.info(f"Missing keys {list(missing_keys)}")
        else:
            # Load all 'bert.'-prefixed weights; nothing may be missing.
            state_weight = {
                k[5:]: v
                for k, v in state_dict_S.items() if k.startswith('bert.')
            }
            missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                           strict=False)
            assert len(missing_keys) == 0
        logger.info("Model loaded")
    elif args.load_model_type == 'all':
        # Load a fully fine-tuned checkpoint (all parameters).
        assert args.tuned_checkpoint_S is not None
        state_dict_S = torch.load(args.tuned_checkpoint_S, map_location='cpu')
        model_S.load_state_dict(state_dict_S)
        logger.info("Model loaded")
    else:
        logger.info("Model is randomly initialized.")
    model_S.to(device)
    # Distributed training is not implemented; multi-GPU uses DataParallel.
    if args.local_rank != -1 or n_gpu > 1:
        if args.local_rank != -1:
            raise NotImplementedError
        elif n_gpu > 1:
            model_S = torch.nn.DataParallel(model_S)  #,output_device=n_gpu-1)
    if args.do_train:
        # Trainable parameters, grouped for the optimizer.
        params = list(model_S.named_parameters())
        all_trainable_params = divide_parameters(params,
                                                 lr=args.learning_rate)
        logger.info("Length of all_trainable_params: %d",
                    len(all_trainable_params))
        # Optimizer setup.
        optimizer = BERTAdam(all_trainable_params,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps,
                             schedule=args.schedule,
                             s_opt1=args.s_opt1,
                             s_opt2=args.s_opt2,
                             s_opt3=args.s_opt3)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_dataset))
        logger.info(" Forward batch size = %d", forward_batch_size)
        logger.info(" Num backward steps = %d", num_train_steps)
        ########### TRAINING ###########
        train_config = TrainingConfig(
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            ckpt_frequency=args.ckpt_frequency,
            log_dir=args.output_dir,
            output_dir=args.output_dir,
            device=args.device)
        # BasicTrainer performs supervised training, not distillation;
        # it can therefore be used to train a teacher model.
        distiller = BasicTrainer(train_config=train_config,
                                 model=model_S,
                                 adaptor=BertForGLUESimpleAdaptorTraining)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            raise NotImplementedError
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.forward_batch_size,
                                      drop_last=True)
        # Evaluation callback invoked by the trainer at checkpoint time.
        callback_func = partial(predict, eval_datasets=eval_datasets,
                                args=args)
        with distiller:
            distiller.train(optimizer,
                            scheduler=None,
                            dataloader=train_dataloader,
                            num_epochs=args.num_train_epochs,
                            callback=callback_func)
    if not args.do_train and args.do_predict:
        # Predict-only mode: evaluate the (loaded) model once.
        res = predict(model_S, eval_datasets, step=0, args=args)
        print(res)
def main():
    """Distill a fine-tuned BERT teacher into a BERT student on a GLUE task.

    Uses GeneralDistiller, optionally with intermediate-layer matches
    selected by ``args.matches``.
    """
    # Parse arguments from the project-level config module.
    config.parse()
    args = config.args
    for k, v in vars(args).items():
        logger.info(f"{k}:{v}")
    # Set seeds for reproducibility across torch / numpy / random.
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    # Arguments check: resolve device and GPU count.
    device, n_gpu = args_check(args)
    os.makedirs(args.output_dir, exist_ok=True)
    # Per-forward-pass batch size after gradient accumulation.
    forward_batch_size = int(args.train_batch_size /
                             args.gradient_accumulation_steps)
    args.forward_batch_size = forward_batch_size
    # Load teacher (T) and student (S) BERT configs.
    bert_config_T = BertConfig.from_json_file(args.bert_config_file_T)
    bert_config_S = BertConfig.from_json_file(args.bert_config_file_S)
    assert args.max_seq_length <= bert_config_T.max_position_embeddings
    assert args.max_seq_length <= bert_config_S.max_position_embeddings
    # Prepare the GLUE task: processor, output mode, label list.
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)
    # Read data.
    train_dataset = None
    eval_datasets = None
    num_train_steps = None
    tokenizer = BertTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)
    # Load datasets.
    if args.do_train:
        train_dataset, examples = load_and_cache_examples(args,
                                                          args.task_name,
                                                          tokenizer,
                                                          evaluate=False)
        if args.aux_task_name:
            # NOTE(review): this rebinds `examples` to the auxiliary task's
            # examples, discarding the main task's — confirm intended.
            aux_train_dataset, examples = load_and_cache_examples(
                args, args.aux_task_name, tokenizer, evaluate=False,
                is_aux=True)
            train_dataset = torch.utils.data.ConcatDataset(
                [train_dataset, aux_train_dataset])
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size) * args.num_train_epochs
    if args.do_predict:
        eval_datasets = []
        # MNLI is evaluated on both matched and mismatched dev sets.
        eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
            args.task_name, )
        for eval_task in eval_task_names:
            # NOTE(review): after this loop `examples` holds only the LAST
            # eval task's examples; that value is later passed to predict().
            eval_dataset, examples = load_and_cache_examples(args,
                                                             eval_task,
                                                             tokenizer,
                                                             evaluate=True)
            eval_datasets.append(eval_dataset)
    logger.info("数据集加载成功")
    # Build both the teacher and the student model.
    model_T = BertForGLUESimple(bert_config_T, num_labels=num_labels,
                                args=args)
    model_S = BertForGLUESimple(bert_config_S, num_labels=num_labels,
                                args=args)
    # Load teacher parameters; a teacher checkpoint is required unless we
    # are in predict-only mode.
    if args.tuned_checkpoint_T is not None:
        state_dict_T = torch.load(args.tuned_checkpoint_T, map_location='cpu')
        model_T.load_state_dict(state_dict_T)
        model_T.eval()
    else:
        assert args.do_predict is True
    # Load student weights.
    if args.load_model_type == 'bert':
        assert args.init_checkpoint_S is not None
        state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu')
        if args.only_load_embedding:
            # Load only the embedding weights; k[5:] strips the 'bert.' prefix.
            state_weight = {
                k[5:]: v
                for k, v in state_dict_S.items()
                if k.startswith('bert.embeddings')
            }
            missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                           strict=False)
            logger.info(f"Missing keys {list(missing_keys)}")
        else:
            # Load all 'bert.'-prefixed weights; nothing may be missing.
            state_weight = {
                k[5:]: v
                for k, v in state_dict_S.items() if k.startswith('bert.')
            }
            missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                           strict=False)
            assert len(missing_keys) == 0
        logger.info("Model loaded")
    elif args.load_model_type == 'all':
        # Load a fully fine-tuned student checkpoint (all parameters).
        assert args.tuned_checkpoint_S is not None
        state_dict_S = torch.load(args.tuned_checkpoint_S, map_location='cpu')
        model_S.load_state_dict(state_dict_S)
        logger.info("Model loaded")
    else:
        logger.info("Student模型没有可加载参数,随机初始化参数 randomly initialized.")
    model_T.to(device)
    model_S.to(device)
    # Distributed training is not implemented; multi-GPU uses DataParallel.
    if args.local_rank != -1 or n_gpu > 1:
        if args.local_rank != -1:
            raise NotImplementedError
        elif n_gpu > 1:
            model_T = torch.nn.DataParallel(model_T)  #,output_device=n_gpu-1)
            model_S = torch.nn.DataParallel(model_S)  #,output_device=n_gpu-1)
    if args.do_train:
        # Trainable parameters (student only), grouped for the optimizer.
        params = list(model_S.named_parameters())
        all_trainable_params = divide_parameters(params,
                                                 lr=args.learning_rate)
        logger.info("Length of all_trainable_params: %d",
                    len(all_trainable_params))
        # Optimizer setup.
        optimizer = BERTAdam(all_trainable_params,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps,
                             schedule=args.schedule,
                             s_opt1=args.s_opt1,
                             s_opt2=args.s_opt2,
                             s_opt3=args.s_opt3)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_dataset))
        logger.info(" Forward batch size = %d", forward_batch_size)
        logger.info(" Num backward steps = %d", num_train_steps)
        ########### DISTILLATION ###########
        train_config = TrainingConfig(
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            ckpt_frequency=args.ckpt_frequency,
            log_dir=args.output_dir,
            output_dir=args.output_dir,
            device=args.device)
        # Predefined intermediate-layer match configurations.
        from matches import matches
        intermediate_matches = None
        if isinstance(args.matches, (list, tuple)):
            intermediate_matches = []
            for match in args.matches:
                intermediate_matches += matches[match]
        logger.info(f"中间层match信息: {intermediate_matches}")
        distill_config = DistillationConfig(
            temperature=args.temperature,
            intermediate_matches=intermediate_matches)
        logger.info(f"训练配置: {train_config}")
        logger.info(f"蒸馏配置: {distill_config}")
        # Adaptors translate raw model outputs into the distiller's format.
        adaptor_T = partial(BertForGLUESimpleAdaptor,
                            no_logits=args.no_logits,
                            no_mask=args.no_inputs_mask)
        adaptor_S = partial(BertForGLUESimpleAdaptor,
                            no_logits=args.no_logits,
                            no_mask=args.no_inputs_mask)
        # General distiller supporting intermediate-state matching.
        distiller = GeneralDistiller(train_config=train_config,
                                     distill_config=distill_config,
                                     model_T=model_T,
                                     model_S=model_S,
                                     adaptor_T=adaptor_T,
                                     adaptor_S=adaptor_S)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            raise NotImplementedError
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.forward_batch_size,
                                      drop_last=True)
        # Evaluation callback invoked by the distiller at checkpoint time.
        callback_func = partial(predict,
                                eval_datasets=eval_datasets,
                                args=args,
                                examples=examples)
        with distiller:
            distiller.train(optimizer,
                            scheduler=None,
                            dataloader=train_dataloader,
                            num_epochs=args.num_train_epochs,
                            callback=callback_func)
    if not args.do_train and args.do_predict:
        # Predict-only mode: evaluate the student once.
        res = predict(model_S,
                      eval_datasets,
                      step=0,
                      args=args,
                      examples=examples,
                      label_list=label_list)
        print(res)
def main():
    """Distill several fine-tuned teachers into a single student
    (multi-teacher knowledge distillation) on a GLUE task.

    Teacher/student model types, configs and checkpoints are described by
    the JSON file referenced by ``args.model_config_json``.
    """
    # Parse arguments from the project-level config module.
    config.parse()
    args = config.args
    for k, v in vars(args).items():
        logger.info(f"{k}:{v}")
    # Set seeds for reproducibility across torch / numpy / random.
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    # Arguments check: resolve device and GPU count.
    device, n_gpu = args_check(args)
    os.makedirs(args.output_dir, exist_ok=True)
    # Per-forward-pass batch size after gradient accumulation.
    forward_batch_size = int(args.train_batch_size /
                             args.gradient_accumulation_steps)
    args.forward_batch_size = forward_batch_size
    # Load the JSON config describing the teachers and the student.
    teachers_and_student = parse_model_config(args.model_config_json)
    # Prepare the GLUE task: processor, output mode, label list.
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)
    # Read data, tokenized with the STUDENT's tokenizer/cache prefix.
    train_dataset = None
    eval_datasets = None
    num_train_steps = None
    tokenizer_S = teachers_and_student['student']['tokenizer']
    prefix_S = teachers_and_student['student']['prefix']
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer_S,
                                                prefix=prefix_S,
                                                evaluate=False)
    if args.do_predict:
        eval_datasets = []
        # MNLI is evaluated on both matched and mismatched dev sets.
        eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
            args.task_name, )
        for eval_task in eval_task_names:
            eval_datasets.append(
                load_and_cache_examples(args,
                                        eval_task,
                                        tokenizer_S,
                                        prefix=prefix_S,
                                        evaluate=True))
    logger.info("Data loaded")
    # Build models and load checkpoints. Teachers are only needed when
    # training; they are loaded strictly and frozen with .eval().
    if args.do_train:
        model_Ts = []
        for teacher in teachers_and_student['teachers']:
            model_type_T = teacher['model_type']
            model_config_T = teacher['config']
            checkpoint_T = teacher['checkpoint']
            _, _, model_class_T = MODEL_CLASSES[model_type_T]
            model_T = model_class_T(model_config_T, num_labels=num_labels)
            state_dict_T = torch.load(checkpoint_T, map_location='cpu')
            missing_keys, un_keys = model_T.load_state_dict(state_dict_T,
                                                            strict=True)
            logger.info(f"Teacher Model {model_type_T} loaded")
            model_T.to(device)
            model_T.eval()
            model_Ts.append(model_T)
    # The student is built regardless of mode; strict=False allows a
    # partial (e.g. pretrained-only) checkpoint.
    student = teachers_and_student['student']
    model_type_S = student['model_type']
    model_config_S = student['config']
    checkpoint_S = student['checkpoint']
    _, _, model_class_S = MODEL_CLASSES[model_type_S]
    model_S = model_class_S(model_config_S, num_labels=num_labels)
    if checkpoint_S is not None:
        state_dict_S = torch.load(checkpoint_S, map_location='cpu')
        missing_keys, un_keys = model_S.load_state_dict(state_dict_S,
                                                        strict=False)
        logger.info(f"missing keys:{missing_keys}")
        logger.info(f"unexpected keys:{un_keys}")
    else:
        logger.warning("Initializing student randomly")
    logger.info("Student Model loaded")
    model_S.to(device)
    # Distributed training is not implemented; multi-GPU uses DataParallel.
    if args.local_rank != -1 or n_gpu > 1:
        if args.local_rank != -1:
            raise NotImplementedError
        elif n_gpu > 1:
            if args.do_train:
                model_Ts = [
                    torch.nn.DataParallel(model_T) for model_T in model_Ts
                ]
            model_S = torch.nn.DataParallel(model_S)  #,output_device=n_gpu-1)
    if args.do_train:
        # Trainable parameters (student only), grouped for the optimizer.
        params = list(model_S.named_parameters())
        all_trainable_params = divide_parameters(params,
                                                 lr=args.learning_rate)
        logger.info("Length of all_trainable_params: %d",
                    len(all_trainable_params))
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            raise NotImplementedError
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.forward_batch_size,
                                      drop_last=True)
        # Steps are computed from dataloader length here (unlike the other
        # entry points, which divide the dataset length by batch size).
        num_train_steps = int(
            len(train_dataloader) // args.gradient_accumulation_steps *
            args.num_train_epochs)
        ########## DISTILLATION ###########
        train_config = TrainingConfig(
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            ckpt_frequency=args.ckpt_frequency,
            log_dir=args.output_dir,
            output_dir=args.output_dir,
            fp16=args.fp16,
            device=args.device)
        # Logit-only distillation with cross-entropy KD loss.
        distill_config = DistillationConfig(temperature=args.temperature,
                                            kd_loss_type='ce')
        logger.info(f"{train_config}")
        logger.info(f"{distill_config}")
        adaptor_T = BertForGLUESimpleAdaptor
        adaptor_S = BertForGLUESimpleAdaptor
        # model_T is a LIST of teachers for MultiTeacherDistiller.
        distiller = MultiTeacherDistiller(train_config=train_config,
                                          distill_config=distill_config,
                                          model_T=model_Ts,
                                          model_S=model_S,
                                          adaptor_T=adaptor_T,
                                          adaptor_S=adaptor_S)
        optimizer = AdamW(all_trainable_params, lr=args.learning_rate)
        # Linear warmup/decay schedule, constructed by the distiller.
        scheduler_class = get_linear_schedule_with_warmup
        scheduler_args = {
            'num_warmup_steps': int(args.warmup_proportion * num_train_steps),
            'num_training_steps': num_train_steps
        }
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_dataset))
        logger.info(" Forward batch size = %d", forward_batch_size)
        logger.info(" Num backward steps = %d", num_train_steps)
        # Evaluation callback invoked by the distiller at checkpoint time.
        callback_func = partial(predict, eval_datasets=eval_datasets,
                                args=args)
        with distiller:
            distiller.train(optimizer,
                            scheduler_class=scheduler_class,
                            scheduler_args=scheduler_args,
                            dataloader=train_dataloader,
                            num_epochs=args.num_train_epochs,
                            callback=callback_func,
                            max_grad_norm=1)
    if not args.do_train and args.do_predict:
        # Predict-only mode: evaluate the student once.
        res = predict(model_S, eval_datasets, step=0, args=args)
        print(res)
def main():
    """Entry point for the multiple-choice (e.g. DREAM) Post_MV model.

    Parses CLI arguments, sets up device/seed and the backbone
    config/tokenizer, then optionally trains and/or runs the test
    evaluation over the task and its suffixed variants.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_type",
                        default='bert',
                        type=str,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default='bert-base-uncased',
        type=str,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default='../output_mc',
        type=str,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )
    parser.add_argument("--raw_data_dir", default='../data_mc', type=str)
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--task_name", default='DREAM')
    # Name of a previously saved checkpoint directory under output_dir.
    parser.add_argument("--pre_model_dir",
                        default='2020-03-12-10-58-checkpoint-3048')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help='Whether to run test on the test set')
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=2.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--verbose_logging",
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    args = parser.parse_args()
    args.checkpoint = os.path.join(args.output_dir, args.pre_model_dir)
    # NOTE(review): a non-empty output_dir only LOGS here instead of raising
    # (other entry points raise ValueError in this case) — confirm that
    # silently continuing is intended.
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        logger.info(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device
    # Set seed
    set_seed(args)
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path)
    # Hidden states from the backbone are required downstream.
    config.output_hidden_states = True
    config.num_options = int(
        MULTIPLE_CHOICE_TASKS_NUM_LABELS[args.task_name.lower()])
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case)
    post_model = Post_MV(args, config)
    post_model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)
    if args.fp16:
        try:
            import apex
            # Registers torch.einsum so apex amp can run it in half precision.
            apex.amp.register_half_function(torch, 'einsum')
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
    if args.do_train:
        logging.getLogger("transformers.tokenization_utils").setLevel(
            logging.ERROR)  # Reduce logging
        train_dataset = load_and_cache_examples(args,
                                                task=args.task_name,
                                                tokenizer=tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train_process(args, train_dataset, post_model,
                                             tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)
    if args.do_test:
        logging.getLogger("transformers.modeling_utils").setLevel(
            logging.WARN)  # Reduce logging
        logging.getLogger("transformers.configuration_utils").setLevel(
            logging.WARN)  # Reduce logging
        logging.getLogger("transformers.tokenization_utils").setLevel(
            logging.ERROR)  # Reduce logging
        # Load the '-TIE' variant of the pretrained checkpoint directory.
        checkpoint = os.path.join(args.output_dir, args.pre_model_dir + '-TIE')
        logger.info(" Load model from %s", checkpoint)
        post_model.load_state_dict(
            torch.load(os.path.join(checkpoint, 'pytorch_model.bin')))
        post_model.to(args.device)
        # Evaluate on the base task name plus several suffixed variants.
        task_string = [
            '', '-Add1OtherTruth2Opt', '-Add2OtherTruth2Opt',
            '-Add1PasSent2Opt', '-Add1NER2Pass'
        ]
        task_string = [args.task_name + item for item in task_string]
        result = evaluate(args, task_string, post_model, tokenizer, test=True)
def evaluate(args, eval_task_names, model, tokenizer, test=False):
    """Evaluate *model* on each task in *eval_task_names*.

    Runs deterministic, sequential inference per task, logs the accuracy,
    prints a PrettyTable summary column per task, and returns a list of
    {"task_name": ..., "eval_acc": ...} dicts.
    """
    results = []
    table = PrettyTable()
    table.add_column(' ', ['Accuracy'])
    for eval_task in eval_task_names:
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer,
                                               evaluate=not test, test=test)
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # SequentialSampler keeps example order deterministic
        # (a DistributedSampler would sample randomly).
        dataloader = DataLoader(eval_dataset,
                                sampler=SequentialSampler(eval_dataset),
                                batch_size=args.eval_batch_size)
        nb_eval_steps = 0
        logit_chunks = []
        label_chunks = []
        for batch in dataloader:
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    # XLM does not use segment ids.
                    'token_type_ids': batch[2]
                    if args.model_type in ['bert', 'xlnet'] else None,
                    'input_ids_np': batch[3],
                    'attention_mask_np': batch[4],
                    'token_type_ids_np': batch[5],
                    'labels': batch[6]
                }
                logits = model(**inputs)[0]
            nb_eval_steps += 1
            logit_chunks.append(logits.detach().cpu().numpy())
            label_chunks.append(inputs['labels'].detach().cpu().numpy())
        # Stitch per-batch outputs together, then take the argmax class.
        preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
        out_label_ids = np.concatenate(label_chunks, axis=0)
        acc = simple_accuracy(preds, out_label_ids)
        result = {"task_name": eval_task, "eval_acc": acc}
        results.append(result)
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
        table.add_column(eval_task, [round(acc * 100, 2)])
    print(table)
    return results
def main(path_to_train_data, path_to_validation_data):
    """Fine-tune an AutoModelWithLMHead on a conversation dataset.

    Args:
        path_to_train_data: path to the raw training data file.
        path_to_validation_data: path to the raw validation data file.

    Returns:
        dict of evaluation metrics (empty when evaluation is skipped).
    """
    args = Args()
    # Build train/validation frames from the raw files.
    df_trn, df_val = dataset.make_dataset(path_to_train_data,
                                          path_to_validation_data)
    if args.should_continue:
        # Resume from the most recent checkpoint in output_dir.
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError(
                "Used --should_continue but no checkpoint was found in --output_dir."
            )
        else:
            args.model_name_or_path = sorted_checkpoints[-1]
    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir
            and not args.should_continue):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))
    # Setup CUDA, GPU & distributed training.
    # NOTE(review): this unconditionally requests "cuda" — it will fail on a
    # CPU-only machine; confirm that is acceptable.
    device = torch.device("cuda")
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )
    # Set seed
    utils.set_seed(args)
    config = AutoConfig.from_pretrained(args.config_name,
                                        cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name,
                                              cache_dir=args.cache_dir)
    model = AutoModelWithLMHead.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)
    # Training
    if args.do_train:
        train_dataset = utils.load_and_cache_examples(args,
                                                      tokenizer,
                                                      df_trn,
                                                      df_val,
                                                      evaluate=False)
        global_step, tr_loss = train.train(args, train_dataset, model,
                                           tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)
    # Saving best-practices: if you use save_pretrained for the model and
    # tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)
    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            # Find every saved weights file below output_dir.
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # Use the step suffix of the checkpoint directory as a key suffix.
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                "/")[-1] if checkpoint.find("checkpoint") != -1 else ""
            model = AutoModelWithLMHead.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate.evaluate(args, model, tokenizer, df_trn, df_val,
                                       prefix=prefix)
            result = dict(
                (k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)
    return results
def main():
    """Train and/or evaluate a SQuAD-style QA model.

    The backbone config is loaded with output_attentions and
    output_hidden_states enabled. Returns a dict of evaluation metrics,
    suffixed by checkpoint global step when multiple checkpoints are
    evaluated.
    """
    args = parse_args()
    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )
    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    args.device = device
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.warning(
        "Device: %s, n_gpu: %s",
        device,
        args.n_gpu,
    )
    # Set seed
    set_seed(args)
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
        # [A] output attentions
        output_attentions=True,
        # [A] output hidden states
        output_hidden_states=True)
    tokenizer = tokenizer_class.from_pretrained(
        args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)
    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)
    # Save the trained model and the tokenizer
    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(
            args.output_dir)  # , force_download=True)
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)
    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval:
        if args.do_train:
            logger.info(
                "Loading checkpoints saved during training for evaluation")
            checkpoints = [args.output_dir]
            if args.eval_all_checkpoints:
                # Find every saved weights file below output_dir.
                checkpoints = list(
                    os.path.dirname(c) for c in sorted(
                        glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                                  recursive=True)))
                logging.getLogger("transformers.modeling_utils").setLevel(
                    logging.WARN)  # Reduce model loading logs
        else:
            logger.info("Loading checkpoint %s for evaluation",
                        args.model_name_or_path)
            checkpoints = [args.model_name_or_path]
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(
                checkpoint)  # , force_download=True)
            model.to(args.device)
            # Evaluate
            result = evaluate(args, model, tokenizer, prefix=global_step)
            # Suffix metric keys with the checkpoint's global step.
            result = dict(
                (k + ("_{}".format(global_step) if global_step else ""), v)
                for k, v in result.items())
            results.update(result)
    logger.info("Results: {}".format(results))
    return results
def evaluate(args, model, tokenizer, prefix=""):
    """Run SQuAD-style evaluation, write prediction files, and return the metric dict."""
    dataset, examples, features = load_and_cache_examples(
        args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    dataloader = DataLoader(dataset,
                            sampler=SequentialSampler(dataset),
                            batch_size=args.eval_batch_size)

    # Wrap for multi-GPU unless the caller already did.
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()
    for batch in tqdm(dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            model_inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            feature_indices = batch[3]
            outputs = model(**model_inputs)
        # Map each row of the batch back to its feature and collect logits.
        for pos, feature_index in enumerate(feature_indices):
            feature = features[feature_index.item()]
            start_logits = to_list(outputs[0][pos])
            end_logits = to_list(outputs[1][pos])
            all_results.append(
                SquadResult(int(feature.unique_id), start_logits, end_logits))

    elapsed = timeit.default_timer() - start_time
    logger.info(" Evaluation done in total %f secs (%f sec per example)",
                elapsed, elapsed / len(dataset))

    # Destination files for the decoded predictions.
    output_prediction_file = os.path.join(
        args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = (
        os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
        if args.version_2_with_negative else None)

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Exact-match / F1 against the gold examples.
    return squad_evaluate(examples, predictions)
def main():
    """Train a single (student) sentence-pair classifier — BERT, ALBERT, or ELECTRA —
    on a GLUE-style task via the BasicTrainer, and/or run prediction on the eval sets.

    Reads all settings from the module-level `config` object (side effect:
    `config.parse()` populates `config.args`).
    """
    # Parse arguments from the global config module and log every setting.
    config.parse()
    args = config.args
    for k, v in vars(args).items():
        logger.info(f"{k}:{v}")
    # Set seeds for python/numpy/torch (CPU and all GPUs) for reproducibility.
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    # Arguments check: resolves device and GPU count.
    device, n_gpu = args_check(args)
    os.makedirs(args.output_dir, exist_ok=True)
    # Per-forward-pass batch size after gradient accumulation is factored out.
    forward_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
    args.forward_batch_size = forward_batch_size
    # Prepare the task: processor, output mode, and label set.
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    # e.g. MNLI: ['contradiction', 'entailment', 'neutral']
    label_list = processor.get_labels()
    num_labels = len(label_list)
    # Student model configuration, per architecture.
    if args.model_architecture == "electra":
        # Load ElectraConfig from the transformers package.
        bert_config_S = ElectraConfig.from_json_file(args.bert_config_file_S)
        # 'true' string flags control whether hidden states / attentions are returned.
        bert_config_S.output_hidden_states = (args.output_encoded_layers == 'true')
        bert_config_S.output_attentions = (args.output_attention_layers == 'true')
        # Number of classification labels.
        bert_config_S.num_labels = num_labels
        assert args.max_seq_length <= bert_config_S.max_position_embeddings
    elif args.model_architecture == "albert":
        # Load AlbertConfig from the transformers package.
        bert_config_S = AlbertConfig.from_json_file(args.bert_config_file_S)
        bert_config_S.output_hidden_states = (args.output_encoded_layers == 'true')
        bert_config_S.output_attentions = (args.output_attention_layers == 'true')
        bert_config_S.num_labels = num_labels
        assert args.max_seq_length <= bert_config_S.max_position_embeddings
    else:
        bert_config_S = BertConfig.from_json_file(args.bert_config_file_S)
        assert args.max_seq_length <= bert_config_S.max_position_embeddings
    # Read data.
    train_dataset = None
    eval_datasets = None
    num_train_steps = None
    # Both electra and bert use the BERT tokenizer here.
    tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
    # Load datasets and compute the number of optimizer steps.
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer,
                                                evaluate=False)
        if args.aux_task_name:
            # Optionally concatenate an auxiliary task's training data.
            aux_train_dataset = load_and_cache_examples(args, args.aux_task_name,
                                                        tokenizer, evaluate=False,
                                                        is_aux=True)
            train_dataset = torch.utils.data.ConcatDataset([train_dataset, aux_train_dataset])
        num_train_steps = int(len(train_dataset) / args.train_batch_size) * args.num_train_epochs
        logger.info("训练数据集已加载")
    if args.do_predict:
        eval_datasets = []
        # MNLI evaluates both matched and mismatched dev sets.
        eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
        for eval_task in eval_task_names:
            eval_datasets.append(load_and_cache_examples(args, eval_task, tokenizer,
                                                         evaluate=True))
        logger.info("预测数据集已加载")
    # Build the student model; only one model is trained here (supervised, not distilled),
    # effectively training what could later serve as a teacher.
    if args.model_architecture == "electra":
        model_S = ElectraSPC(bert_config_S)
    elif args.model_architecture == "albert":
        model_S = AlbertSPC(bert_config_S)
    else:
        model_S = BertSPCSimple(bert_config_S, num_labels=num_labels, args=args)
    # Initialize the student's weights from a checkpoint.
    if args.load_model_type == 'bert' and args.model_architecture not in ["electra", "albert"]:
        assert args.init_checkpoint_S is not None
        state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu')
        if args.only_load_embedding:
            # Load only the embedding weights (strip the 'bert.' prefix).
            state_weight = {k[5:]: v for k, v in state_dict_S.items()
                            if k.startswith('bert.embeddings')}
            missing_keys, _ = model_S.bert.load_state_dict(state_weight, strict=False)
            logger.info(f"Missing keys {list(missing_keys)}")
        else:
            # Load all encoder weights (strip the 'bert.' prefix).
            state_weight = {k[5:]: v for k, v in state_dict_S.items()
                            if k.startswith('bert.')}
            missing_keys, _ = model_S.bert.load_state_dict(state_weight, strict=False)
            print(f"missing_keys,注意丢失的参数{missing_keys}")
        logger.info("Model loaded")
    elif args.load_model_type == 'all':
        # Load a fully fine-tuned student checkpoint.
        assert args.tuned_checkpoint_S is not None
        state_dict_S = torch.load(args.tuned_checkpoint_S, map_location='cpu')
        model_S.load_state_dict(state_dict_S)
        logger.info("Model loaded")
    elif args.model_architecture in ["electra", "albert"]:
        assert args.init_checkpoint_S is not None
        state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu')
        missing_keys, unexpected_keys = model_S.load_state_dict(state_dict_S, strict=False)
        logger.info(f"missing keys:{missing_keys}")
        logger.info(f"unexpected keys:{unexpected_keys}")
    else:
        logger.info("Model is randomly initialized.")
    # Move the model to the target device.
    model_S.to(device)
    if args.local_rank != -1 or n_gpu > 1:
        if args.local_rank != -1:
            # Distributed training is not supported by this script.
            raise NotImplementedError
        elif n_gpu > 1:
            model_S = torch.nn.DataParallel(model_S)  # ,output_device=n_gpu-1)
    if args.do_train:
        # Collect all named parameters and split them into decay / no-decay groups.
        params = list(model_S.named_parameters())
        all_trainable_params = divide_parameters(params, lr=args.learning_rate)
        logger.info("要训练的模型参数量组是,包括decay_group和no_decay_group: %d",
                    len(all_trainable_params))
        # Optimizer setup.
        optimizer = BERTAdam(all_trainable_params, lr=args.learning_rate,
                             warmup=args.warmup_proportion, t_total=num_train_steps,
                             schedule=args.schedule,
                             s_opt1=args.s_opt1, s_opt2=args.s_opt2, s_opt3=args.s_opt3)
        logger.info("***** 开始训练 *****")
        logger.info(" 样本数是 = %d", len(train_dataset))
        logger.info(" 前向 batch size = %d", forward_batch_size)
        logger.info(" 训练的steps = %d", num_train_steps)
        # Training configuration.
        train_config = TrainingConfig(
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            ckpt_frequency=args.ckpt_frequency,
            log_dir=args.output_dir,
            output_dir=args.output_dir,
            device=args.device)
        # BasicTrainer performs plain supervised training (no distillation);
        # the trained model_S can later serve as a teacher.
        distiller = BasicTrainer(train_config=train_config,
                                 model=model_S,
                                 adaptor=BertForGLUESimpleAdaptorTraining)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            raise NotImplementedError
        # Training dataloader (drop_last keeps batch shapes uniform).
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                      batch_size=args.forward_batch_size, drop_last=True)
        # Callback evaluates on the eval datasets during training.
        callback_func = partial(predict, eval_datasets=eval_datasets, args=args)
        with distiller:
            # Start training.
            distiller.train(optimizer, scheduler=None, dataloader=train_dataloader,
                            num_epochs=args.num_train_epochs, callback=callback_func)
    if not args.do_train and args.do_predict:
        res = predict(model_S, eval_datasets, step=0, args=args)
        print(res)
def main():
    """Distill an ensemble of fine-tuned BERT teachers into one BERT student on a
    GLUE-style task using MultiTeacherDistiller, and/or run prediction.

    Reads all settings from the module-level `config` object (side effect:
    `config.parse()` populates `config.args`).
    """
    # parse arguments
    config.parse()
    args = config.args
    for k, v in vars(args).items():
        logger.info(f"{k}:{v}")
    # set seeds
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    # arguments check
    device, n_gpu = args_check(args)
    os.makedirs(args.output_dir, exist_ok=True)
    # Per-forward-pass batch size after gradient accumulation is factored out.
    forward_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
    args.forward_batch_size = forward_batch_size
    # Load bert configs for teacher (T) and student (S).
    bert_config_T = BertConfig.from_json_file(args.bert_config_file_T)
    bert_config_S = BertConfig.from_json_file(args.bert_config_file_S)
    assert args.max_seq_length <= bert_config_T.max_position_embeddings
    assert args.max_seq_length <= bert_config_S.max_position_embeddings
    # Prepare GLUE task
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)
    # read data
    train_dataset = None
    eval_datasets = None
    num_train_steps = None
    tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer,
                                                evaluate=False)
        if args.aux_task_name:
            # Optionally concatenate an auxiliary task's training data.
            aux_train_dataset = load_and_cache_examples(args, args.aux_task_name,
                                                        tokenizer, evaluate=False,
                                                        is_aux=True)
            train_dataset = torch.utils.data.ConcatDataset([train_dataset, aux_train_dataset])
        num_train_steps = int(len(train_dataset) / args.train_batch_size) * args.num_train_epochs
    if args.do_predict:
        eval_datasets = []
        # MNLI evaluates both matched and mismatched dev sets.
        eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
        for eval_task in eval_task_names:
            eval_datasets.append(load_and_cache_examples(args, eval_task, tokenizer,
                                                         evaluate=True))
    logger.info("Data loaded")
    # Build Model and load checkpoint
    model_S = BertForGLUESimple(bert_config_S, num_labels=num_labels, args=args)
    # Load teachers: one model per tuned checkpoint, each put in eval mode.
    # NOTE(review): if tuned_checkpoint_Ts is empty/None, model_Ts is never bound;
    # the do_train branches below would then raise NameError. The assert only
    # requires do_predict — confirm do_train and empty teachers can't co-occur.
    if args.tuned_checkpoint_Ts:
        model_Ts = [BertForGLUESimple(bert_config_T, num_labels=num_labels, args=args)
                    for i in range(len(args.tuned_checkpoint_Ts))]
        for model_T, ckpt_T in zip(model_Ts, args.tuned_checkpoint_Ts):
            logger.info("Load state dict %s" % ckpt_T)
            state_dict_T = torch.load(ckpt_T, map_location='cpu')
            model_T.load_state_dict(state_dict_T)
            model_T.eval()
    else:
        assert args.do_predict is True
    # Load student
    if args.load_model_type == 'bert':
        # Initialize the student encoder from a pretrained BERT checkpoint
        # (keys are prefixed with 'bert.'; the prefix is stripped here).
        assert args.init_checkpoint_S is not None
        state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu')
        state_weight = {k[5:]: v for k, v in state_dict_S.items() if k.startswith('bert.')}
        missing_keys, _ = model_S.bert.load_state_dict(state_weight, strict=False)
        assert len(missing_keys) == 0
    elif args.load_model_type == 'all':
        # Load a fully fine-tuned student checkpoint.
        assert args.tuned_checkpoint_S is not None
        state_dict_S = torch.load(args.tuned_checkpoint_S, map_location='cpu')
        model_S.load_state_dict(state_dict_S)
    else:
        logger.info("Model is randomly initialized.")
    # Teachers are only needed on-device for training.
    if args.do_train:
        for model_T in model_Ts:
            model_T.to(device)
    model_S.to(device)
    if args.local_rank != -1 or n_gpu > 1:
        if args.local_rank != -1:
            # Distributed training is not supported by this script.
            raise NotImplementedError
        elif n_gpu > 1:
            if args.do_train:
                model_Ts = [torch.nn.DataParallel(model_T) for model_T in model_Ts]
            model_S = torch.nn.DataParallel(model_S)  # ,output_device=n_gpu-1)
    if args.do_train:
        # parameters: split into decay / no-decay groups for the optimizer.
        params = list(model_S.named_parameters())
        all_trainable_params = divide_parameters(params, lr=args.learning_rate)
        logger.info("Length of all_trainable_params: %d", len(all_trainable_params))
        optimizer = BERTAdam(all_trainable_params, lr=args.learning_rate,
                             warmup=args.warmup_proportion, t_total=num_train_steps,
                             schedule=args.schedule,
                             s_opt1=args.s_opt1, s_opt2=args.s_opt2, s_opt3=args.s_opt3)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_dataset))
        logger.info(" Forward batch size = %d", forward_batch_size)
        logger.info(" Num backward steps = %d", num_train_steps)
        ########### DISTILLATION ###########
        train_config = TrainingConfig(
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            ckpt_frequency=args.ckpt_frequency,
            log_dir=args.output_dir,
            output_dir=args.output_dir,
            device=args.device)
        # Cross-entropy KD loss at the configured temperature.
        distill_config = DistillationConfig(
            temperature=args.temperature,
            kd_loss_type='ce')
        logger.info(f"{train_config}")
        logger.info(f"{distill_config}")
        # The same adaptor maps teacher and student outputs for the distiller.
        adaptor = partial(BertForGLUESimpleAdaptor, no_logits=False, no_mask=False)
        distiller = MultiTeacherDistiller(train_config=train_config,
                                          distill_config=distill_config,
                                          model_T=model_Ts, model_S=model_S,
                                          adaptor_T=adaptor, adaptor_S=adaptor)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            raise NotImplementedError
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                      batch_size=args.forward_batch_size, drop_last=True)
        # Callback evaluates on the eval datasets during training.
        callback_func = partial(predict, eval_datasets=eval_datasets, args=args)
        with distiller:
            distiller.train(optimizer, scheduler=None, dataloader=train_dataloader,
                            num_epochs=args.num_train_epochs, callback=callback_func)
    if not args.do_train and args.do_predict:
        res = predict(model_S, eval_datasets, step=0, args=args)
        print(res)
def main():
    """Entry point for the multi-task (aggression/attack/toxicity) classifier:
    parses CLI arguments, sets up device/logging/seed, builds Multi_Model, then
    optionally trains and/or tests from a hard-coded checkpoint path.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default="./data",
        type=str,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default="bert",
                        type=str,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default="./roberta-base/roberta-base-pytorch_model.bin",
        type=str,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--task_name",
        default="multi_task",
        type=str,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()))
    parser.add_argument(
        "--output_dir",
        default='./check_points',
        type=str,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="./roberta-base/roberta-base-config.json",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="./roberta-base/roberta-base-vocab.json",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    # NOTE(review): store_true flags with default=True can never be turned off
    # from the command line — passing the flag is a no-op. Confirm intent.
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=True,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test",
                        default=False,
                        action='store_true',
                        help="Whether to run test on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        default=True,
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=16,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=16,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-6,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=10.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=2975,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=2975,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    # Loss hyper-parameters for the multi-task objective.
    parser.add_argument("--mPos", default=1.5, type=float)
    parser.add_argument("--mNeg", default=1.5, type=float)
    parser.add_argument("--gamma", default=0.05, type=float)
    # Task-selection switches.
    # NOTE(review): no action/type given — any CLI value arrives as a non-empty
    # string, so e.g. `--all_task False` is still truthy. Confirm these are only
    # ever toggled by editing the defaults.
    parser.add_argument("--all_task", default=True)
    parser.add_argument("--aggression_attack_task", default=False)
    parser.add_argument("--aggression_toxicity_task", default=False)
    parser.add_argument("--attack_toxicity_task", default=False)
    args = parser.parse_args()

    # Refuse to clobber an existing non-empty output dir unless overridden.
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda:0" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        #args.n_gpu = torch.cuda.device_count()
        # Deliberately pinned to a single GPU (device_count() call kept above).
        args.n_gpu = 1
    else:
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        # NOTE(review): torch.device raises ValueError when the type string
        # already contains an index AND an explicit index is passed; this
        # should likely be torch.device("cuda", args.local_rank). The branch
        # only runs with local_rank != -1 — confirm and fix before using DDP.
        device = torch.device("cuda:0", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Resolve the task: processor, output mode, and label set.
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    '''
    bert_config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels, finetuning_task=args.task_name)
    bert_model = model_class.from_pretrained(args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path), config=bert_config)
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.bert_vocab,
        do_lower_case=args.do_lower_case)
    '''
    # Hard-coded local paths per model type override the CLI defaults.
    if args.model_type == "bert":
        args.config_name = "./bert-base-uncased/bert-base-uncased-config.json"
        args.tokenizer_name = "./bert-base-uncased/bert-base-uncased-vocab.txt"
        args.model_name_or_path = "./bert-base-uncased/bert-base-uncased-pytorch_model.bin"
    elif args.model_type == "xlnet":
        args.config_name = "./xlnet-base-cased/xlnet-base-cased-config.json"
        args.tokenizer_name = "./xlnet-base-cased/xlnet-base-cased-spiece.model"
        args.model_name_or_path = "./xlnet-base-cased/xlnet-base-cased-pytorch_model.bin"
    elif args.model_type == "roberta":
        args.config_name = "./roberta-base/roberta-base-config.json"
        args.tokenizer_name = "./roberta-base/roberta-base-vocab.json"
        args.model_name_or_path = "./roberta-base/roberta-base-pytorch_model.bin"
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name)
    # NOTE(review): args.bert_vocab is never defined by this parser; the
    # fallback branch would AttributeError if tokenizer_name were ever empty.
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.bert_vocab)
    model = Multi_Model(args, config)
    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab
    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # test
    results = {}
    if args.do_test and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.tokenizer_name, do_lower_case=args.do_lower_case)
        logger.info("test the model")
        # NOTE(review): checkpoint path is hard-coded; verify it matches save_steps.
        checkpoint = './check_points/checkpoint-200/model.pt'
        prefix = checkpoint.split(
            '/')[-1] if checkpoint.find('checkpoint') != -1 else ""
        model.load_state_dict(torch.load(checkpoint))
        model.to(args.device)
        aggression_results, attack_results, toxicity_results = evaluate(
            args, model, tokenizer, prefix=prefix)
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate the multi-task model on the dev set and return per-task metric
    dicts. Which logits/labels are collected depends on exactly one of the
    mutually-exclusive task flags (all_task / aggression_attack_task /
    aggression_toxicity_task / attack_toxicity_task) being set — assumed to be
    enforced by the caller.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = (args.task_name, )
    eval_outputs_dirs = (args.output_dir, )
    # Accumulated metrics per task across eval tasks.
    aggression_results = {}
    attack_results = {}
    toxicity_results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args,
                                               eval_task,
                                               tokenizer,
                                               evaluate=True)
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(
            eval_dataset) if args.local_rank == -1 else DistributedSampler(
                eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        # Task-name tensors fed to the model: token ids of the task words for
        # bert/xlnet, fixed indices [0]/[1]/[2] for roberta.
        # NOTE(review): if model_type is neither bert/xlnet nor roberta, the
        # *_tensor names are unbound and the inputs dict below raises NameError.
        if args.model_type in ["bert", "xlnet"]:
            aggression_tensor = torch.tensor(tokenizer.encode("aggression"),
                                             dtype=torch.long).to(args.device)
            attack_tensor = torch.tensor(tokenizer.encode("attack"),
                                         dtype=torch.long).to(args.device)
            toxicity_tensor = torch.tensor(tokenizer.encode("toxicity"),
                                           dtype=torch.long).to(args.device)
        elif args.model_type == "roberta":
            aggression_tensor = torch.tensor([0],
                                             dtype=torch.long).to(args.device)
            attack_tensor = torch.tensor([1], dtype=torch.long).to(args.device)
            toxicity_tensor = torch.tensor([2],
                                           dtype=torch.long).to(args.device)
        # Character-level ids of the task names.
        char_vocab = get_char_vocab()
        aggression_char_ids = char2ids("aggression", char_vocab)
        attack_char_ids = char2ids("attack", char_vocab)
        toxicity_char_ids = char2ids("toxicity", char_vocab)
        aggression_char_tenor = torch.tensor(aggression_char_ids,
                                             dtype=torch.long).to(args.device)
        attack_char_tenor = torch.tensor(attack_char_ids,
                                         dtype=torch.long).to(args.device)
        toxicity_char_tenor = torch.tensor(toxicity_char_ids,
                                           dtype=torch.long).to(args.device)
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        # Per-task prediction/label accumulators (None until the first batch).
        aggression_preds = None
        attack_preds = None
        toxicity_preds = None
        aggression_out_label_ids = None
        attack_out_label_ids = None
        toxicity_out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'aggression_labels': batch[3],
                    'attack_labels': batch[4],
                    'toxicity_labels': batch[5],
                    'aggression_tensor': aggression_tensor,
                    'attack_tensor': attack_tensor,
                    'toxicity_tensor': toxicity_tensor,
                    'aggression_char_tensor': aggression_char_tenor,
                    'attack_char_tensor': attack_char_tenor,
                    'toxicity_char_tensor': toxicity_char_tenor
                }
                if args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if args.model_type in [
                        'bert', 'xlnet'
                    ] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
                # The model's return arity differs per task combination.
                if args.all_task:
                    aggression_logits, attack_logits, toxicity_logits, tmp_eval_loss, _, _, _ = model(
                        **inputs)
                if args.aggression_attack_task:
                    aggression_logits, attack_logits, tmp_eval_loss = model(
                        **inputs)
                if args.aggression_toxicity_task:
                    aggression_logits, toxicity_logits, tmp_eval_loss = model(
                        **inputs)
                if args.attack_toxicity_task:
                    attack_logits, toxicity_logits, tmp_eval_loss = model(
                        **inputs)
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            # First batch initializes the accumulators; later batches append.
            if aggression_preds is None and attack_preds is None and toxicity_preds is None:
                if args.all_task:
                    aggression_preds = aggression_logits.detach().cpu().numpy()
                    aggression_out_label_ids = inputs[
                        'aggression_labels'].detach().cpu().numpy()
                    attack_preds = attack_logits.detach().cpu().numpy()
                    attack_out_label_ids = inputs['attack_labels'].detach(
                    ).cpu().numpy()
                    toxicity_preds = toxicity_logits.detach().cpu().numpy()
                    toxicity_out_label_ids = inputs['toxicity_labels'].detach(
                    ).cpu().numpy()
                elif args.aggression_attack_task:
                    aggression_preds = aggression_logits.detach().cpu().numpy()
                    aggression_out_label_ids = inputs[
                        'aggression_labels'].detach().cpu().numpy()
                    attack_preds = attack_logits.detach().cpu().numpy()
                    attack_out_label_ids = inputs['attack_labels'].detach(
                    ).cpu().numpy()
                elif args.aggression_toxicity_task:
                    aggression_preds = aggression_logits.detach().cpu().numpy()
                    aggression_out_label_ids = inputs[
                        'aggression_labels'].detach().cpu().numpy()
                    toxicity_preds = toxicity_logits.detach().cpu().numpy()
                    toxicity_out_label_ids = inputs['toxicity_labels'].detach(
                    ).cpu().numpy()
                elif args.attack_toxicity_task:
                    attack_preds = attack_logits.detach().cpu().numpy()
                    attack_out_label_ids = inputs['attack_labels'].detach(
                    ).cpu().numpy()
                    toxicity_preds = toxicity_logits.detach().cpu().numpy()
                    toxicity_out_label_ids = inputs['toxicity_labels'].detach(
                    ).cpu().numpy()
            else:
                if args.all_task:
                    aggression_preds = np.append(
                        aggression_preds,
                        aggression_logits.detach().cpu().numpy(),
                        axis=0)
                    aggression_out_label_ids = np.append(
                        aggression_out_label_ids,
                        inputs['aggression_labels'].detach().cpu().numpy(),
                        axis=0)
                    attack_preds = np.append(
                        attack_preds,
                        attack_logits.detach().cpu().numpy(),
                        axis=0)
                    attack_out_label_ids = np.append(
                        attack_out_label_ids,
                        inputs['attack_labels'].detach().cpu().numpy(),
                        axis=0)
                    toxicity_preds = np.append(
                        toxicity_preds,
                        toxicity_logits.detach().cpu().numpy(),
                        axis=0)
                    toxicity_out_label_ids = np.append(
                        toxicity_out_label_ids,
                        inputs['toxicity_labels'].detach().cpu().numpy(),
                        axis=0)
                elif args.aggression_attack_task:
                    aggression_preds = np.append(
                        aggression_preds,
                        aggression_logits.detach().cpu().numpy(),
                        axis=0)
                    aggression_out_label_ids = np.append(
                        aggression_out_label_ids,
                        inputs['aggression_labels'].detach().cpu().numpy(),
                        axis=0)
                    attack_preds = np.append(
                        attack_preds,
                        attack_logits.detach().cpu().numpy(),
                        axis=0)
                    attack_out_label_ids = np.append(
                        attack_out_label_ids,
                        inputs['attack_labels'].detach().cpu().numpy(),
                        axis=0)
                elif args.aggression_toxicity_task:
                    aggression_preds = np.append(
                        aggression_preds,
                        aggression_logits.detach().cpu().numpy(),
                        axis=0)
                    aggression_out_label_ids = np.append(
                        aggression_out_label_ids,
                        inputs['aggression_labels'].detach().cpu().numpy(),
                        axis=0)
                    toxicity_preds = np.append(
                        toxicity_preds,
                        toxicity_logits.detach().cpu().numpy(),
                        axis=0)
                    toxicity_out_label_ids = np.append(
                        toxicity_out_label_ids,
                        inputs['toxicity_labels'].detach().cpu().numpy(),
                        axis=0)
                elif args.attack_toxicity_task:
                    attack_preds = np.append(
                        attack_preds,
                        attack_logits.detach().cpu().numpy(),
                        axis=0)
                    attack_out_label_ids = np.append(
                        attack_out_label_ids,
                        inputs['attack_labels'].detach().cpu().numpy(),
                        axis=0)
                    toxicity_preds = np.append(
                        toxicity_preds,
                        toxicity_logits.detach().cpu().numpy(),
                        axis=0)
                    toxicity_out_label_ids = np.append(
                        toxicity_out_label_ids,
                        inputs['toxicity_labels'].detach().cpu().numpy(),
                        axis=0)
        eval_loss = eval_loss / nb_eval_steps
        # Reduce logits to class predictions and compute per-task metrics.
        if args.all_task:
            aggression_preds = np.argmax(aggression_preds, axis=1)
            attack_preds = np.argmax(attack_preds, axis=1)
            toxicity_preds = np.argmax(toxicity_preds, axis=1)
            aggression_result = compute_metrics(eval_task, aggression_preds,
                                                aggression_out_label_ids)
            aggression_results.update(aggression_result)
            attack_result = compute_metrics(eval_task, attack_preds,
                                            attack_out_label_ids)
            attack_results.update(attack_result)
            toxicity_result = compute_metrics(eval_task, toxicity_preds,
                                              toxicity_out_label_ids)
            toxicity_results.update(toxicity_result)
            logger.info(
                "***** Eval aggression results {} *****".format(prefix))
            for key in sorted(aggression_result.keys()):
                logger.info(" %s = %s", key, str(aggression_result[key]))
            logger.info("***** Eval attack results {} *****".format(prefix))
            for key in sorted(attack_result.keys()):
                logger.info(" %s = %s", key, str(attack_result[key]))
            logger.info("***** Eval toxicity results {} *****".format(prefix))
            for key in sorted(toxicity_result.keys()):
                logger.info(" %s = %s", key, str(toxicity_result[key]))
        elif args.aggression_attack_task:
            aggression_preds = np.argmax(aggression_preds, axis=1)
            attack_preds = np.argmax(attack_preds, axis=1)
            aggression_result = compute_metrics(eval_task, aggression_preds,
                                                aggression_out_label_ids)
            aggression_results.update(aggression_result)
            attack_result = compute_metrics(eval_task, attack_preds,
                                            attack_out_label_ids)
            attack_results.update(attack_result)
            logger.info(
                "***** Eval aggression results {} *****".format(prefix))
            for key in sorted(aggression_result.keys()):
                logger.info(" %s = %s", key, str(aggression_result[key]))
            logger.info("***** Eval attack results {} *****".format(prefix))
            for key in sorted(attack_result.keys()):
                logger.info(" %s = %s", key, str(attack_result[key]))
        elif args.aggression_toxicity_task:
            aggression_preds = np.argmax(aggression_preds, axis=1)
            toxicity_preds = np.argmax(toxicity_preds, axis=1)
            aggression_result = compute_metrics(eval_task, aggression_preds,
                                                aggression_out_label_ids)
            aggression_results.update(aggression_result)
            toxicity_result = compute_metrics(eval_task, toxicity_preds,
                                              toxicity_out_label_ids)
            toxicity_results.update(toxicity_result)
            logger.info(
                "***** Eval aggression results {} *****".format(prefix))
            for key in sorted(aggression_result.keys()):
                logger.info(" %s = %s", key, str(aggression_result[key]))
            logger.info("***** Eval toxicity results {} *****".format(prefix))
            for key in sorted(toxicity_result.keys()):
                logger.info(" %s = %s", key, str(toxicity_result[key]))
        elif args.attack_toxicity_task:
            attack_preds = np.argmax(attack_preds, axis=1)
            toxicity_preds = np.argmax(toxicity_preds, axis=1)
            attack_result = compute_metrics(eval_task, attack_preds,
                                            attack_out_label_ids)
            attack_results.update(attack_result)
            toxicity_result = compute_metrics(eval_task, toxicity_preds,
                                              toxicity_out_label_ids)
            toxicity_results.update(toxicity_result)
            logger.info("***** Eval attack results {} *****".format(prefix))
            for key in sorted(attack_result.keys()):
                logger.info(" %s = %s", key, str(attack_result[key]))
            logger.info("***** Eval toxicity results {} *****".format(prefix))
            for key in sorted(toxicity_result.keys()):
                logger.info(" %s = %s", key, str(toxicity_result[key]))
    # Return arity mirrors the active task combination.
    if args.all_task:
        return aggression_results, attack_results, toxicity_results
    elif args.aggression_attack_task:
        return aggression_results, attack_results
    elif args.aggression_toxicity_task:
        return aggression_results, toxicity_results
    elif args.attack_toxicity_task:
        return attack_results, toxicity_results
def main():
    """Fine-tune a masked-LM model with FGM adversarial training.

    Parses all configuration from command-line flags, optionally trains
    (``--do_train``) with an FGM embedding-perturbation pass per step,
    periodically evaluates on the dev set (``--do_eval``) checkpointing on
    best dev loss, and finally reports average loss on the test split
    (``--do_test``). Runs as a script entry point; returns nothing.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_name_or_path",
                        default=None,
                        type=str,
                        required=True,
                        help="")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--train_language",
                        default=None,
                        type=str,
                        required=True)
    parser.add_argument("--model_type", type=str, required=True)

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=256,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded."
    )
    parser.add_argument("--num_labels", default=2, type=int)
    parser.add_argument("--overwrite_cache", action='store_true')
    parser.add_argument("--config_name", default=None, type=str)
    parser.add_argument("--tokenizer_name", default=None, type=str)
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument('--from_tf',
                        action='store_true',
                        help='whether load tensorflow weights')
    parser.add_argument("--do_eval_train",
                        action='store_true',
                        help="Whether to run eval on the train set.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--local_rank", default=-1, type=int)
    parser.add_argument("--learning_rate",
                        default=1e-4,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--eval_steps", default=-1, type=int, help="")
    parser.add_argument("--train_steps", default=-1, type=int, help="")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help="random seed for initialization")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Set seed
    set_seed(args)
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=args.num_labels)
    # NOTE(review): tokenizer is constructed directly from a local folder
    # rather than via from_pretrained -- confirm tokenizer_class accepts
    # the `tokens_folder` keyword.
    tokenizer = tokenizer_class(tokens_folder=args.model_name_or_path)
    # tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
    logger.info("Training/evaluation parameters %s", args)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Training
    if args.do_train:
        # Prepare model (optionally from TF weights) and wrap it with FGM for
        # adversarial perturbation of the embedding layer.
        model = model_class.from_pretrained(args.model_name_or_path,
                                            from_tf=args.from_tf,
                                            config=config)
        fgm = FGM(model)
        model.to(args.device)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        args.train_batch_size = args.per_gpu_train_batch_size * max(
            1, args.n_gpu)
        train_dataset = load_and_cache_examples(args, tokenizer, is_training=1)
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size //
                                      args.gradient_accumulation_steps)
        num_train_optimization_steps = args.train_steps

        # Prepare optimizer: exempt biases and LayerNorm weights from decay.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)

        global_step = 0
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_dataset))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Gradient Accumulation steps = %d",
                    args.gradient_accumulation_steps)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        min_loss = math.inf
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps),
                   total=num_train_optimization_steps)
        # Cycle the dataloader so training runs for exactly train_steps steps,
        # independent of the dataset length.
        train_dataloader = cycle(train_dataloader)

        output_dir = args.output_dir + "eval_results_{}_{}_{}_{}_{}".format(
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length), str(args.learning_rate),
            str(args.train_batch_size),
            # str(args.train_language),
            str(args.train_steps))
        # FIX: was `try: os.makedirs(output_dir) except: pass` -- the bare
        # except silently swallowed every error; exist_ok covers the only
        # expected failure (directory already present).
        os.makedirs(output_dir, exist_ok=True)
        output_eval_file = os.path.join(output_dir, 'eval_result.txt')
        with open(output_eval_file, "w") as writer:
            writer.write('*' * 80 + '\n')

        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, output_ids = batch

            # Clean (non-adversarial) forward/backward pass.
            outputs = model(input_ids=input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            masked_lm_labels=output_ids)
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            train_loss = round(
                tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1),
                4)
            bar.set_description("loss {}".format(train_loss))
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            loss.backward()

            # Adversarial training (FGM): perturb the embeddings, backprop the
            # adversarial loss on top of the clean gradients, then restore.
            fgm.attack()  # add adversarial perturbation to the embeddings
            loss_adv = model(input_ids=input_ids,
                             token_type_ids=segment_ids,
                             attention_mask=input_mask,
                             masked_lm_labels=output_ids)[0]
            if args.n_gpu > 1:
                loss_adv = loss_adv.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss_adv = loss_adv / args.gradient_accumulation_steps
            loss_adv.backward()  # accumulate adversarial grads onto clean ones
            fgm.restore()  # restore the original embedding parameters

            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                # scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
            if (step + 1) % (args.eval_steps *
                             args.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info(" %s = %s", 'global_step', str(global_step))
                logger.info(" %s = %s", 'train loss', str(train_loss))
            if args.do_eval and (step + 1) % (
                    args.eval_steps * args.gradient_accumulation_steps) == 0:
                if args.do_eval_train:
                    file_list = ['train.csv', 'dev.csv']
                else:
                    file_list = ['dev.csv']
                # NOTE(review): the same dev split (is_training=2) is loaded
                # for every entry of file_list -- confirm whether 'train.csv'
                # was meant to evaluate on the training split instead.
                for file in file_list:
                    dev_dataset = load_and_cache_examples(args,
                                                          tokenizer,
                                                          is_training=2)
                    dev_sampler = SequentialSampler(dev_dataset)
                    dev_dataloader = DataLoader(
                        dev_dataset,
                        sampler=dev_sampler,
                        batch_size=args.eval_batch_size)
                    logger.info("***** Running evaluation *****")
                    logger.info(" Num examples = %d", len(dev_dataset))
                    logger.info(" Batch size = %d", args.eval_batch_size)
                    model.eval()
                    eval_loss = 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, output_ids in tqdm(
                            dev_dataloader):
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        output_ids = output_ids.to(device)
                        with torch.no_grad():
                            tmp_eval_loss, logits = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                masked_lm_labels=output_ids)
                        if args.n_gpu > 1:
                            tmp_eval_loss = tmp_eval_loss.mean(
                            )  # mean() to average on multi-gpu.
                        if args.gradient_accumulation_steps > 1:
                            tmp_eval_loss = tmp_eval_loss / args.gradient_accumulation_steps
                        eval_loss += tmp_eval_loss.item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
                    result = {
                        'eval_loss': eval_loss,
                        'global_step': global_step,
                        'train_loss': train_loss
                    }
                    if 'dev' in file:
                        with open(output_eval_file, "a") as writer:
                            writer.write(file + '\n')
                            for key in sorted(result.keys()):
                                logger.info(" %s = %s", key, str(result[key]))
                                writer.write("%s = %s\n" %
                                             (key, str(result[key])))
                            writer.write('*' * 80)
                            writer.write('\n')
                    # Checkpoint only when the dev loss improves.
                    if eval_loss < min_loss and 'dev' in file:
                        print("=" * 80)
                        print("Min Loss", eval_loss)
                        print("Saving Model......")
                        min_loss = eval_loss
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = os.path.join(
                            output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)
        with open(output_eval_file, "a") as writer:
            writer.write('Min Loss: %f' % min_loss)

    if args.do_test:
        # FIX: `args.do_train == False` -> idiomatic `not args.do_train`.
        # If training ran, reload from the run-specific output_dir computed
        # above; otherwise read the checkpoint from --output_dir directly.
        if not args.do_train:
            output_dir = args.output_dir
        model = model_class.from_pretrained(output_dir, config=config)
        model.to(args.device)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        dev_dataset = load_and_cache_examples(args, tokenizer, is_training=3)
        dev_sampler = SequentialSampler(dev_dataset)
        dev_dataloader = DataLoader(dev_dataset,
                                    sampler=dev_sampler,
                                    batch_size=args.eval_batch_size)
        logger.info(" *** Run Prediction ***")
        logger.info(" Num examples = %d", len(dev_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        model.eval()
        test_loss = 0
        for input_ids, input_mask, segment_ids, output_ids in tqdm(
                dev_dataloader):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            output_ids = output_ids.to(device)
            with torch.no_grad():
                tmp_eval_loss = model(
                    input_ids=input_ids,
                    token_type_ids=segment_ids,
                    attention_mask=input_mask,
                    masked_lm_labels=output_ids)[0].detach().cpu().numpy()
            if args.n_gpu > 1:
                tmp_eval_loss = tmp_eval_loss.mean(
                )  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                tmp_eval_loss = tmp_eval_loss / args.gradient_accumulation_steps
            test_loss += tmp_eval_loss.item()
        avg_test_loss = round(test_loss / len(dev_dataloader), 4)
        logger.info('predict loss:{}'.format(str(avg_test_loss)))
def main():
    """Script entry point: fine-tune and/or evaluate a question-answering
    model on SQuAD-style data.

    Parses command-line flags, sets up (possibly distributed) CUDA devices,
    loads config/tokenizer/model via the Auto* classes, optionally trains
    (``--do_train``), saves and reloads the trained model, then evaluates
    the requested checkpoints (``--do_eval``).

    Returns:
        dict: evaluation metrics keyed by metric name (suffixed with the
        checkpoint's global step when several checkpoints are evaluated).
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_TYPES),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the .json files for the task."
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help="The input training file. If a data dir is specified, will look for the file there"
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="The input evaluation file. If a data dir is specified, will look for the file there"
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
    )
    parser.add_argument(
        "--version_2_with_negative",
        action="store_true",
        help="If true, the SQuAD examples contain some that do not have an answer.",
    )
    parser.add_argument(
        "--null_score_diff_threshold",
        type=float,
        default=0.0,
        help="If null_score - best_non_null is greater than the threshold predict null.",
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.",
    )
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
    )
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help="The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.",
    )
    parser.add_argument(
        "--verbose_logging",
        action="store_true",
        help="If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.",
    )
    parser.add_argument(
        "--lang_id",
        default=0,
        type=int,
        help="language id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)",
    )
    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")
    args = parser.parse_args()

    # A stride this large relative to max_seq_length can break feature building.
    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    # Refuse to clobber a non-empty output dir unless explicitly allowed.
    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging (only the main process logs at INFO level)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
        use_fast=False,  # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling
    )
    model = AutoModelForQuestionAnswering.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    # Training
    if args.do_train:
        # train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
        # Features are streamed from an HDF5-backed reader instead of an
        # in-memory TensorDataset.
        feature_reader = load_and_cache_examples(args, tokenizer, evaluate=False)
        train_dataset = HDF5Dataset(feature_reader)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForQuestionAnswering.from_pretrained(args.output_dir)  # , force_download=True)

        # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling
        # So we use use_fast=False here for now until Fast-tokenizer-compatible-examples are out
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case, use_fast=False)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        if args.do_train:
            logger.info("Loading checkpoints saved during training for evaluation")
            checkpoints = [args.output_dir]
            if args.eval_all_checkpoints:
                checkpoints = list(
                    os.path.dirname(c)
                    for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
                )
        else:
            logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
            checkpoints = [args.model_name_or_path]

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)  # , force_download=True)
            model.to(args.device)

            # Evaluate
            result = evaluate(args, model, tokenizer, prefix=global_step)

            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)

    logger.info("Results: {}".format(results))

    return results
def evaluate(args, model, tokenizer, prefix: str = ""):
    """Run SQuAD-style evaluation of `model` on the cached eval features.

    Loads examples/features through an HDF5-backed feature reader, runs the
    model batch by batch collecting `SquadResult`s, writes prediction (and
    optionally null-odds) JSON files into ``args.output_dir``, and scores the
    predictions with ``squad_evaluate``.

    Args:
        args: parsed command-line namespace (device, batch sizes, model_type,
            prediction-formatting options, ...).
        model: question-answering model already placed on ``args.device``.
        tokenizer: tokenizer matching the model; forwarded to the prediction
            post-processing helpers.
        prefix: string appended to output file names and log headers
            (typically a checkpoint's global step).

    Returns:
        dict of SQuAD metrics (exact match / F1, etc.) from ``squad_evaluate``.
    """
    # dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
    # Features are streamed from an HDF5-backed reader instead of an
    # in-memory TensorDataset.
    feature_reader = load_and_cache_examples(args, tokenizer, evaluate=True)
    dataset = HDF5Dataset(feature_reader)
    examples = feature_reader.load_examples()
    features = feature_reader.get_features()

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            # These architectures do not use token_type_ids.
            if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update(
                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                    )
            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # Map the i-th item of the batch back to its cached feature.
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs.to_tuple()]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        # Config may live on the wrapped module under DataParallel.
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results