def train(model):
    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}".format(config.device, n_gpu))
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    model_it_self = model.module if hasattr(model, 'module') else model
    global_step = 0
    num_train_steps = data_generator.get_num_train_steps()

    # No weight decay for bias and LayerNorm (gamma/beta) parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
    optimizer = BERTAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=config.warmup_proportion,
                         t_total=num_train_steps)

    dev_loader = data_generator.get_dev_loader()
    train_loader = data_generator.get_train_loader()

    for epoch in trange(int(config.num_train_epochs), desc="Epoch"):
        for step, batch in enumerate(tqdm(train_loader, desc="Iteration")):
            batch = tuple(t.to(config.device) for t in batch)
            loss, output = model(batch, global_step, -1)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu
            if config.gradient_accumulation_steps > 1:
                loss = loss / config.gradient_accumulation_steps
            # opt.zero_grad()
            loss.backward()
            if (step + 1) % config.gradient_accumulation_steps == 0:
                optimizer.step()
                model.zero_grad()
                global_step += 1
                # if global_step % config.print_interval == 0:
                #     print_model_result(model_it_self.get_result())
                if global_step % config.eval_interval == 0 or global_step == num_train_steps:
                    if config.do_eval:
                        print("\nepoch:{} global:{}\t".format(epoch, global_step))
                        eval_result = model_eval(model_it_self, dev_loader, data_type='dev')
                        # Save the model, using dev loss as the selection metric.
                        save_best_model(model_it_self, eval_result['loss'], data_type='dev')
                        if config.SAVE_USE_ACCURACY:
                            save_best_model(model_it_self, eval_result['accuracy'], data_type='dev',
                                            use_accuracy=config.SAVE_USE_ACCURACY)
                        shutil.copy(config.train_best_accuracy_model,
                                    os.path.join(config.output_dir, 'best_ac_model.bin'))
                        shutil.copy(config.train_best_loss_model,
                                    os.path.join(config.output_dir, 'best_loss_model.bin'))
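
# The learning-rate schedule behind BERTAdam's `warmup` and `t_total` arguments is not shown
# in this file. A minimal sketch of the linear-warmup-then-linear-decay multiplier that
# BERTAdam-style optimizers typically apply (assuming `progress` = global_step / t_total):

def warmup_linear_multiplier(progress, warmup=0.1):
    """Linear warmup to the peak LR, then linear decay toward zero."""
    if progress < warmup:
        return progress / warmup
    return max(0.0, 1.0 - progress)

# Example: with warmup_proportion=0.1 and t_total=1000, the LR ramps up over the first
# 100 steps; at step 500 the multiplier is 0.5.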
def main(): parser = argparse.ArgumentParser() BERT_DIR = "./model/uncased_L-12_H-768_A-12/" ## Required parameters parser.add_argument("--bert_config_file", default=BERT_DIR+"bert_config.json", \ type=str, help="The config json file corresponding to the pre-trained BERT model. " "This specifies the model architecture.") parser.add_argument("--vocab_file", default=BERT_DIR+"vocab.txt", type=str, \ help="The vocabulary file that the BERT model was trained on.") parser.add_argument("--output_dir", default="out", type=str, \ help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--train_file", type=str, \ help="SQuAD json for training. E.g., train-v1.1.json", \ default="") parser.add_argument("--predict_file", type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json", \ default="") parser.add_argument("--init_checkpoint", type=str, help="Initial checkpoint (usually from a pre-trained BERT model).", \ default=BERT_DIR+"pytorch_model.bin") parser.add_argument( "--do_lower_case", default=True, action='store_true', help="Whether to lower case the input text. Should be True for uncased " "models and False for cased models.") parser.add_argument( "--max_seq_length", default=300, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=128, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=10.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--iterations_per_loop", default=1000, type=int, help="How many steps to make in each estimator call.") parser.add_argument( "--n_best_size", default=3, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. 
" "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( "--accumulate_gradients", type=int, default=1, help= "Number of steps to accumulate gradient on (divide the batch_size and accumulate)" ) parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument('--eval_period', type=int, default=2000) parser.add_argument('--max_n_answers', type=int, default=5) parser.add_argument('--merge_query', type=int, default=-1) parser.add_argument('--reduce_layers', type=int, default=-1) parser.add_argument('--reduce_layers_to_tune', type=int, default=-1) parser.add_argument('--only_comp', action="store_true", default=False) parser.add_argument('--train_subqueries_file', type=str, default="") #500 parser.add_argument('--predict_subqueries_file', type=str, default="") #500 parser.add_argument('--prefix', type=str, default="") #500 parser.add_argument('--model', type=str, default="qa") #500 parser.add_argument('--pooling', type=str, default="max") parser.add_argument('--debug', action="store_true", default=False) parser.add_argument('--output_dropout_prob', type=float, default=0) parser.add_argument('--wait_step', type=int, default=30) parser.add_argument('--with_key', action="store_true", default=False) parser.add_argument('--add_noise', action="store_true", default=False) args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.accumulate_gradients < 1: raise ValueError( "Invalid accumulate_gradients parameter: {}, should be >= 1". format(args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if not args.predict_file: raise ValueError( "If `do_train` is True, then `predict_file` must be specified." ) if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
) bert_config = BertConfig.from_json_file(args.bert_config_file) if args.do_train and args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): logger.info("Output directory () already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir, exist_ok=True) tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None eval_dataloader, eval_examples, eval_features, _ = get_dataloader( logger=logger, args=args, input_file=args.predict_file, subqueries_file=args.predict_subqueries_file, is_training=False, batch_size=args.predict_batch_size, num_epochs=1, tokenizer=tokenizer) if args.do_train: train_dataloader, train_examples, _, num_train_steps = get_dataloader( logger=logger, args=args, \ input_file=args.train_file, \ subqueries_file=args.train_subqueries_file, \ is_training=True, batch_size=args.train_batch_size, num_epochs=args.num_train_epochs, tokenizer=tokenizer) #a = input() if args.model == 'qa': model = BertForQuestionAnswering(bert_config, 4) metric_name = "F1" elif args.model == 'classifier': if args.reduce_layers != -1: bert_config.num_hidden_layers = args.reduce_layers model = BertClassifier(bert_config, 2, args.pooling) metric_name = "F1" elif args.model == "span-predictor": if args.reduce_layers != -1: bert_config.num_hidden_layers = args.reduce_layers if args.with_key: Model = BertForQuestionAnsweringWithKeyword else: Model = BertForQuestionAnswering model = Model(bert_config, 2) metric_name = "Accuracy" else: raise NotImplementedError() if args.init_checkpoint is not None and args.do_predict and \ len(args.init_checkpoint.split(','))>1: assert args.model == "qa" model = [model] for i, checkpoint in enumerate(args.init_checkpoint.split(',')): if i > 0: model.append(BertForQuestionAnswering(bert_config, 4)) print("Loading from", checkpoint) state_dict = torch.load(checkpoint, map_location='cpu') filter = lambda x: x[7:] if x.startswith('module.') else x state_dict = {filter(k): v for (k, v) in state_dict.items()} model[-1].load_state_dict(state_dict) model[-1].to(device) else: if args.init_checkpoint is not None: print("Loading from", args.init_checkpoint) state_dict = torch.load(args.init_checkpoint, map_location='cpu') if args.reduce_layers != -1: state_dict = {k:v for k, v in state_dict.items() \ if not '.'.join(k.split('.')[:3]) in \ ['encoder.layer.{}'.format(i) for i in range(args.reduce_layers, 12)]} if args.do_predict: filter = lambda x: x[7:] if x.startswith('module.') else x state_dict = {filter(k): v for (k, v) in state_dict.items()} model.load_state_dict(state_dict) else: model.bert.load_state_dict(state_dict) if args.reduce_layers_to_tune != -1: model.bert.embeddings.required_grad = False n_layers = 12 if args.reduce_layers == -1 else args.reduce_layers for i in range(n_layers - args.reduce_layers_to_tune): model.bert.encoder.layer[i].require_grad = False model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [{ 'params': [p for n, p in model.named_parameters() if n 
not in no_decay], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 best_f1 = 0 wait_step = 0 model.train() global_step = 0 stop_training = False for epoch in range(int(args.num_train_epochs)): for step, batch in tqdm(enumerate(train_dataloader)): global_step += 1 batch = [t.to(device) for t in batch] loss = model(batch, global_step) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if global_step % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients model.zero_grad() if global_step % args.eval_period == 0: model.eval() f1 = predict(args, model, eval_dataloader, eval_examples, eval_features, \ device, write_prediction=False) logger.info("%s: %.3f on epoch=%d" % (metric_name, f1 * 100.0, epoch)) if best_f1 < f1: logger.info("Saving model with best %s: %.3f -> %.3f on epoch=%d" % \ (metric_name, best_f1*100.0, f1*100.0, epoch)) model_state_dict = { k: v.cpu() for (k, v) in model.state_dict().items() } torch.save( model_state_dict, os.path.join(args.output_dir, "best-model.pt")) model = model.cuda() best_f1 = f1 wait_step = 0 stop_training = False else: wait_step += 1 if best_f1 > 0.1 and wait_step == args.wait_step: stop_training = True model.train() if stop_training: break elif args.do_predict: if type(model) == list: model = [m.eval() for m in model] else: model.eval() f1 = predict(args, model, eval_dataloader, eval_examples, eval_features, device) logger.info("Final %s score: %.3f%%" % (metric_name, f1 * 100.0))
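
# Checkpoints saved from a DataParallel / DistributedDataParallel-wrapped model prefix every
# state_dict key with "module.", so they cannot be loaded directly into a bare model. The
# `filter = lambda x: x[7:] ...` pattern above strips that prefix; a self-contained sketch of
# the same idea (model class and checkpoint path are placeholders):

import torch

def load_unwrapped_state_dict(model, checkpoint_path):
    """Load a checkpoint, removing any DataParallel 'module.' key prefix first."""
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    state_dict = {k[len('module.'):] if k.startswith('module.') else k: v
                  for k, v in state_dict.items()}
    model.load_state_dict(state_dict)
    return model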
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_config_file", default=None, type=str, required=True, help= "The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument( "--do_lower_case", default=False, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--accumulate_gradients", type=int, default=1, help= "Number of steps to accumulate gradient on (divide the batch_size and accumulate)" ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." 
) args = parser.parse_args() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.accumulate_gradients < 1: raise ValueError( "Invalid accumulate_gradients parameter: {}, should be >= 1". format(args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) model = BertForSequenceClassification(bert_config, len(label_list)) if args.init_checkpoint is not None: model.bert.load_state_dict( torch.load(args.init_checkpoint, map_location='cpu')) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [{ 'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], 
dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients model.zero_grad() global_step += 1 if args.do_eval: eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
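
# The accuracy() helper called in the eval loop above is defined elsewhere in the original
# script. A minimal sketch consistent with how it is used here (a per-batch count of correct
# predictions, later divided by nb_eval_examples):

import numpy as np

def accuracy(logits, label_ids):
    """Number of correct predictions in the batch (argmax over the class dimension)."""
    preds = np.argmax(logits, axis=1)
    return int((preds == label_ids).sum())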
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained BERT model. " "This specifies the model architecture.") parser.add_argument( "--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument( "--do_lower_case", default=True, action='store_true', help="Whether to lower case the input text. Should be True for uncased " "models and False for cased models.") parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--iterations_per_loop", default=1000, type=int, help="How many steps to make in each estimator call.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. 
" "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
) bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = read_squad_examples(input_file=args.train_file, is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) model = BertForQuestionAnswering(bert_config) if args.init_checkpoint is not None: model.bert.load_state_dict( torch.load(args.init_checkpoint, map_location='cpu')) if not args.optimize_on_cpu: model.to(device) no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [{ 'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 if args.do_train: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.optimize_on_cpu: model.to('cpu') optimizer.step() # We have accumulated enought gradients model.zero_grad() if args.optimize_on_cpu: model.to(device) global_step += 1 if args.do_predict: eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
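
# RawResult, used above to collect per-feature start/end logits before write_predictions(),
# is defined elsewhere in the original script; it is simply a named tuple along these lines:

import collections

RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])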
def _train_labeler(args): if args.data_setup == 'joint': train_gen_list, val_gen_list, crowd_dev_gen, elmo, bert, vocab = get_joint_datasets(args) else: train_fname = args.train_data dev_fname = args.dev_data print(train_fname, dev_fname) data_gens, elmo = get_datasets([(train_fname, 'train', args.goal), (dev_fname, 'dev', args.goal)], args) train_gen_list = [(args.goal, data_gens[0])] val_gen_list = [(args.goal, data_gens[1])] train_log = SummaryWriter(os.path.join(constant.EXP_ROOT, args.model_id, "log", "train")) validation_log = SummaryWriter(os.path.join(constant.EXP_ROOT, args.model_id, "log", "validation")) tensorboard = TensorboardWriter(train_log, validation_log) if args.model_type == 'labeler': print('==> Labeler') model = denoising_models.Labeler(args, constant.ANSWER_NUM_DICT[args.goal]) elif args.model_type == 'filter': print('==> Filter') model = denoising_models.Filter(args, constant.ANSWER_NUM_DICT[args.goal]) else: print('Invalid model type: -model_type ' + args.model_type) raise NotImplementedError model.cuda() total_loss = 0 batch_num = 0 best_macro_f1 = 0. start_time = time.time() init_time = time.time() if args.bert: if args.bert_param_path: print('==> Loading BERT from ' + args.bert_param_path) model.bert.load_state_dict(torch.load(args.bert_param_path, map_location='cpu')) no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [ {'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01}, {'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0} ] optimizer = BERTAdam(optimizer_parameters, lr=args.bert_learning_rate, warmup=args.bert_warmup_proportion, t_total=-1) # TODO: else: optimizer = optim.Adam(model.parameters(), lr=args.learning_rate) #optimizer = optim.SGD(model.parameters(), lr=1., momentum=0.) if args.load: load_model(args.reload_model_name, constant.EXP_ROOT, args.model_id, model, optimizer) for idx, m in enumerate(model.modules()): logging.info(str(idx) + '->' + str(m)) while True: batch_num += 1 # single batch composed of all train signal passed by. for (type_name, data_gen) in train_gen_list: try: batch = next(data_gen) batch, _ = to_torch(batch) except StopIteration: logging.info(type_name + " finished at " + str(batch_num)) print('Done!') torch.save({'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}, '{0:s}/{1:s}.pt'.format(constant.EXP_ROOT, args.model_id)) return optimizer.zero_grad() loss, output_logits, cls_logits = model(batch, type_name) loss.backward() total_loss += loss.item() optimizer.step() if batch_num % args.log_period == 0 and batch_num > 0: gc.collect() cur_loss = float(1.0 * loss.clone().item()) elapsed = time.time() - start_time train_loss_str = ('|loss {0:3f} | at {1:d}step | @ {2:.2f} ms/batch'.format(cur_loss, batch_num, elapsed * 1000 / args.log_period)) start_time = time.time() print(train_loss_str) logging.info(train_loss_str) tensorboard.add_train_scalar('train_loss_' + type_name, cur_loss, batch_num) if batch_num % args.eval_period == 0 and batch_num > 0: output_index = get_output_index(output_logits, threshold=args.threshold) gold_pred_train = get_gold_pred_str(output_index, batch['y'].data.cpu().clone(), args.goal) print(gold_pred_train[:10]) accuracy = sum([set(y) == set(yp) for y, yp in gold_pred_train]) * 1.0 / len(gold_pred_train) train_acc_str = '{1:s} Train accuracy: {0:.1f}%'.format(accuracy * 100, type_name) if cls_logits is not None: cls_accuracy = sum([(1. if pred > 0. else 0.) 
== gold for pred, gold in zip(cls_logits, batch['y_cls'].data.cpu().numpy())]) / float(cls_logits.size()[0]) cls_tp = sum([(1. if pred > 0. else 0.) == 1. and gold == 1. for pred, gold in zip(cls_logits, batch['y_cls'].data.cpu().numpy())]) cls_precision = cls_tp / float(sum([1. if pred > 0. else 0. for pred in cls_logits])) cls_recall = cls_tp / float(sum(batch['y_cls'].data.cpu().numpy())) cls_f1 = f1(cls_precision, cls_recall) train_cls_acc_str = '{1:s} Train cls accuracy: {0:.2f}% P: {2:.3f} R: {3:.3f} F1: {4:.3f}'.format(cls_accuracy * 100, type_name, cls_precision, cls_recall, cls_f1) print(train_acc_str) if cls_logits is not None: print(train_cls_acc_str) logging.info(train_acc_str) tensorboard.add_train_scalar('train_acc_' + type_name, accuracy, batch_num) if args.goal != 'onto': for (val_type_name, val_data_gen) in val_gen_list: if val_type_name == type_name: eval_batch, _ = to_torch(next(val_data_gen)) evaluate_batch(batch_num, eval_batch, model, tensorboard, val_type_name, args, args.goal) if batch_num % args.eval_period == 0 and batch_num > 0 and args.data_setup == 'joint': # Evaluate Loss on the Turk Dev dataset. print('---- eval at step {0:d} ---'.format(batch_num)) crowd_eval_loss, macro_f1 = evaluate_data(batch_num, 'crowd/dev_tree.json', model, tensorboard, "open", args, elmo, bert, vocab=vocab) if best_macro_f1 < macro_f1: best_macro_f1 = macro_f1 save_fname = '{0:s}/{1:s}_best.pt'.format(constant.EXP_ROOT, args.model_id) torch.save({'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}, save_fname) print( 'Total {0:.2f} minutes have passed, saving at {1:s} '.format((time.time() - init_time) / 60, save_fname)) if batch_num % args.eval_period == 0 and batch_num > 0 and args.goal == 'onto': # Evaluate Loss on the Turk Dev dataset. print('---- OntoNotes: eval at step {0:d} ---'.format(batch_num)) crowd_eval_loss, macro_f1 = evaluate_data(batch_num, args.dev_data, model, tensorboard, args.goal, args, elmo) if batch_num % args.save_period == 0 and batch_num > 0: save_fname = '{0:s}/{1:s}_{2:d}.pt'.format(constant.EXP_ROOT, args.model_id, batch_num) torch.save({'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}, save_fname) print( 'Total {0:.2f} minutes have passed, saving at {1:s} '.format((time.time() - init_time) / 60, save_fname)) # Training finished! torch.save({'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}, '{0:s}/{1:s}.pt'.format(constant.EXP_ROOT, args.model_id))
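
# The f1() helper used for the classifier metrics above is defined elsewhere; the standard
# harmonic-mean definition it is assumed to implement:

def f1(precision, recall):
    """Harmonic mean of precision and recall; 0 when both are 0."""
    if precision + recall == 0.0:
        return 0.0
    return 2.0 * precision * recall / (precision + recall)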
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained BERT model. " "This specifies the model architecture.") parser.add_argument( "--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument( "--do_lower_case", default=True, action='store_true', help="Whether to lower case the input text. Should be True for uncased " "models and False for cased models.") parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--encoder_learning_rate", default=5e-5, type=float, help="The initial learning rate for Bert encoder.") parser.add_argument( "--decoder_learning_rate", default=5e-3, type=float, help="The initial learning rate for generative decoder.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--iterations_per_loop", default=1000, type=int, help="How many steps to make in each estimator call.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. 
" "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=128, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' ) parser.add_argument('--reduce_dim', default=0, type=int, help='reduce the dimention of decoder.') parser.add_argument('--decoder_layer', default=2, type=int, help='the layers of self-attentive-decoder.') parser.add_argument('--load_trained_model', default=False, action='store_true', help='reduce the dimention of decoder.') parser.add_argument('--postfix', default='', help='the name of log file to be saved.') parser.add_argument('--share_input_output_embed', default=False, action='store_true', help='share input and output embeddings of decoder.') args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}" .format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
) bert_config = BertConfig.from_json_file(args.bert_config_file) bert_config.reduce_dim = args.reduce_dim bert_config.decoder_layer = args.decoder_layer bert_config.share_embedd = args.share_input_output_embed if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (args.max_seq_length, bert_config.max_position_embeddings)) # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) idx_to_vocab = load_idx_to_token(args.vocab_file) train_examples = None num_train_steps = None if args.do_train: print('Preprocess the train dataset.') train_examples = read_msmarco_examples(args, input_file=args.train_file, is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, p_max_length=P_MAX_LENGTH, q_max_length=Q_MAX_LENGTH, a_max_length=A_MAX_LENGTH) train_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) train_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) train_segment_ids = torch.tensor( [f.segment_ids for f in train_features], dtype=torch.long) train_answer_ids = torch.tensor([f.answer_ids for f in train_features], dtype=torch.long) train_answer_mask = torch.tensor( [f.answer_mask for f in train_features], dtype=torch.long) train_data = TensorDataset(train_input_ids, train_input_mask, train_segment_ids, train_answer_ids, train_answer_mask) print('Preprocess the dev dataset.') eval_examples = read_msmarco_examples(args, input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features(examples=eval_examples, tokenizer=tokenizer, p_max_length=P_MAX_LENGTH, q_max_length=Q_MAX_LENGTH, a_max_length=A_MAX_LENGTH) eval_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) eval_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) eval_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) eval_answer_ids = torch.tensor([f.answer_ids for f in eval_features], dtype=torch.long) eval_answer_mask = torch.tensor([f.answer_mask for f in eval_features], dtype=torch.long) # eval_example_id = torch.tensor([int(f.example_id) for f in eval_features], dtype=torch.long) eval_example_id = torch.tensor([int(0) for f in eval_features], dtype=torch.long) eval_data = TensorDataset(eval_input_ids, eval_input_mask, eval_segment_ids, eval_answer_ids, eval_answer_mask, eval_example_id) # Prepare model # model = BertForQuestionAnswering(bert_config) model = BertWithMultiPointer(bert_config) # print('state_dict is\n', [k for k in model.state_dict().keys()]) # exit(0) if args.init_checkpoint is not None: print('Loading the pre-trained BERT parameters') state_dict = torch.load(args.init_checkpoint, map_location='cpu') model.bert.load_state_dict(state_dict) decoder_embedding = [ "word_embeddings.weight", "position_embeddings.weight", "token_type_embeddings.weight", "LayerNorm.gamma", "LayerNorm.beta" ] decoder_embedding = ['embeddings.' 
+ n for n in decoder_embedding] new_state_dict = OrderedDict() for k, v in state_dict.items(): if k in decoder_embedding: name = k[11:] new_state_dict[name] = v model.decoderEmbedding.load_state_dict(new_state_dict) if args.load_trained_model: print('Loading the pre-trained Model') model_path = args.output_dir + '/models/best_params.pt.' + args.postfix state_dict = torch.load(model_path) new_state_dict = OrderedDict() for k, v in state_dict.items(): name = k[7:] # remove module. new_state_dict[name] = v model.load_state_dict(new_state_dict) if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # model = DataParallelModel(model) # criterion = DataParallelCriterion(torch.nn.NLLLoss(ignore_index=0)) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ { 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and ( 'bert' in n or 'decoderEmbedding' in n) ], 'weight_decay_rate': 0.01, 'lr': args.encoder_learning_rate }, { 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and ( 'bert' not in n and 'decoderEmbedding' not in n) ], 'weight_decay_rate': 0.01, 'lr': args.decoder_learning_rate }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) and ( 'bert' in n or 'decoderEmbedding' in n) ], 'weight_decay_rate': 0.0, 'lr': args.encoder_learning_rate }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) and ( 'bert' not in n and 'decoderEmbedding' not in n) ], 'weight_decay_rate': 0.0, 'lr': args.decoder_learning_rate }, ] optimizer = BERTAdam(optimizer_grouped_parameters, lr=args.encoder_learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 if args.do_train: eval_loss, eval_ppl = eval_the_model(args, model, eval_data) best_ppl = eval_ppl log_path = os.path.join(args.output_dir, 'log.txt.' + args.postfix) with open(log_path, 'w') as f: f.write('Before train, the average loss on val set is: ' + str(eval_loss.float()) + ' and the average ppl on val set is: ' + str(eval_ppl)) f.write('\n\n') model.train() if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) for epoch in trange(int(args.num_train_epochs), desc="Epoch"): train_loss = 0 train_batch = 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, answer_ids, answer_mask = batch loss, _ = model(input_ids, segment_ids, input_mask, answer_ids=answer_ids, answer_mask=answer_mask) # pred = [] # target = [] # for item in model_ret: # pred.append(item[0][0].log()) # target.append(item[0][1]) # target = torch.cat(target) # loss = criterion(pred, target) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 train_loss += loss.detach().cpu() train_batch += 1 train_loss /= train_batch train_ppl = math.pow(math.e, train_loss) eval_loss, eval_ppl = eval_the_model(args, model, eval_data) with open(log_path, 'a') as f: f.write('In epoch-' + str(epoch) + ' the average loss on train set is: ' + str(train_loss.float()) + ' the average ppl on train set is: ' + str(train_ppl)) f.write('\n') f.write('In epoch-' + str(epoch) + ' the average loss on eval set is: ' + str(eval_loss.float()) + ' the average ppl on eval set is: ' + str(eval_ppl)) f.write('\n\n') if eval_ppl < best_ppl: model_save_path = os.path.join( args.output_dir, 'models', 'best_params.pt.' + args.postfix) if os.path.exists(model_save_path): os.remove(model_save_path) torch.save(model.state_dict(), model_save_path) if args.do_predict: if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() logger.info("Start evaluating") to_save = [] for input_ids, input_mask, segment_ids, answer_ids, answer_mask, example_id in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) answer_ids = answer_ids.to(device) with torch.no_grad(): def decode_to_vocab(batch, isContext=False): with torch.cuda.device_of(batch): batch = batch.tolist() batch = [[idx_to_vocab[ind] for ind in ex] for ex in batch] def trim(s, t): sentence = [] for w in s: if w == t: break sentence.append(w) return sentence batch = [trim(ex, '[SEP]') for ex in batch] def filter_special(tok): return tok not in [ '[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '[EOS]' ] batch = [filter(filter_special, ex) for ex in batch] return [' '.join(ex) if ex != None else '' for ex in batch] model_ret = model(input_ids, segment_ids, input_mask, answer_ids=answer_ids) outs = model_ret[1] if type(outs) == tuple: outs, answer_scores = outs outs = outs.data answer_scores = answer_scores.tolist() ground_trurhs = decode_to_vocab(answer_ids) decode_answers = decode_to_vocab(outs) # decode_contexts = decode_to_vocab(input_ids, isContext=True) for i, (answer, ground_trurh, answer_score, ex_id) in enumerate( zip(decode_answers, ground_trurhs, answer_scores, example_id)): answer_length = len(answer.split(' ')) to_save.append('\t'.join([ ground_trurh.replace(" ##", ""), answer.replace(" ##", ""), str(1. 
* answer_score / answer_length), str(ex_id), '\n' ])) output_answer_file = os.path.join(args.output_dir, 'predictions', 'predictions.txt.' + args.postfix) with open(output_answer_file, "w") as f: f.writelines(to_save)
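# A minimal sketch (not part of the original script) of the four optimizer
# groups built above: parameters are split along two axes -- weight-decay vs.
# no-decay ('bias'/'gamma'/'beta'), and encoder ('bert'/'decoderEmbedding')
# vs. decoder -- so the encoder and decoder can use different learning rates.
# `model` and `args` are assumed to be the same objects used above.
def build_param_groups(model, args):
    no_decay = ['bias', 'gamma', 'beta']
    groups = []
    for use_decay in (True, False):
        for is_encoder in (True, False):
            params = [
                p for n, p in model.named_parameters()
                if (not any(nd in n for nd in no_decay)) == use_decay
                and ('bert' in n or 'decoderEmbedding' in n) == is_encoder
            ]
            groups.append({
                'params': params,
                'weight_decay_rate': 0.01 if use_decay else 0.0,
                'lr': args.encoder_learning_rate if is_encoder else args.decoder_learning_rate,
            })
    return groups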
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_config_file", default=None, type=str, required=True, help= "The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument( "--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--load_all", default=False, action='store_true', help="Whether to load all parameter or only for bert part.") parser.add_argument( "--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument( "--do_lower_case", default=False, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--multi", default=False, help="Whether to add adapter modules", action='store_true') parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument( "--optim", default='normal', help= "Whether to split up the optimiser between adapters and not adapters.") parser.add_argument( "--sample", default='rr', help="How to sample tasks, other options 'prop', 'sqrt' or 'anneal'") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--h_aug", default="n/a", help="Size of hidden state for adapters..") parser.add_argument("--tasks", default="all", help="Which set of tasks to train on.") parser.add_argument( "--task_id", default=1, help="ID of single task to train on if using that setting.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--freeze", default=False, action='store_true', help="Freeze base network weights") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) args = parser.parse_args() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "rte": RTEProcessor, "sts": STSProcessor, "sst": SSTProcessor, "qqp": QQPProcessor, "qnli": QNLIProcessor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid accumulate_gradients parameter: {}, should be >= 1". format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) if args.tasks == 'inf': task_names = ['qnli', 'mnli', 'rte'] data_dirs = ['QNLI', 'MNLI', 'RTE'] elif args.tasks == 'all': task_names = [ 'cola', 'mrpc', 'mnli', 'rte', 'sts', 'sst', 'qqp', 'qnli' ] data_dirs = [ 'CoLA', 'MRPC', 'MNLI', 'RTE', 'STS-B', 'SST-2', 'QQP', 'QNLI' ] elif args.tasks == 'single': task_names = [ 'cola', 'mrpc', 'mnli', 'rte', 'sts', 'sst', 'qqp', 'qnli' ] data_dirs = [ 'CoLA', 'MRPC', 'MNLI', 'RTE', 'STS-B', 'SST-2', 'QQP', 'QNLI' ] task_names = [task_names[int(args.task_id)]] data_dirs = [data_dirs[int(args.task_id)]] if task_names[0] not in processors: raise ValueError("Task not found: %s" % (task_name)) processor_list = [processors[task_name]() for task_name in task_names] label_list = [processor.get_labels() for processor in processor_list] tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None num_tasks = len(task_names) if args.do_train: train_examples = [ processor.get_train_examples(args.data_dir + data_dir) for processor, data_dir in zip(processor_list, data_dirs) ] 
num_train_steps = int( len(train_examples[0]) / args.train_batch_size * args.num_train_epochs) if args.tasks == 'all': total_tr = 300 * num_tasks * args.num_train_epochs else: total_tr = int(0.5 * num_train_steps) if args.tasks == 'all': steps_per_epoch = args.gradient_accumulation_steps * 300 * num_tasks else: steps_per_epoch = int(num_train_steps / (2. * args.num_train_epochs)) bert_config.num_tasks = num_tasks if args.h_aug is not 'n/a': bert_config.hidden_size_aug = int(args.h_aug) model = BertForMultiTask(bert_config, [len(labels) for labels in label_list]) if args.init_checkpoint is not None: # load all parameter including the classification and model patch if args.load_all: model.load_state_dict( torch.load(args.init_checkpoint, map_location='cpu')) elif args.multi: partial = torch.load(args.init_checkpoint, map_location='cpu') model_dict = model.bert.state_dict() update = {} for n, p in model_dict.items(): if 'aug' in n or 'mult' in n: update[n] = p if 'pooler.mult' in n and 'bias' in n: update[n] = partial['pooler.dense.bias'] if 'pooler.mult' in n and 'weight' in n: update[n] = partial['pooler.dense.weight'] else: update[n] = partial[n] model.bert.load_state_dict(update) else: model.bert.load_state_dict( torch.load(args.init_checkpoint, map_location='cpu')) if args.freeze: for n, p in model.bert.named_parameters(): if 'aug' in n or 'classifier' in n or 'mult' in n or 'gamma' in n or 'beta' in n: continue p.requires_grad = False model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.optim == 'normal': no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=total_tr) else: no_decay = ['bias', 'gamma', 'beta'] base = ['attn'] optimizer_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in base) ], 'weight_decay_rate': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in base) ], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=total_tr) optimizer_parameters_mult = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in base) ], 'weight_decay_rate': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in base) ], 'weight_decay_rate': 0.0 }] optimizer_mult = BERTAdam(optimizer_parameters_mult, lr=3e-4, warmup=args.warmup_proportion, t_total=total_tr) if args.do_eval: eval_loaders = [] for i, task in enumerate(task_names): eval_examples = processor_list[i].get_dev_examples(args.data_dir + data_dirs[i]) eval_features = convert_examples_to_features( eval_examples, label_list[i], args.max_seq_length, tokenizer, task) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = 
torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) if task != 'sts': all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=torch.long) else: all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=torch.float32) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_loaders.append( DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)) global_step = 0 if args.do_train: loaders = [] logger.info(" Num Tasks = %d", len(train_examples)) for i, task in enumerate(task_names): train_features = convert_examples_to_features( train_examples[i], label_list[i], args.max_seq_length, tokenizer, task) logger.info("***** training data for %s *****", task) logger.info(" Data size = %d", len(train_features)) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in train_features], dtype=torch.long) if task != 'sts': all_label_ids = torch.tensor( [f.label_id for f in train_features], dtype=torch.long) else: all_label_ids = torch.tensor( [f.label_id for f in train_features], dtype=torch.float32) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) loaders.append( iter( DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size))) total_params = sum(p.numel() for p in model.parameters()) logger.info(" Num param = {}".format(total_params)) loaders = [cycle(it) for it in loaders] model.train() best_score = 0. if args.sample == 'sqrt' or args.sample == 'prop': probs = [6680, 2865, 306798, 1945, 4491, 52616, 284257, 84715] if args.sample == 'prop': alpha = 1. if args.sample == 'sqrt': alpha = 0.5 probs = [p**alpha for p in probs] tot = sum(probs) probs = [p / tot for p in probs] task_id = 0 epoch = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): if args.sample == 'anneal': probs = [6680, 2865, 306798, 1945, 4491, 52616, 284257, 84715] alpha = 1. - 0.8 * epoch / (args.num_train_epochs - 1) probs = [p**alpha for p in probs] tot = sum(probs) probs = [p / tot for p in probs] tr_loss = [0. for i in range(num_tasks)] nb_tr_examples, nb_tr_steps = 0, 0 for step in range(steps_per_epoch): if args.sample != 'rr': if step % args.gradient_accumulation_steps == 0: task_id = np.random.choice(8, p=probs) else: task_id = task_id % num_tasks batch = next(loaders[task_id]) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, task_id, task_names[task_id], label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss[task_id] += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if step % 1000 < num_tasks: logger.info("Task: {}, Step: {}".format(task_id, step)) logger.info("Loss: {}".format(tr_loss[task_id] / nb_tr_steps)) if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients if args.optim != 'normal': optimizer_mult.step() model.zero_grad() global_step += 1 if not args.sample: task_id += 1 epoch += 1 ev_acc = 0. for i, task in enumerate(task_names): ev_acc += do_eval(model, logger, args, device, tr_loss[i], nb_tr_steps, global_step, processor_list[i], label_list[i], tokenizer, eval_loaders[i], task, i) logger.info("Total acc: {}".format(ev_acc)) if ev_acc > best_score: best_score = ev_acc model_dir = os.path.join(args.output_dir, "best_model.pth") torch.save(model.state_dict(), model_dir) logger.info("Best Total acc: {}".format(best_score)) ev_acc = 0. for i, task in enumerate(task_names): ev_acc += do_eval(model, logger, args, device, tr_loss[i], nb_tr_steps, global_step, processor_list[i], label_list[i], tokenizer, eval_loaders[i], task, i) logger.info("Total acc: {}".format(ev_acc))
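# A minimal sketch (assumed helper, not in the original) of the task-sampling
# schemes used above: 'prop' samples proportionally to dataset size, 'sqrt'
# flattens the distribution with alpha=0.5, and 'anneal' moves alpha from 1.0
# towards 0.2 across epochs so small tasks are visited more often late in
# training; 'rr' simply cycles through tasks round-robin.
import numpy as np

def task_sampling_probs(sizes, sample, epoch=0, num_epochs=2):
    if sample == 'prop':
        alpha = 1.0
    elif sample == 'sqrt':
        alpha = 0.5
    elif sample == 'anneal':
        alpha = 1.0 - 0.8 * epoch / (num_epochs - 1)
    else:
        raise ValueError("round-robin ('rr') does not use sampling probabilities")
    probs = [s ** alpha for s in sizes]
    total = sum(probs)
    return [p / total for p in probs]

# Example with the GLUE training-set sizes hard-coded above:
# probs = task_sampling_probs([6680, 2865, 306798, 1945, 4491, 52616, 284257, 84715], 'sqrt')
# task_id = np.random.choice(len(probs), p=probs)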
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default='/hdd/lujunyu/dataset/multi_turn_corpus/ecommerce/', type=str, required=False, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_config_file", default=BERT_BASE_DIR + 'bert_config.json', type=str, required=False, help= "The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument("--task_name", default='ECD', type=str, required=False, help="The name of the task to train.") parser.add_argument( "--vocab_file", default=BERT_BASE_DIR + 'vocab.txt', type=str, required=False, help="The vocabulary file that the BERT model was trained on.") parser.add_argument( "--output_dir", default='/hdd/lujunyu/model/chatbert/ecd_without_pretraining/', type=str, required=False, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--do_lower_case", default=False, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=True, action='store_true', help="Whether to run training.") parser.add_argument("--do_test", default=True, action='store_true', help="Whether to run eval on the test set.") parser.add_argument("--train_batch_size", default=500, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=200, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=1e-3, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=5.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.0, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--save_checkpoints_steps", default=200, type=int, help="How often to save the model checkpoint.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=1, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." 
) args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_json_file(args.bert_config_file) bert_config.num_hidden_layers = 2 # bert_config.num_attention_heads = 6 if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): if args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.". format(args.output_dir)) else: os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=args.do_lower_case) train_dataset = ECDDatasetForSP(file_path=os.path.join( args.data_dir, "train.txt"), max_seq_length=args.max_seq_length, tokenizer=tokenizer) eval_dataset = ECDDatasetForSP(file_path=os.path.join( args.data_dir, "dev.txt"), max_seq_length=args.max_seq_length, tokenizer=tokenizer) train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=args.train_batch_size, sampler=RandomSampler(train_dataset), num_workers=4) eval_dataloader = torch.utils.data.DataLoader( eval_dataset, batch_size=args.eval_batch_size, sampler=SequentialSampler(eval_dataset), num_workers=4) num_train_steps = None if args.do_train: num_train_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) model = BertForSequenceClassification(bert_config, num_labels=2) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # no_decay = ['bias', 'gamma', 'beta'] # optimizer_parameters = [ # {'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01}, # {'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0} # ] optimizer_parameters = [{ 'params': [p for n, p in model.named_parameters()], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 best_acc = 0.0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) model.train() for _ in trange(int(args.num_train_epochs), 
desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients model.zero_grad() global_step += 1 if (step + 1) % args.save_checkpoints_steps == 0: ### Evaluate at the end of epoches model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 logits_all = [] for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model( input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() for i in range(len(logits)): logits_all += [logits[i]] tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1)) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy } output_eval_file = os.path.join(args.output_dir, "eval_results_dev.txt") with open(output_eval_file, "a") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) output_eval_file = os.path.join(args.output_dir, "logits_dev.txt") with open(output_eval_file, "w") as f: for i in range(len(logits_all)): for j in range(len(logits_all[i])): f.write(str(logits_all[i][j])) if j == len(logits_all[i]) - 1: f.write("\n") else: f.write(" ") ### Save the best checkpoint if best_acc < eval_accuracy: try: ### Remove 'module' prefix when using DataParallel state_dict = model.module.state_dict() except AttributeError: state_dict = model.state_dict() torch.save(state_dict, os.path.join(args.output_dir, "model.pt")) best_acc = eval_accuracy logger.info('Saving the best model in {}'.format( os.path.join(args.output_dir, "model.pt"))) model.train()
def main(): #rs_writer = SummaryWriter("./log") parser = argparse.ArgumentParser() viz = visdom.Visdom() # Required parameters parser.add_argument("--task_name", default=None, type=str, required=True, choices=[ "semeval_QA_EXPT", "semeval_QA_T", "travel_experience", "semeval_single" ], help="Name of the task to train") parser.add_argument("--vocab_file", default=None, type=str, required=True, help="The path of BERT pre-trained vocab file") parser.add_argument("--init_checkpoint", default=None, type=str, required=True, help="The path of BERT pre-trained .ckpt file") parser.add_argument("--bert_config_file", default=None, type=str, required=True, help="The path of BERT .json file") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory of training result") parser.add_argument("--data_dir", default=None, type=str, required=True, help="The path of training dataset") # Other parameters parser.add_argument("--train_batch_size", default=32, type=int, help="The size of training batch") parser.add_argument("--eval_batch_size", default=8, type=int, help="The size of evaluation batch") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum sentence length of input after WordPiece tonkenization\n" "Greater than the max will be truncated, smaller than the max will be padding" ) parser.add_argument("--local_rank", default=-1, type=int, help="Local_rank for distributed training on gpus") parser.add_argument("--seed", default=42, type=int, help="Random seed for initialization") parser.add_argument( "--accumulate_gradients", default=1, type=int, help= "Number of steps to accumulate gradient on (divide the batch_size and accumulate)" ) parser.add_argument( '--gradient_accumulation_steps', default=1, type=int, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument('--save_steps', type=int, default=100, help="Save checkpoint every X updates steps.") parser.add_argument( '--layers', type=int, nargs='+', default=[-2], help="choose the layers that used for downstream tasks, " "-2 means use pooled output, -1 means all layer," "else means the detail layers. default is -2") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="The number of epochs on training data") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The learning rate of model") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for\n" "0.1 means 10% of training set") parser.add_argument('--layer_learning_rate_decay', type=float, default=0.95) parser.add_argument('--layer_learning_rate', type=float, nargs='+', default=[2e-5] * 12, help="learning rate in each group") parser.add_argument("--do_train", default=False, action="store_true", help="Whether training the data or not") parser.add_argument("--do_eval", default=False, action="store_true", help="Whether evaluating the data or not") parser.add_argument("--do_predict", default=False, action="store_true", help="Whether predicting the data or not") parser.add_argument( "--do_lower_case", default=False, action="store_true", help= "To lower case the input text. True for uncased models, False for cased models." 
) parser.add_argument("--no_cuda", default=False, action="store_true", help="Whether use the GPU device or not") parser.add_argument("--discr", default=False, action='store_true', help="Whether to do discriminative fine-tuning.") parser.add_argument('--pooling_type', default=None, type=str, choices=[None, 'mean', 'max']) args = parser.parse_args() viz = visdom.Visdom() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.accumulate_gradients < 1: raise ValueError( "Invalid accumulate_gradients parameter: {}, should be >= 1". format(args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) # prepare dataloaders processors = { "semeval_QA_EXPT": Semeval_QA_EXPT_Processor, "semeval_QA_T": Semeval_QA_T_Processor, "travel_experience": Travel_exp_data, "semeval_single": Semeval_single_Processor } processor = processors[args.task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) # training set train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) # test set if args.do_eval: test_examples = processor.get_test_examples(args.data_dir) test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) all_input_mask = 
torch.tensor([f.input_mask for f in test_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) test_dataloader = DataLoader(test_data, batch_size=args.eval_batch_size, shuffle=False) # model and optimizer layer version """ model = BertForSequenceClassification(bert_config, len(label_list), args.layers, pooling=args.pooling_type) if args.init_checkpoint is not None: model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ['bias', 'gamma', 'beta'] if args.discr: if len(args.layer_learning_rate) > 1: groups = [(f'layer.{i}.', args.layer_learning_rate[i]) for i in range(12)] else: lr = args.layer_learning_rate[0] groups = [(f'layer.{i}.', lr * pow(args.layer_learning_rate_decay, 11 - i)) for i in range(12)] group_all = [f'layer.{i}.' for i in range(12)] no_decay_optimizer_parameters = [] decay_optimizer_parameters = [] for g, l in groups: no_decay_optimizer_parameters.append( { 'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in [g])], 'weight_decay_rate': 0.01, 'lr': l } ) decay_optimizer_parameters.append( { 'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in [g])], 'weight_decay_rate': 0.0, 'lr': l } ) group_all_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], 'weight_decay_rate': 0.0}, ] optimizer_parameters = no_decay_optimizer_parameters + decay_optimizer_parameters + group_all_parameters else: optimizer_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) """ model = BertForSequenceClassification(bert_config, len(label_list)) if args.init_checkpoint is not None: model.bert.load_state_dict( torch.load(args.init_checkpoint, map_location='cpu')) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) # train output_log_file = os.path.join(args.output_dir, "log.txt") print("output_log_file=", output_log_file) with open(output_log_file, "w") as writer: if 
args.do_eval: writer.write( "epoch\tglobal_step\tloss\ttest_loss\ttest_accuracy\n") else: writer.write("epoch\tglobal_step\tloss\n") global_step = 0 epoch = 0 best_epoch, best_accuracy = 0, 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): epoch += 1 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, label_ids) print(loss.item()) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() viz.line([loss.item()], [global_step], win='tr_loss', update='append') nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients model.zero_grad() global_step += 1 # Save the checkpoint model after each N steps if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: save_output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(save_output_dir): os.makedirs(save_output_dir) torch.save( model.state_dict(), os.path.join(save_output_dir, "training_args.bin")) viz.line([optimizer.get_lr()[0]], [global_step - 1], win="lr", update="append") # eval_test if args.do_eval: model.eval() test_loss, test_accuracy = 0, 0 nb_test_steps, nb_test_examples = 0, 0 with open( os.path.join(args.output_dir, "test_ep_" + str(epoch) + ".txt"), "w") as f_test: for input_ids, input_mask, segment_ids, label_ids in tqdm( test_dataloader, desc="Testing"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_test_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) logits = F.softmax(logits, dim=-1) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() outputs = np.argmax(logits, axis=1) for output_i in range(len(outputs)): f_test.write(str(outputs[output_i])) for ou in logits[output_i]: f_test.write(" " + str(ou)) f_test.write("\n") tmp_test_accuracy = np.sum(outputs == label_ids) viz.line([tmp_test_loss.item()], [nb_test_steps], win='eval_loss', update='append') test_loss += tmp_test_loss.mean().item() test_accuracy += tmp_test_accuracy nb_test_examples += input_ids.size(0) nb_test_steps += 1 test_loss = test_loss / nb_test_steps test_accuracy = test_accuracy / nb_test_examples viz.line([test_accuracy], [nb_test_steps - 1], win='test_acc', update='append') if test_accuracy > best_accuracy: best_accuracy = test_accuracy best_epoch = epoch torch.save(model.state_dict(), os.path.join(args.output_dir, "best_model.bin")) result = collections.OrderedDict() if args.do_eval: result = { 'epoch': epoch, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps, 'test_loss': test_loss, 'test_accuracy': test_accuracy } else: result = { 'epoch': epoch, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps } logger.info("***** Eval results *****") with open(output_log_file, "a+") as writer: for key in result.keys(): logger.info(" %s = %s\n", key, str(result[key])) writer.write("%s\t" % (str(result[key]))) writer.write("\n") print("The best Epoch is: ", best_epoch) print("The best test_accuracy is: ", best_accuracy)
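# A minimal sketch (assumed helper, not in the original) of the discriminative
# fine-tuning schedule from the commented-out block above (--discr): each of
# the 12 encoder layers gets its own learning rate, decayed geometrically so
# that lower layers are updated more conservatively.
def layer_learning_rates(base_lr, decay=0.95, num_layers=12):
    # the top layer (index 11) keeps base_lr; layer 0 gets base_lr * decay**11
    return {'layer.{}.'.format(i): base_lr * decay ** (num_layers - 1 - i)
            for i in range(num_layers)}

# e.g. layer_learning_rates(2e-5, 0.95)['layer.0.'] is roughly 1.1e-5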
def main(): args = get_train_arguments() device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") num_gpu = torch.cuda.device_count() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if num_gpu > 0: torch.cuda.manual_seed_all(args.seed) bert_config = BertConfig.from_json_file(args.bert_config_file) tokenizer = FullTokenizer( vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) processor = None if args.task_name == "sentihood_NLI_M": processor = Sentihood_NLI_M_Processor() elif args.task_name == "sentihood_NLI_B": processor = Sentihood_NLI_B_Processor elif args.task_name == "sentihood_single": processor = Sentihood_single_Processor() elif args.task_name == "sentihood_QA_B": processor = Sentihood_QA_B_Processor() elif args.task_name == "sentihood_QA_M": processor = Sentihood_QA_M_Processor() else: raise ValueError("Unimplemented task!") if not os.path.exists(args.output_dir): print('make output directory {}'.format(args.output_dir)) os.makedirs(args.output_dir) labels = processor.get_labels() # training set if not os.path.exists(args.data_dir): raise ValueError("Data does not exist") train_examples = processor.get_train_examples(args.data_dir) divide = args.train_batch_size * args.num_train_epochs num_train_steps = int(len(train_examples) / divide) train_features = get_features( train_examples, labels, args.max_seq_length, tokenizer) label_ids, input_ids, seg_ids, input_mks = get_input_tensor( train_features) train_dataset = TensorDataset(input_ids, input_mks, seg_ids, label_ids) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) # test set if args.eval_test: test_examples = processor.get_test_examples(args.data_dir) test_features = get_features( test_examples, labels, args.max_seq_length, tokenizer) label_ids, input_ids, seg_ids, input_mks = get_input_tensor( test_features) test_data = TensorDataset( input_ids, input_mks, seg_ids, label_ids) test_dataloader = DataLoader( test_data, batch_size=args.eval_batch_size, shuffle=False) # model and optimizer model = BertForSequenceClassification(bert_config, len(labels)) # load the pretrained parameters model.bert.load_state_dict(torch.load( args.init_checkpoint, map_location='cpu')) model.to(device) if num_gpu > 1: model = torch.nn.DataParallel(model) ##########################continue here######################################### with open('{}/log.txt'.format(args.output_dir), "w") as f: title = "epoch global_step loss\ttest_loss test_accuracy" if args.eval_test else "epoch global_step loss " f.write(title) f.write("\n") no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [ {'params': [p for n, p in model.named_parameters() if not any( nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in model.named_parameters() if any( nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] # train optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 epoch = 0 for ti in trange(int(args.num_train_epochs), desc="Epoch"): nb_tr_examples, nb_tr_steps, tr_loss = 0, 0, 0 epoch += 1 model.train() for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): # batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch input_ids, input_mask, segment_ids, label_ids = input_ids.to( device), input_mask.to(device), segment_ids.to(device), 
label_ids.to(device) loss, _ = model(input_ids, segment_ids, input_mask, label_ids) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if num_gpu > 1: loss = loss.mean() loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 optimizer.step() model.zero_grad() global_step += 1 # eval_test if args.eval_test: model.eval() test_loss, test_accuracy, nb_test_steps, nb_test_examples = 0, 0, 0, 0 fname = "{}/test_ep_{}.txt".format(args.output_dir, epoch) with open(fname, "w") as ftname: for batch in test_dataloader: input_ids, input_mask, segment_ids, label_ids = batch input_ids, input_mask, segment_ids, label_ids = input_ids.to(device), input_mask.to(device), segment_ids.to(device), label_ids.to(device) with torch.no_grad(): tmp_test_loss, logits = model( input_ids, segment_ids, input_mask, label_ids) label_ids = label_ids.to('cpu').numpy() logits = F.softmax(logits, dim=-1).detach().cpu().numpy() outputs = np.argmax(logits, axis=1) for o_i in range(len(outputs)): ftname.write(str(outputs[o_i])) for ou in logits[o_i]: ftname.write(" " + str(ou)) ftname.write("\n") test_accuracy += np.sum(outputs == label_ids) test_loss += tmp_test_loss.mean().item() nb_test_examples += input_ids.size(0) nb_test_steps += 1 test_loss = test_loss / nb_test_steps test_accuracy = test_accuracy / nb_test_examples eval_str = "{} {} {} {} {}\n".format(epoch, global_step, tr_loss / nb_tr_steps, test_loss, test_accuracy) train_str = "{} {} {}\n".format( epoch, global_step, tr_loss / nb_tr_steps) row = eval_str if args.eval_test else train_str with open('{}/log.txt'.format(args.output_dir), "a+") as f: f.write(row)
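# A minimal sketch (assumed helper, not in the original) showing how the
# per-epoch prediction files written above (test_ep_<epoch>.txt: predicted
# label followed by one softmax probability per class, space separated) can be
# read back for later analysis.
def read_predictions(path):
    preds, probs = [], []
    with open(path) as f:
        for line in f:
            fields = line.split()
            preds.append(int(fields[0]))
            probs.append([float(x) for x in fields[1:]])
    return preds, probs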
def main(): bert_path = bp.bert_path ## parameters do_lower_case = False do_train = True do_eval = True data_dir = '.' max_seq_length = 128 train_batch_size = 8 eval_batch_size = 8 learning_rate = 2e-5 num_train_epochs = 8.0 seed = 42 output_dir = 'dream_finetuned' no_cuda = False bert_config_file = bert_path + 'bert_config.json' vocab_file = bert_path + 'vocab.txt' init_checkpoint = bert_path + 'pytorch_model.bin' gradient_accumulation_steps = 3 warmup_proportion = 0.1 local_rank = -1 # used to specify the GPU processors = { "dream": dreamProcessor, } if local_rank == -1 or no_cuda: device = torch.device( "cuda" if torch.cuda.is_available() and not no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(local_rank != -1)) if gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(gradient_accumulation_steps)) train_batch_size = int(train_batch_size / gradient_accumulation_steps) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) if n_gpu > 0: torch.cuda.manual_seed_all(seed) if not do_train and not do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_json_file(bert_config_file) if max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(output_dir) and os.listdir(output_dir): if do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".
format(output_dir)) else: os.makedirs(output_dir, exist_ok=True) processor = processors['dream']() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case) train_examples = None num_train_steps = None if do_train: train_examples = processor.get_train_examples(data_dir) num_train_steps = int( len(train_examples) / n_class / train_batch_size / gradient_accumulation_steps * num_train_epochs) model = BertForSequenceClassification( bert_config, 1 if n_class > 1 else len(label_list)) if init_checkpoint is not None: model.bert.load_state_dict( torch.load(init_checkpoint, map_location='cpu')) model.to(device) if local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [{ 'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_parameters, lr=learning_rate, warmup=warmup_proportion, t_total=num_train_steps) global_step = 0 if do_train: train_features = convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", train_batch_size) logger.info(" Num steps = %d", num_train_steps) input_ids = [] input_mask = [] segment_ids = [] label_id = [] for f in train_features: input_ids.append([]) input_mask.append([]) segment_ids.append([]) for i in range(n_class): input_ids[-1].append(f[i].input_ids) input_mask[-1].append(f[i].input_mask) segment_ids[-1].append(f[i].segment_ids) label_id.append([f[0].label_id]) all_input_ids = torch.tensor(input_ids, dtype=torch.long) all_input_mask = torch.tensor(input_mask, dtype=torch.long) all_segment_ids = torch.tensor(segment_ids, dtype=torch.long) all_label_ids = torch.tensor(label_id, dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size) model.train() for _ in trange(int(num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, label_ids, n_class) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients model.zero_grad() global_step += 1 torch.save(model.state_dict(), os.path.join(output_dir, "model.pt")) model.load_state_dict(torch.load(os.path.join(output_dir, "model.pt"))) if do_eval: eval_examples = processor.get_dev_examples(data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", eval_batch_size) input_ids = [] input_mask = [] segment_ids = [] label_id = [] for f in eval_features: input_ids.append([]) input_mask.append([]) segment_ids.append([]) for i in range(n_class): input_ids[-1].append(f[i].input_ids) input_mask[-1].append(f[i].input_mask) segment_ids[-1].append(f[i].segment_ids) label_id.append([f[0].label_id]) all_input_ids = torch.tensor(input_ids, dtype=torch.long) all_input_mask = torch.tensor(input_mask, dtype=torch.long) all_segment_ids = torch.tensor(segment_ids, dtype=torch.long) all_label_ids = torch.tensor(label_id, dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 logits_all = [] for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, n_class) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() for i in range(len(logits)): logits_all += [logits[i]] tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1)) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples if do_train: result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps } else: result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy} output_eval_file = os.path.join(output_dir, "eval_results_dev.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) output_eval_file = os.path.join(output_dir, "logits_dev.txt") with open(output_eval_file, "w") as f: for i in range(len(logits_all)): for j in range(len(logits_all[i])): f.write(str(logits_all[i][j])) if j == len(logits_all[i]) - 1: f.write("\n") else: f.write(" ") eval_examples = processor.get_test_examples(data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", eval_batch_size) input_ids = [] input_mask = [] 
segment_ids = [] label_id = [] for f in eval_features: input_ids.append([]) input_mask.append([]) segment_ids.append([]) for i in range(n_class): input_ids[-1].append(f[i].input_ids) input_mask[-1].append(f[i].input_mask) segment_ids[-1].append(f[i].segment_ids) label_id.append([f[0].label_id]) all_input_ids = torch.tensor(input_ids, dtype=torch.long) all_input_mask = torch.tensor(input_mask, dtype=torch.long) all_segment_ids = torch.tensor(segment_ids, dtype=torch.long) all_label_ids = torch.tensor(label_id, dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 logits_all = [] for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, n_class) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() for i in range(len(logits)): logits_all += [logits[i]] tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1)) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples if do_train: result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps } else: result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy} output_eval_file = os.path.join(output_dir, "eval_results_test.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) output_eval_file = os.path.join(output_dir, "logits_test.txt") with open(output_eval_file, "w") as f: for i in range(len(logits_all)): for j in range(len(logits_all[i])): f.write(str(logits_all[i][j])) if j == len(logits_all[i]) - 1: f.write("\n") else: f.write(" ")
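# A minimal sketch (assumed helper, not in the original) of the
# multiple-choice tensor layout assembled above for DREAM: each example
# contributes n_class tokenized sequences (one per candidate answer), so the
# tensors have shape [num_examples, n_class, max_seq_length] and the label
# holds the index of the correct candidate. `features` is the per-example
# list of n_class InputFeatures produced by convert_examples_to_features.
import torch

def build_choice_tensors(features, n_class):
    all_input_ids = torch.tensor([[f[i].input_ids for i in range(n_class)] for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([[f[i].input_mask for i in range(n_class)] for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([[f[i].segment_ids for i in range(n_class)] for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([[f[0].label_id] for f in features], dtype=torch.long)
    return all_input_ids, all_input_mask, all_segment_ids, all_label_ids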
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument("--do_lower_case", default=False, action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumualte before performing a backward/update pass.") parser.add_argument('--optimize_on_cpu', default=False, action='store_true', help="Whether to perform optimization and keep the optimizer averages on CPU") parser.add_argument('--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=128, help='Loss scaling, positive power of 2 values can improve fp16 convergence.') args = parser.parse_args() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "news": NewsProcessor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs # torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info("16-bits training currently not supported in distributed training") args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format( args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() tokenizer = tokenization.FullTokenizer( vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) label_list = processor.get_labels() print("label_list.size:%d\n" %(len(label_list))) # Prepare model model = BertForSequenceClassification(bert_config, len(label_list)) if 
args.init_checkpoint is not None: model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')) if args.fp16: model.half() model.to(device) #if args.local_rank != -1: #model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], # output_device=args.local_rank) #elif n_gpu > 1: # model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if n in no_decay], 'weight_decay_rate': 0.0} ] optimizer = BERTAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = RandomSampler(train_data) #train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling") args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model(model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 if args.do_eval: eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = SequentialSampler(eval_data) #eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss/nb_tr_steps} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
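# Note on the grouped weight decay above: `n not in no_decay` tests whether the
# full parameter name (e.g. 'bert.encoder.layer.0.output.dense.bias') equals one
# of the three literal strings, which is never the case, so every parameter ends
# up in the 0.01-decay group. A minimal sketch of substring-based grouping that
# actually exempts bias/LayerNorm (gamma, beta) parameters:
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0},
]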
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_config_file", default=None, type=str, required=True, help= "The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument( "--do_lower_case", default=False, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", default=False, action='store_true', help="Whether to run prediction on the test set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=128, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' 
) args = parser.parse_args() processors = { "dlcompetition": DLCompetitionProcessor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs # torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_test: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) label_list = processor.get_labels() print("label_list.size:%d\n" % (len(label_list))) # Prepare model model = BertForSequenceClassification(bert_config, len(label_list)) if args.init_checkpoint is not None: model.bert.load_state_dict( torch.load(args.init_checkpoint, map_location='cpu')) if args.fp16: model.half() model.to(device) # if args.local_rank != -1: # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], # output_device=args.local_rank) # elif n_gpu > 1: # model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if n in no_decay], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, 
t_total=num_train_steps) global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = RandomSampler(train_data) # train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 if args.do_eval: eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = SequentialSampler(eval_data) # eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() 
eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if args.do_test: # prediction test_examples = processor.get_test_examples(args.data_dir) test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running prediction *****") logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids) if args.local_rank == -1: test_sampler = SequentialSampler(test_data) else: test_sampler = SequentialSampler(test_data) # test_sampler = DistributedSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) model.eval() answer = [] for input_ids, input_mask, segment_ids in test_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().tolist() answer.extend(logits) output_test_prediction = os.path.join( args.output_dir, time.strftime("%Y%m%d-%H%M%S") + "submission.csv") with open(output_test_prediction, "w") as writer: logger.info("Writing prediction") writer.write("id,label\n") for i, ans in answer: writer.write("%d,%d\n" % (i, ans))
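# In the prediction loop above, `answer` accumulates one logits list per test
# example, so `for i, ans in answer` tries to unpack a logits list into (i, ans).
# A minimal sketch of the submission-writing step, assuming the file expects the
# example index and the argmax class label:
import numpy as np

with open(output_test_prediction, "w") as writer:
    writer.write("id,label\n")
    for i, logits in enumerate(answer):
        pred = int(np.argmax(logits))  # predicted class index
        writer.write("%d,%d\n" % (i, pred))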
def main(): os.environ['CUDA_VISIBLE_DEVICES'] = '0' os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' root_path = r'./' bert_path = './chinese_L-12_H-768_A-12' flags = tf.flags flags.DEFINE_string("data_dir", os.path.join(root_path, 'data'), "The input datadir.", ) flags.DEFINE_string("bert_config_file", os.path.join(bert_path, 'bert_config.json'), "The config json file corresponding to the pre-trained BERT model.") flags.DEFINE_string("task_name", 'ner', "The name of the task to train.") flags.DEFINE_string("vocab_file", os.path.join(bert_path, 'vocab.txt'), "The vocabulary file that the BERT model was trained on.") flags.DEFINE_string("output_dir", os.path.join(root_path, 'model'), "The output directory where the model checkpoints will be written.") ## Other parameters flags.DEFINE_string("init_checkpoint", os.path.join(bert_path, 'pytorch_model.bin'), "Initial checkpoint (usually from a pre-trained BERT model).") flags.DEFINE_bool("do_lower_case", True, "Whether to lower case the input text.") flags.DEFINE_integer("max_seq_length", 48, "The maximum total input sequence length after WordPiece tokenization.") # flags.DEFINE_boolean('clean', True, 'remove the files which created by last training') flags.DEFINE_bool("do_train", True, "Whether to run training.") flags.DEFINE_bool("do_eval", True, "Whether to run eval on the dev set.") flags.DEFINE_bool("no_cuda", False, "Whether not to use CUDA when available") # flags.DEFINE_bool("do_predict", True, "Whether to run the model in inference mode on the test set.") flags.DEFINE_integer("train_batch_size", 64, "Total batch size for training.") flags.DEFINE_integer("eval_batch_size", 64, "Total batch size for eval.") # flags.DEFINE_integer("predict_batch_size", 4, "Total batch size for predict.") flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") flags.DEFINE_integer("save_model_epoch", 1, "save model ") flags.DEFINE_float("num_train_epochs", 15.0, "Total number of training epochs to perform.") flags.DEFINE_float('droupout_rate', 0.5, 'Dropout rate') flags.DEFINE_float('clip', 5, 'Gradient clip') flags.DEFINE_float("warmup_proportion", 0.1, "Proportion of training to perform linear learning rate warmup for. 
""E.g., 0.1 = 10% of training.") flags.DEFINE_integer("save_checkpoints_steps", 50, "How often to save the model checkpoint.") flags.DEFINE_integer("iterations_per_loop", 50, "How many steps to make in each estimator call.") flags.DEFINE_integer("local_rank", -1, "local_rank for distributed training on gpus") flags.DEFINE_integer("seed", 1, "random seed for initialization") flags.DEFINE_integer("gradient_accumulation_steps", 1, "Number of updates steps to accumualte before performing a backward/update pass.") flags.DEFINE_bool("optimize_on_cpu", False, "Whether to perform optimization and keep the optimizer averages on CPU") flags.DEFINE_bool("fp16", False, "Whether to use 16-bit float precision instead of 32-bit") flags.DEFINE_float('loss_scale', 128.0, 'Loss scaling, positive power of 2 values can improve fp16 convergence.') args = flags.FLAGS if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs # torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info("16-bits training currently not supported in distributed training") args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format( args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() processor = NewsProcessor() tokenizer = tokenization.FullTokenizer( vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) if os.path.exists('./pkl/label_list.pkl'): label_list = load_pkl('label_list.pkl') if args.do_train and os.path.exists('./pkl/label_list.pkl') == False: label_list = processor.get_labels() save_pkl('label_list.pkl', label_list) print("label_list.size:%d\n" % (len(label_list))) # Prepare model model = BertForSequenceClassification(bert_config, len(label_list)) if args.init_checkpoint is not None: model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')) if args.fp16: model.half() model.to(device) # if args.local_rank != -1: # model = torch.nn.parallel.DistributedDataParallel(model, 
device_ids=[args.local_rank], # output_device=args.local_rank) # elif n_gpu > 1: # model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if n in no_decay], 'weight_decay_rate': 0.0} ] optimizer = BERTAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 if args.do_train: def get_dev_result(): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_accuracy = eval_accuracy / nb_eval_examples return eval_accuracy,eval_loss train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = RandomSampler(train_data) # train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) 
model.train() epoched = 1 for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling") args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model(model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 acc,loss = get_dev_result() acc = round(acc,4) def write_file(filename, str): """ 写入文件 :param str: 字符串 :return: 无 """ writefile = open("./eval/" + filename, 'a+', encoding='utf-8') writefile.write(str + '\n') writefile.close() write_file('log.txt',str(epoched)+' acc: --> '+str(acc)) write_file('log.txt', str(epoched) + ' loss: --> ' + str(loss)) write_file('log.txt', '===========================') print('dev acc:', acc) print('loss',loss) if int(epoched)>2: if int(epoched) % int(args.save_model_epoch) == 0: # torch.save(model.state_dict(), args.output_dir + '/model_{}.pkl'.format(str(epoched))) torch.save(model, args.output_dir + '/model_{}.pkl'.format(str(epoched) + '_' + str(acc))) if int(epoched) == int(args.num_train_epochs): torch.save(model, args.output_dir + '/model_end.pkl') print('test acc:', acc) print('dev', loss) epoched += 1 if args.do_eval: eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) if args.do_train == False: # model.load_state_dict(torch.load('./model/model_1.pkl', map_location='cpu')) model = torch.load('./model/model_end.pkl') model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = 
input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, } output_eval_file = os.path.join("./eval", "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
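# The eval loops in these scripts rely on an accuracy() helper that is not shown
# here. Because its return value is summed per batch and then divided by
# nb_eval_examples, it must return the count of correct predictions rather than
# a ratio. A minimal sketch consistent with that usage (an assumption, not the
# original helper):
import numpy as np

def accuracy(logits, labels):
    # logits: (batch_size, num_labels) array; labels: (batch_size,) array
    preds = np.argmax(logits, axis=1)
    return int(np.sum(preds == labels))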
def main(args): processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "news": NewsProcessor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs # torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") # bert_config = BertConfig.from_json_file(args.bert_config_file) bert_config = BertConfig(args.vocab_size) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) label_list = processor.get_labels() print("label_list.size:%d\n" % (len(label_list))) # Prepare model model = BertForSequenceClassification(bert_config, len(label_list)) if args.init_checkpoint is not None: model.bert.load_state_dict( torch.load(args.init_checkpoint, map_location='cpu')) if args.fp16: model.half() model.to(device) # if args.local_rank != -1: # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], # output_device=args.local_rank) # elif n_gpu > 1: # model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if n in no_decay], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_grouped_parameters, 
lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = RandomSampler(train_data) # train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() ### 状态设置 for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 if args.do_eval: eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = SequentialSampler(eval_data) # eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, 
sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) torch.save(model.state_dict(), os.path.join(args.output_dir, 'bert_' + args.task_name + '.pt'))
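# Usage sketch for the checkpoint saved above: rebuild the classifier with the
# same config and label count, then load the saved state_dict for inference.
# Names (bert_config, label_list, device) are assumed to match the training run.
import os
import torch

save_path = os.path.join(args.output_dir, 'bert_' + args.task_name + '.pt')
model = BertForSequenceClassification(bert_config, len(label_list))
model.load_state_dict(torch.load(save_path, map_location='cpu'))
model.to(device)
model.eval()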
def main(): parser = argparse.ArgumentParser() BERT_DIR = "uncased_L-12_H-768_A-12/" ## Required parameters parser.add_argument("--bert_config_file", default=BERT_DIR+"bert_config.json", \ type=str, help="The config json file corresponding to the pre-trained BERT model. " "This specifies the model architecture.") parser.add_argument("--vocab_file", default=BERT_DIR+"vocab.txt", type=str, \ help="The vocabulary file that the BERT model was trained on.") parser.add_argument("--output_dir", default="out", type=str, \ help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--load", default=False, action='store_true') parser.add_argument("--train_file", type=str, \ help="SQuAD json for training. E.g., train-v1.1.json", \ default="/home/sewon/data/squad/train-v1.1.json") parser.add_argument("--predict_file", type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json", \ default="/home/sewon/data/squad/dev-v1.1.json") parser.add_argument("--init_checkpoint", type=str, help="Initial checkpoint (usually from a pre-trained BERT model).", \ default=BERT_DIR+"pytorch_model.bin") parser.add_argument( "--do_lower_case", default=True, action='store_true', help="Whether to lower case the input text. Should be True for uncased " "models and False for cased models.") parser.add_argument( "--max_seq_length", default=300, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=39, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=300, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=1000.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--iterations_per_loop", default=1000, type=int, help="How many steps to make in each estimator call.") parser.add_argument( "--n_best_size", default=3, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. 
" "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( "--accumulate_gradients", type=int, default=1, help= "Number of steps to accumulate gradient on (divide the batch_size and accumulate)" ) parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument('--eval_period', type=int, default=500) parser.add_argument('--max_n_answers', type=int, default=20) parser.add_argument('--n_paragraphs', type=str, default='40') parser.add_argument('--verbose', action="store_true", default=False) parser.add_argument('--wait_step', type=int, default=12) # Learning method variation parser.add_argument('--loss_type', type=str, default="mml") parser.add_argument('--tau', type=float, default=12000.0) # For evaluation parser.add_argument('--prefix', type=str, default="") #500 parser.add_argument('--debug', action="store_true", default=False) args = parser.parse_args() if os.path.exists(args.output_dir) and os.listdir(args.output_dir): print("Output directory () already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir, exist_ok=True) logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO, handlers=[ logging.FileHandler(os.path.join(args.output_dir, "log.txt")), logging.StreamHandler() ]) logger = logging.getLogger(__name__) logger.info(args) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.accumulate_gradients < 1: raise ValueError( "Invalid accumulate_gradients parameter: {}, should be >= 1". format(args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if not args.predict_file: raise ValueError( "If `do_train` is True, then `predict_file` must be specified." ) if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
) bert_config = BertConfig.from_json_file(args.bert_config_file) if args.do_train and args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (args.max_seq_length, bert_config.max_position_embeddings)) model = BertForQuestionAnswering(bert_config, device, 4, loss_type=args.loss_type, tau=args.tau) metric_name = "EM" tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None train_split = ',' in args.train_file if train_split: n_train_files = len(args.train_file.split(',')) eval_dataloader, eval_examples, eval_features, _ = get_dataloader( logger=logger, args=args, input_file=args.predict_file, is_training=False, batch_size=args.predict_batch_size, num_epochs=1, tokenizer=tokenizer) if args.do_train: train_file = args.train_file if train_split: train_file = args.train_file.split(',')[0] train_dataloader, _, _, num_train_steps = get_dataloader( logger=logger, args=args, \ input_file=train_file, \ is_training=True, batch_size=args.train_batch_size, num_epochs=args.num_train_epochs, tokenizer=tokenizer) if args.init_checkpoint is not None: logger.info("Loading from {}".format(args.init_checkpoint)) state_dict = torch.load(args.init_checkpoint, map_location='cpu') if args.do_train and args.init_checkpoint.endswith( 'pytorch_model.bin'): model.bert.load_state_dict(state_dict) else: filter = lambda x: x[7:] if x.startswith('module.') else x state_dict = {filter(k): v for (k, v) in state_dict.items()} model.load_state_dict(state_dict) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [{ 'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 best_f1 = (-1, -1) wait_step = 0 model.train() global_step = 0 stop_training = False train_losses = [] for epoch in range(int(args.num_train_epochs)): if epoch > 0 and train_split: train_file = args.train_file.split(',')[epoch % n_train_files] train_dataloader = get_dataloader( logger=logger, args=args, \ input_file=train_file, \ is_training=True, batch_size=args.train_batch_size, num_epochs=args.num_train_epochs, tokenizer=tokenizer)[0] for step, batch in enumerate(train_dataloader): global_step += 1 batch = [t.to(device) for t in batch] loss = model(batch, global_step) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps train_losses.append(loss.detach().cpu()) loss.backward() if global_step % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enough gradients model.zero_grad() if global_step % args.eval_period == 0: model.eval() f1 = predict(logger, args, model, eval_dataloader, eval_examples, eval_features, \ device, write_prediction=False) logger.info( "Step %d Train loss %.2f EM %.2f F1 %.2f on epoch=%d" % (global_step, np.mean(train_losses), f1[0] * 100, f1[1] * 100, epoch)) train_losses = [] if best_f1 < f1: logger.info("Saving model with best %s: %.2f (F1 %.2f) -> %.2f (F1 %.2f) on epoch=%d" % \ (metric_name, best_f1[0]*100, best_f1[1]*100, f1[0]*100, f1[1]*100, epoch)) model_state_dict = { k: v.cpu() for (k, v) in model.state_dict().items() } torch.save( model_state_dict, os.path.join(args.output_dir, "best-model.pt")) model = model.to(device) best_f1 = f1 wait_step = 0 stop_training = False else: wait_step += 1 if wait_step == args.wait_step: stop_training = True model.train() if stop_training: break logger.info("Training finished!") elif args.do_predict: if type(model) == list: model = [m.eval() for m in model] else: model.eval() f1 = predict(logger, args, model, eval_dataloader, eval_examples, eval_features, device, varying_n_paragraphs=len(args.n_paragraphs) > 1)
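# The training loops in this file all use the same gradient-accumulation
# pattern; a minimal sketch of that pattern in isolation. `compute_loss` is a
# hypothetical stand-in for the model's forward pass.
accumulation_steps = args.gradient_accumulation_steps
model.zero_grad()
for step, batch in enumerate(train_dataloader):
    loss = compute_loss(model, batch)
    (loss / accumulation_steps).backward()  # gradients sum across micro-batches
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()  # one update per effective batch
        model.zero_grad()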
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--bert_config_file", default='model_repo/uncased_L-12_H-768_A-12/bert_config.json', type=str, help="The config json file corresponding to the pre-trained BERT model. " "This specifies the model architecture.") parser.add_argument("--vocab_file", default='model_repo/uncased_L-12_H-768_A-12/vocab.txt', type=str, help="The vocabulary file that the BERT model was trained on.") parser.add_argument("--output_dir", default='output', type=str, help="The output directory where the model checkpoints will be written.") parser.add_argument("--processed_data", default='processed', type=str, help="The output directory where the model checkpoints will be written.") parser.add_argument("--do_train", default=True, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=True, action='store_true', help="Whether to run eval on the dev set.") ## Other parameters parser.add_argument("--train_file", default='BioASQ-train-factoid-4b.json', type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument("--predict_file", default='ASQdev.json', type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument("--init_checkpoint", default='model_repo/uncased_L-12_H-768_A-12/pytorch_model.bin', type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument("--finetuned_checkpoint", default='ft_dir/ft_model.bin', type=str, help="finetuned checkpoint (usually from a pre-trained BERT model).") parser.add_argument("--do_lower_case", default=True, action='store_true', help="Whether to lower case the input text. Should be True for uncased " "models and False for cased models.") parser.add_argument("--max_seq_length", default=500, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--doc_stride", default=128, type=int, help="When splitting up a long document into chunks, how much stride to take between chunks.") parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--max_answer_length", default=500, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=5.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. 
E.g., 0.1 = 10% " "of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--iterations_per_loop", default=1000, type=int, help="How many steps to make in each estimator call.") parser.add_argument("--n_best_size", default=20, type=int, help="The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument("--verbose_logging", default=False, action='store_true', help="If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--accumulate_gradients", type=int, default=1, help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumualte before performing a backward/update pass.") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.accumulate_gradients < 1: raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format( args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError("At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified.") bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) if not os.path.exists(args.finetuned_checkpoint): os.makedirs(args.finetuned_checkpoint, exist_ok=True) tokenizer = tokenization.FullTokenizer( vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) num_train_steps = None if args.do_train: logger.info('Load and process train examples') if os.path.exists(os.path.join(args.processed_data, 'processed_train.pkl')): with open(os.path.join(args.processed_data, 'processed_train.pkl'), 'rb') as f: train_features, train_examples, num_train_steps = pickle.load(f) else: 
train_examples = read_squad_examples( input_file=args.train_file, tokenizer=tokenizer, is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) train_path = os.path.join(args.processed_data, 'processed_train.pkl') with open(train_path, 'wb') as f: pickle.dump([train_features, train_examples, num_train_steps], f) if args.do_predict: logger.info('Load and process dev examples') if os.path.exists(os.path.join(args.processed_data, 'processed_dev.pkl')): with open(os.path.join(args.processed_data, 'processed_dev.pkl'), 'rb') as f: eval_features, eval_examples = pickle.load(f) else: eval_examples = read_squad_examples( input_file=args.predict_file, tokenizer=tokenizer, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) eval_path = os.path.join(args.processed_data, 'processed_dev.pkl') with open(eval_path, 'wb') as f: pickle.dump([eval_features, eval_examples], f) model = BertForQuestionAnswering(bert_config) if args.do_train and args.init_checkpoint is not None: logger.info('Loading init checkpoint') model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')) logger.info('Loaded init checkpoint') elif args.do_predict: logger.info('Loading fine-tuned checkpoint') state_dict = torch.load(args.finetuned_checkpoint, map_location='cpu') new_state_dict = collections.OrderedDict() for key, value in state_dict.items(): new_state_dict[key[7:]] = value model.load_state_dict(new_state_dict) del state_dict del new_state_dict logger.info('Loaded fine-tuned checkpoint') model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [ {'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01}, {'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0} ] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, 
batch_size=args.train_batch_size) model.train() best_dev_score = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enough gradients model.zero_grad() global_step += 1 if (step + 1) % args.save_checkpoints_steps == 0: best_dev_score = run_evaluate(args, model, eval_features, device, eval_examples, tokenizer, best_dev_score) logger.info('Best dev score {} at step {}'.format(best_dev_score, step)) if args.do_predict: run_evaluate(args, model, eval_features, device, eval_examples, tokenizer, best_dev_score)
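# Note on the optimizer parameter groups built above (and in several of the scripts below):
# the condition `if n not in no_decay` compares the *full* parameter name (e.g.
# "encoder.layer.0.attention.self.query.bias") against the short strings 'bias'/'gamma'/'beta',
# so it never matches and every parameter lands in the weight-decay group. The conventional
# form, also used by the multi-task script later in this file, is a substring test. A minimal
# sketch with a hypothetical helper name (not part of the original code):
import torch

def group_parameters_for_decay(model: torch.nn.Module, no_decay=('bias', 'gamma', 'beta')):
    """Split trainable parameters into a decayed and an undecayed group by name substring."""
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen parameters need no optimizer group
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    return [
        {'params': decay_params, 'weight_decay_rate': 0.01},
        {'params': no_decay_params, 'weight_decay_rate': 0.0},
    ]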
def main(): BERT_BASE_DIR='../data/weights/cased_L-12_H-768_A-12' parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_config_file", default="{}/bert_config.json".format(BERT_BASE_DIR), type=str, required=True, help="The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument("--task_name", default="lm", type=str, required=True, help="The name of the task to train.") parser.add_argument("--vocab_file", default="{}/vocab.txt".format(BERT_BASE_DIR), type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--init_checkpoint", default="{}/pytorch_model.bin".format(BERT_BASE_DIR), type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument("--do_lower_case", default=False, action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=True, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=True, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=10, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=2, help="Number of update steps to accumulate before performing a backward/update pass.") parser.add_argument('--MBTI', type=int, default=1, help="Number of update steps to accumulate before performing a backward/update pass.") args = parser.parse_args() ## Initialization if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format( args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): print("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) save_path = os.path.join(args.output_dir, 'state_dict.pkl') print("save_path:", save_path) ## Load Data tokenizer = tokenization.FullTokenizer( vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) decoder = {v:k for k,v in tokenizer.wordpiece_tokenizer.vocab.items()} processors = { "lm": LMProcessor, } task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name](tokenizer=tokenizer) label_list = processor.get_labels() train_examples = processor.get_train_examples(args.data_dir, skip=30, tqdm=tqdm) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) train_features = convert_tokens_to_features( train_examples, label_list, args.max_seq_length, tokenizer, tqdm=tqdm) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_label_weights = torch.tensor([f.label_weights for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_label_weights) if 
args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) ## Load Model model = BertForMaskedLanguageModelling(bert_config) if args.init_checkpoint is not None: model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')) if os.path.isfile(save_path): model.load_state_dict(torch.load(save_path, map_location='cpu')) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) print(model) ## Initialize Optimizer no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [ {'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01}, {'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0} ] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) ## Training val_test="""The next day I was somewhat somnolent, of which you may be sure Miss Frankland took no notice. She retired to her own room when we went for our recreation. My friends scolded me for not coming to them the previous night, but I told them that my parents had continued to move about her room for so long a time that I had fallen fast asleep, and even then had not had enough, as they might have observed how sleepy I had been all day.""" global_step = 0 model.train() for _ in tqdm(range(int(args.num_train_epochs)), desc="Epoch"): tr_loss, nb_tr_examples, nb_tr_steps = 0, 0, 0 with tqdm(total=len(train_dataloader), desc='Iteration', mininterval=60) as prog: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, label_weights = batch loss, logits = model(input_ids, segment_ids, input_mask, label_ids, label_weights) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enougth gradients model.zero_grad() prog.update(1) prog.desc = 'Iter. loss={:2.6f}'.format(tr_loss/nb_tr_examples) if step%3000==10: print('step', step, 'loss', tr_loss/nb_tr_examples) display(predict_masked_words(val_test, processor, tokenizer, model, device=device, max_seq_length=args.max_seq_length)) display(predict_next_words(val_test, processor, tokenizer, model, max_seq_length=args.max_seq_length, n=10, device=device)) tr_loss, nb_tr_examples, nb_tr_steps = 0, 0, 0 torch.save(model.state_dict(), save_path) global_step += 1 ## Save torch.save(model.state_dict(), save_path) ## Test display(predict_next_words(val_test, processor, tokenizer, model, max_seq_length=args.max_seq_length, n=10, device=device, debug=False))
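# The `warmup` / `t_total` arguments passed to BERTAdam in these scripts implement a linear
# learning-rate warmup followed by a linear decay. A minimal sketch of that schedule, assuming
# the behaviour of the reference BERT optimizer (the BERTAdam in this repo may differ);
# `warmup_linear_lr` is a hypothetical helper name, not part of the original code:
def warmup_linear_lr(base_lr: float, global_step: int, t_total: int, warmup: float = 0.1) -> float:
    """Ramp the LR linearly for the first `warmup` fraction of training, then decay linearly to 0."""
    progress = global_step / max(1, t_total)
    if progress < warmup:
        return base_lr * progress / warmup
    return base_lr * max(0.0, 1.0 - progress)

# Example: with t_total = 1000 steps and warmup = 0.1, the LR peaks at step 100 and reaches 0 at step 1000.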
def train(args): processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, 'sst': SstProcessor, 'polarity': PolarityProcessor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.accumulate_gradients < 1: raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format( args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_test: raise ValueError("At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format( args.max_seq_length, bert_config.max_position_embeddings)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer( vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) model = BertForSequenceClassification(bert_config, len(label_list)) if args.do_test: model.load_state_dict(torch.load(os.path.join(args.data_dir, "model_best.pt"))) model.to(device) model.eval() if args.do_eval: eval_example = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_example, label_list, args.max_seq_length, tokenizer) logger.info("***** Running eval *****") logger.info(" Num examples = %d", len(eval_example)) logger.info(" Batch size = %d", args.eval_batch_size) _ = do_eval(model, device, eval_features, args) if args.do_predict: print(os.path.dirname(args.data_dir)) test_example = processor.get_test_examples(os.path.dirname(args.data_dir)) test_features = convert_examples_to_features( test_example, label_list, args.max_seq_length, tokenizer) logger.info("***** Running test *****") logger.info(" Num examples = %d", len(test_example)) logger.info(" Batch size = %d", args.eval_batch_size) test_logit_list, _ = do_eval(model, device, test_features, args) np.save(os.path.join(args.data_dir, 'oof_test'), np.asarray(test_logit_list)) return 0 if args.init_checkpoint is not None: model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [ {'params': [p for n, p in 
model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01}, {'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0} ] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) max_score = 0 for epoch in trange(int(args.num_train_epochs), desc="Epoch"): model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, (input_ids, input_mask, segment_ids, label_ids) in enumerate( tqdm(train_dataloader, desc="Iteration")): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) loss, _ = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients model.zero_grad() global_step += 1 if args.do_eval: eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 logit_list = [] labels_eval_list = [] for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() logit_list.extend(logits.tolist()) labels_eval_list.extend(label_ids.tolist()) tmp_eval_accuracy = accuracy(logits, label_ids) # _ = accuracy2(logits, label_ids) # _ = accuracy3(logits, label_ids) # _ = accuracy4(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 # print(epoch) # f1 = score2(logit_list, labels_eval_list) eval_loss = eval_loss / nb_eval_steps # len(eval_dataloader) eval_accuracy = eval_accuracy / nb_eval_examples # len(eval_dataloader) print("eval_loss", eval_loss) result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps, 'Epoch': epoch, 'Dir': args.data_dir } # 'loss': loss.item()} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if eval_accuracy > max_score: # best_logit_list = logit_list # best_eval_labels = labels_eval_list np.save(os.path.join(args.data_dir, 'oof_train'), np.asarray(logit_list)) np.save(os.path.join(args.data_dir, 'oof_train_y'), np.asarray(labels_eval_list)) torch.save(model.state_dict(), os.path.join(args.data_dir, "model_%.4f.pt" % eval_accuracy)) if os.path.exists(os.path.join(args.data_dir, "model_best.pt")): os.remove(os.path.join(args.data_dir, "model_best.pt")) torch.save(model.state_dict(), os.path.join(args.data_dir, "model_best.pt")) max_score = eval_accuracy
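# The evaluation loop above accumulates `accuracy(logits, label_ids)` per batch and then divides
# by the number of examples, so the helper presumably returns the count of correct predictions
# rather than a fraction. A minimal sketch under that assumption (logits and labels as numpy arrays):
import numpy as np

def accuracy(logits: np.ndarray, labels: np.ndarray) -> int:
    """Return the number of rows whose argmax over the logits matches the gold label."""
    preds = np.argmax(logits, axis=1)
    return int((preds == labels).sum())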
def main(): args = arguments.get_argparse("multiple_choice") logger.info(json.dumps(args.__dict__)) if args.eval_on_train and not args.log_spec: args.log_spec = "on_train" processors = { "race": dataset_processor.RaceProcessor, "mctest": dataset_processor.MCTestProcessor, "swag": dataset_processor.SwagProcessor, "squad": dataset_processor.SquadProcessor, "openbookqa": dataset_processor.OpenBookQAProcessor, "multirc": dataset_processor.MultiRCProcessor, "arc": dataset_processor.ARCProcessor, "qa4mre": dataset_processor.QA4MREProcessor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend # which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") if args.fp16: logger.info( """16-bits training currently not supported in distributed training""") # (see https://github.com/pytorch/pytorch/pull/13496) args.fp16 = False logger.info( "device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1), ) if args.gradient_accumulation_steps < 1: raise ValueError("""Invalid gradient_accumulation_steps parameter: {}, should be >= 1""".format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_test: raise ValueError( """At least one of `do_train` or `do_eval` or `do_test` must be True.""") if (args.do_train or args.do_eval) and args.do_test: raise ValueError( "Runing test must be independent of running train and/or dev") bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( """Cannot use sequence length {} because the BERT model was only trained up to sequence length {}""".format( args.max_seq_length, bert_config.max_position_embeddings)) if args.small_debug: args.output_dir = 'debug' if os.path.exists(args.output_dir): if not os.listdir(args.output_dir) == ["args_log.txt" ] and not args.small_debug: raise ValueError( "Output directory already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) os.makedirs(args.cache_dir, exist_ok=True) args_log = os.path.join(args.output_dir, "args_log.txt") if not os.path.exists(args_log): with open(args_log, "w") as writer: writer.write(json.dumps(args.__dict__)) else: print("args_log.txt already exists") task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) if "{}" in args.corenlp_cache_dir: args.corenlp_cache_dir = args.corenlp_cache_dir.format(task_name) processor = processors[task_name](args.data_dir, args.dataset_option) num_options = processor.get_num_options() if args.convert_from_ans_extr: if args.do_train: if args.train_predictions: processor.set_candidates("train", args.train_predictions) else: raise ValueError("train prediction file is missing") if args.do_eval: if args.eval_predictions: processor.set_candidates("dev", args.eval_predictions) else: raise ValueError("eval prediction file is missing") tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None def 
cache_features(examples, split_name): cache_spec_cand = [ task_name, args.dataset_option, split_name, args.input_ablation, ] cache_spec = "_".join( [str(x) for x in cache_spec_cand if x is not None]) cache_path = os.path.join(args.cache_dir, "{}.pkl".format(cache_spec)) if os.path.exists(cache_path) and not args.no_cache: features = pickle.load(open(cache_path, "rb")) else: if args.input_ablation or args.output_statistics: corenlp_cache_path = os.path.join( args.corenlp_cache_dir, "{}_{}.pkl".format(task_name, split_name)) corenlp_cache = pickle.load(open(corenlp_cache_path, "rb")) else: corenlp_cache = None if args.output_statistics: output_mcmrc.output_statistics(examples, corenlp_cache) tokenized_examples = generate_tokenized_examples( examples, tokenizer, args.input_ablation, corenlp_cache, args.entity_anonymization) if args.output_mturk or args.output_examples: for ex in examples: ex.input_ablation = "original" original_examples = generate_tokenized_examples( examples, tokenizer, None, None) if args.output_examples: output_mcmrc.output_examples( tokenized_examples, original_examples, task_name, 'ent_anon' if args.entity_anonymization else args.input_ablation, ) if args.output_mturk: output_mcmrc.output_mturk(tokenized_examples, original_examples, task_name, args.input_ablation) exit(1) features = convert_examples_to_features( tokenized_examples, num_options, args.max_seq_length, args.max_query_length, args.max_option_length, tokenizer, ) if not args.no_cache: with open(cache_path, "wb") as f: pickle.dump(features, f) # assert len(examples) == len(features) return features if args.do_train: train_examples = processor.get_train_examples() if args.small_debug: train_examples = train_examples[:6000] num_train_per_epoch = len(train_examples) num_train_per_epoch /= args.train_batch_size num_train_per_epoch /= args.gradient_accumulation_steps num_train_steps = int(num_train_per_epoch * args.num_train_epochs) train_features = cache_features(train_examples, "train") if args.do_eval: if args.eval_on_train: eval_examples = processor.get_train_examples() eval_features = cache_features(eval_examples, "train") else: eval_examples = processor.get_dev_examples() if args.small_debug: eval_examples = eval_examples[:1000] eval_features = cache_features(eval_examples, "dev") if args.do_test: eval_examples = processor.get_test_examples() eval_features = cache_features(eval_examples, "test") global entity_set if args.entity_anonymization: if len(entity_set) == 0: anon_tag_cache_file = os.path.join( args.cache_dir, f'{task_name}_anon_tags_{args.entity_anonymization}.pkl') if not os.path.exists(anon_tag_cache_file): raise ValueError("vocabulary cache cannot be loaded") entity_set = pickle.load(open(anon_tag_cache_file, 'rb')) tokenizer.vocab_update(sorted(entity_set)) else: anon_tag_cache_file = os.path.join( args.cache_dir, f'{task_name}_anon_tags_{args.entity_anonymization}.pkl') if not os.path.exists(anon_tag_cache_file): with open(anon_tag_cache_file, 'wb') as f: pickle.dump(entity_set, f) # Prepare model model = BertForMultipleChoice(bert_config, num_options) if args.init_checkpoint is not None: state_dict = torch.load(args.init_checkpoint, map_location="cpu") if list(state_dict)[0].startswith("bert."): # finetuned on some target dataset model.load_state_dict(state_dict) else: # pretrained language model model.bert.load_state_dict(state_dict) if args.entity_anonymization and len(entity_set): model.bert.embeddings.extend_word_embeddings(len(entity_set)) if args.limit_vocab_size or args.limit_vocab_freq: 
use_vocab, train_features, eval_features = vocab_selection( train_features, eval_features, args.cache_dir, args.output_dir, task_name, tokenizer, args.entity_anonymization, args.limit_vocab_size, args.limit_vocab_freq, num_options=num_options, ) id_to_token = {v: k for k, v in tokenizer.vocab.items()} use_tokens = [id_to_token[i] for i in use_vocab] logger.info(sorted(use_tokens)) logger.info(f'{len(use_tokens)}') model.bert.embeddings.limit_vocab(use_vocab) if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [ (n, param.clone().detach().to("cpu").float().requires_grad_()) for n, param in model.named_parameters() ] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to("cpu").requires_grad_()) for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ["bias", "gamma", "beta"] optimizer_grouped_parameters = [ { "params": [p for n, p in param_optimizer if n not in no_decay], "weight_decay_rate": 0.01, }, { "params": [p for n, p in param_optimizer if n in no_decay], "weight_decay_rate": 0.0, }, ] optimizer = BERTAdam( optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps, ) global_step = 0 if args.enter_debugger: model.eval() # features = convert_examples_to_features( # eval_examples, # num_options, # args.max_seq_length, # args.max_query_length, # args.max_option_length, # tokenizer, # ) # output = get_predictions( # model, eval_examples, features, args, device # ) # output_logits, output_predictions, eval_loss, eval_accuracy = output print("in debugger") import pdb pdb.set_trace() def eval_func(num_epoch=-1, num_step=-1, log_spec=None): logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) model.eval() output = get_predictions(model, eval_examples, eval_features, args, device) output_logits, output_predictions, eval_loss, eval_accuracy = output model.train() output_qids = [e.qid for e in eval_examples] output_answers = [e.ans_idx for e in eval_examples] result = { "eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "global_step": global_step, } output_spec = "" if num_epoch > -1 and num_step > -1: output_spec = "_{}_{}".format(num_epoch, num_step) elif log_spec: output_spec += "_{}".format(log_spec) output_eval_file = os.path.join( args.output_dir, "eval_results{}.json".format(output_spec)) result["spec"] = output_spec logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) with open(output_eval_file, "w") as writer: writer.write(json.dumps(result)) output_pred_file = os.path.join( args.output_dir, "eval_preds{}.jsonl".format(output_spec)) with open(output_pred_file, "w") as f: for qid, ans, pred, logit in zip(output_qids, output_answers, output_predictions, output_logits): result = { "qid": qid, "answer": chr(ans + ord("A")), "prediction": chr(pred + ord("A")), "logits": logit.tolist(), } f.write(json.dumps(result) + "\n") if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = 
torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for i in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 tmp_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/.. # ..sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): param.grad.data /= args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info("""FP16 TRAINING: Nan in gradients, reducing loss scaling""") args.loss_scale /= 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 if global_step % args.save_model_steps == 0: output_model_file = os.path.join( args.output_dir, "pytorch_model_step_{}.bin".format(global_step), ) if n_gpu > 1: torch.save(model.module.state_dict(), output_model_file) else: torch.save(model.state_dict(), output_model_file) tmp_loss += loss.item() if (args.loss_report_steps > 0 and global_step % args.loss_report_steps == 0): logger.info("Step loss: {}".format( tmp_loss / args.loss_report_steps)) tmp_loss = 0 if (args.eval_steps > 0 and global_step > 0 and global_step % args.eval_steps == 0 and args.do_eval): eval_func(i, global_step, args.log_spec) output_model_file = os.path.join( args.output_dir, "pytorch_model_epoch_{}.bin".format(i)) if n_gpu > 1: torch.save(model.module.state_dict(), output_model_file) else: torch.save(model.state_dict(), output_model_file) if args.do_eval: eval_func(i, global_step, args.log_spec) if not args.do_train and args.do_eval: eval_func(log_spec=args.log_spec or "dev") if args.do_test: eval_func(log_spec=args.log_spec or "test")
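# The scripts above handle torch.nn.DataParallel checkpoints in two different ways: saving
# `model.module.state_dict()` directly, or stripping a fixed-length "module." prefix with
# `key[7:]` at load time (which corrupts the keys if the checkpoint was saved without the
# wrapper). A small prefix-safe sketch; `load_state_dict_any` is a hypothetical helper name,
# not part of the original code:
import collections
import torch

def load_state_dict_any(model: torch.nn.Module, checkpoint_path: str) -> None:
    """Load a state dict whether or not it was saved from a DataParallel-wrapped model."""
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    cleaned = collections.OrderedDict(
        (k[len('module.'):] if k.startswith('module.') else k, v)
        for k, v in state_dict.items()
    )
    target = model.module if hasattr(model, 'module') else model
    target.load_state_dict(cleaned)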
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_config_file", default=None, type=str, required=True, help= "The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument( "--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) # Other parameters parser.add_argument( "--load_all", default=False, action='store_true', help="Whether to load all parameter or only for bert part.") parser.add_argument( "--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument("--source", default=None, type=str, help="Start point for finetune.") parser.add_argument( "--do_lower_case", default=False, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--multi", default=False, help="Whether to add adapter modules", action='store_true') parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument( "--optim", default='normal', help= "Whether to split up the optimiser between adapters and not adapters.") parser.add_argument( "--sample", default='rr', help="How to sample tasks, other options 'prop', 'sqrt' or 'anneal'") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--h_aug", default="n/a", help="Size of hidden state for adapters..") parser.add_argument("--tasks", default="all", type=str, help="Which set of tasks to train on.") parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--freeze", default=False, action='store_true', help="Freeze base network weights") parser.add_argument("--freeze_regex", default='no', help="Freeze code") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) args = parser.parse_args() # config logger task_names = args.tasks.split(',') output_dir = os.path.join( args.output_dir, args.source + '_TO_' + '_'.join(task_names) + '_' + os.path.basename(args.bert_config_file).replace('.json', '') + '_' + args.freeze_regex + '_' + uuid.uuid4().hex[:8]) tf_writer = SummaryWriter(os.path.join(output_dir, 'log')) json.dump(vars(args), open(os.path.join(output_dir, 'run_config.json'), 'w'), indent=2) os.makedirs(output_dir, exist_ok=True) log_file = os.path.join(output_dir, 'std.log') if log_file: logfile = logging.FileHandler(log_file, 'w') logger.addHandler(logfile) processors = { "mnli": MnliProcessor, "mrpc": MrpcProcessor, "rte": RTEProcessor, "sts": STSProcessor, "qqp": QQPProcessor, "qnli": QNLIProcessor, "snli": SNLIProcessor, "scitail": ScitailProcessor, "wnli": WnliProcessor, "msmarco": MsMarcoProcessor, "wikiqa": WikiqaProcessor } task_id_mappings = { 'mnli': 0, 'mrpc': 1, 'rte': 2, 'sts': 3, 'qqp': 4, 'qnli': 5, 'snli': 6, 'scitail': 7, 'wnli': 8, "msmarco": 9, "wikiqa": 10 } task_num_labels = [3, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2] device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid accumulate_gradients parameter: {}, should be >= 1". 
format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(args.max_seq_length, bert_config.max_position_embeddings)) os.makedirs(args.output_dir, exist_ok=True) task_names = args.tasks.split(',') if args.source != None: output_dir = os.path.join( args.output_dir, args.source + '_TO_' + '_'.join(task_names) + '_' + os.path.basename(args.bert_config_file).replace('.json', '') + '_' + uuid.uuid4().hex[:8]) else: output_dir = os.path.join( args.output_dir, '_'.join(task_names) + '_' + os.path.basename(args.bert_config_file).replace('.json', '')) tf_writer = SummaryWriter(os.path.join(output_dir, 'log')) json.dump(vars(args), open(os.path.join(output_dir, 'run_config.json'), 'w'), indent=2) os.makedirs(output_dir, exist_ok=True) processor_list = [processors[task_name]() for task_name in task_names] label_list = [processor.get_labels() for processor in processor_list] tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None num_tasks = len(task_names) if args.do_train: train_examples = [ processor.get_train_examples(args.data_dir + data_dir) for processor, data_dir in zip(processor_list, task_names) ] num_train_examples = [len(tr) for tr in train_examples] total_train_examples = sum(num_train_examples) num_train_steps = int(total_train_examples / args.train_batch_size * args.num_train_epochs) total_tr = num_train_steps steps_per_epoch = int(num_train_steps / args.num_train_epochs) if args.h_aug is not 'n/a': bert_config.hidden_size_aug = int(args.h_aug) model = BertForMultiNLI(bert_config, task_num_labels) if args.init_checkpoint is not None: if args.load_all: missing_keys, unexpected_keys = model.load_state_dict(torch.load( args.init_checkpoint, map_location='cpu'), strict=False) logger.info('missing keys: {}'.format(missing_keys)) logger.info('unexpected keys: {}'.format(unexpected_keys)) elif args.multi: partial = torch.load(args.init_checkpoint, map_location='cpu') model_dict = model.bert.state_dict() update = {} for n, p in model_dict.items(): if 'aug' in n or 'mult' in n: update[n] = p if 'pooler.mult' in n and 'bias' in n: update[n] = partial['pooler.dense.bias'] if 'pooler.mult' in n and 'weight' in n: update[n] = partial['pooler.dense.weight'] else: update[n] = partial[n] model.bert.load_state_dict(update) else: model.bert.load_state_dict( torch.load(args.init_checkpoint, map_location='cpu')) # Only initialized bert part params which has no 'bert' prefix bert_partial = torch.load(args.init_checkpoint, map_location='cpu') model_dict = model.state_dict() update = {} for n, p in model_dict.items(): if 'bert' in n: update[n[5:]] = bert_partial[n[5:]] missing_keys, unexpected_keys = model.bert.load_state_dict( update, strict=False) logger.info('missing keys: {}'.format(missing_keys)) logger.info('unexpected keys: {}'.format(unexpected_keys)) if args.freeze: for n, p in model.bert.named_parameters(): if 'aug' in n or 'classifier' in n or 'mult' in n 
or 'gamma' in n or 'beta' in n: continue p.requires_grad = False def freeze_by_layer(layerno): freeze_layers = list(range(12)) freeze_layers.remove(int(layerno)) freeze_layers = [ 'encoder.layer.{}.'.format(no) for no in freeze_layers ] for n, p in model.bert.named_parameters(): if 'embeddings' in n: p.requires_grad = False if 'pooler' in n: p.requires_grad = False for freeze_layer in freeze_layers: if n.startswith(freeze_layer): p.requires_grad = False for n, p in model.bert.named_parameters(): logger.info('{}\t{}'.format(p.requires_grad, n)) if args.freeze_regex in [str(x) for x in range(11)]: logger.info('Tune some layer!') freeze_by_layer(args.freeze_regex) if args.freeze_regex == 'all': logger.info('Tune all bias parameters!') for n, p in model.bert.named_parameters(): if "bias" not in n: p.requires_grad = False non_tuned += p.numel() else: tuned += p.numel() if args.freeze_regex == 'attention_bias': logger.info('Tune all attetnion bias parameters!') for n, p in model.bert.named_parameters(): if "bias" in n and 'attention' in n: tuned += p.numel() else: p.requires_grad = False non_tuned += p.numel() if args.freeze_regex == 'linear_bias': logger.info('Tune all linear bias parameters!') for n, p in model.bert.named_parameters(): if "bias" in n and ('output' in n or 'intermediate' in n): tuned += p.numel() else: p.requires_grad = False non_tuned += p.numel() if args.freeze_regex == 'layer_norm': logger.info('Tune all layer norm bias parameters!') for n, p in model.bert.named_parameters(): if 'gamma' in n or 'beta' in n: tuned += p.numel() else: p.requires_grad = False non_tuned += p.numel() if args.freeze_regex == 'attn_self': logger.info('Tune all layer attention parameters!') for n, p in model.bert.named_parameters(): if 'attention' in n: tuned += p.numel() else: p.requires_grad = False non_tuned += p.numel() for n, p in model.bert.named_parameters(): logger.info('{}\t{}'.format(p.requires_grad, n)) logger.info('tuned:{}({}), not tuned: {}'.format(tuned, round(tuned / total, 6), non_tuned)) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) if args.optim == 'normal': no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad ], 'weight_decay_rate': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad ], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=total_tr) else: no_decay = ['bias', 'gamma', 'beta'] base = ['attn'] optimizer_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in base) ], 'weight_decay_rate': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in base) ], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=total_tr) optimizer_parameters_mult = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in base) ], 'weight_decay_rate': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in base) ], 'weight_decay_rate': 0.0 }] optimizer_mult = BERTAdam(optimizer_parameters_mult, lr=3e-4, warmup=args.warmup_proportion, t_total=total_tr) if 
args.do_eval: eval_loaders = [] for i, task in enumerate(task_names): eval_examples = processor_list[i].get_dev_examples(args.data_dir + task_names[i]) eval_features = convert_examples_to_features( eval_examples, label_list[i], args.max_seq_length, tokenizer, task) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) eval_sampler = SequentialSampler(eval_data) eval_loaders.append( DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)) global_step = 0 if args.do_train: loaders = [] logger.info(" Num Tasks = %d", len(train_examples)) for i, task in enumerate(task_names): train_features = convert_examples_to_features( train_examples[i], label_list[i], args.max_seq_length, tokenizer, task) logger.info("***** training data for %s *****", task) logger.info(" Data size = %d", len(train_features)) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) train_sampler = RandomSampler(train_data) loaders.append( iter( DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size))) total_params = sum(p.numel() for p in model.parameters()) logger.info(" Num param = {}".format(total_params)) loaders = [cycle(it) for it in loaders] model.train() best_score = 0. if args.sample == 'sqrt' or args.sample == 'prop': probs = num_train_examples if args.sample == 'prop': alpha = 1. if args.sample == 'sqrt': alpha = 0.5 probs = [p**alpha for p in probs] tot = sum(probs) probs = [p / tot for p in probs] epoch = 0 tr_loss = [0. for i in range(num_tasks)] nb_tr_steps = [0 for i in range(num_tasks)] nb_tr_instances = [0 for i in range(num_tasks)] for _ in trange(int(args.num_train_epochs), desc="Epoch"): if args.sample == 'anneal': probs = num_train_examples alpha = 1. - 0.8 * epoch / (args.num_train_epochs - 1) probs = [p**alpha for p in probs] tot = sum(probs) probs = [p / tot for p in probs] for step in tqdm(range(steps_per_epoch)): task_index = np.random.choice(len(task_names), p=probs) task_model_index = task_id_mappings[task_names[task_index]] batch = next(loaders[task_index]) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, task_model_index, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss[task_index] += loss.item() * input_ids.size(0) nb_tr_instances[task_index] += input_ids.size(0) nb_tr_steps[task_index] += 1 # if step % 1000 < num_tasks: # logger.info("Task: {}, Step: {}".format(task_names[task_index], step)) # logger.info("Loss: {}".format(tr_loss[task_index]/nb_tr_instances[task_index])) tf_writer.add_scalar( 'loss/{}'.format(task_names[task_index]), tr_loss[task_index] / nb_tr_instances[task_index], nb_tr_steps[task_index]) if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients if args.optim != 'normal': optimizer_mult.step() model.zero_grad() global_step += 1 epoch += 1 ev_acc = 0. for i, task in enumerate(task_names): ev_acc += do_eval(model, logger, args, device, tr_loss[i], nb_tr_steps[i], global_step, processor_list[i], label_list[i], tokenizer, eval_loaders[i], task_id_mappings[task], task_names, task, output_dir) logger.info("Total acc: {}".format(ev_acc)) if ev_acc > best_score: best_score = ev_acc model_dir = os.path.join(output_dir, "best_model.pth") torch.save(model.state_dict(), model_dir) output_eval_file = os.path.join(output_dir, "best_eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Best Eval results *****") writer.write("%s = %s\n" % ('best_score', ev_acc))
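# The multi-task loop above draws one task per step with probabilities derived from the
# per-task example counts: alpha = 1.0 for 'prop', 0.5 for 'sqrt', and an annealed
# alpha = 1 - 0.8 * epoch / (num_epochs - 1) for 'anneal'. A standalone sketch of that
# computation; `task_sampling_probs` is a hypothetical helper name, and the uniform
# fallback for 'rr' is a stand-in (the script handles round-robin differently):
from typing import List

def task_sampling_probs(num_examples: List[int], sample: str = 'rr',
                        epoch: int = 0, num_epochs: int = 1) -> List[float]:
    """Return per-task sampling probabilities for proportional, sqrt, or annealed sampling."""
    if sample == 'prop':
        alpha = 1.0
    elif sample == 'sqrt':
        alpha = 0.5
    elif sample == 'anneal':
        alpha = 1.0 - 0.8 * epoch / max(1, num_epochs - 1)
    else:
        return [1.0 / len(num_examples)] * len(num_examples)
    weights = [n ** alpha for n in num_examples]
    total = sum(weights)
    return [w / total for w in weights]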
class Instructor: def __init__(self, args): self.opt = args self.writer = SummaryWriter(log_dir=self.opt.output_dir) # tensorboard bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(args.max_seq_length, bert_config.max_position_embeddings)) # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) # os.makedirs(args.output_dir, exist_ok=True) self.dataset = ReadData(self.opt) # Read the data and preprocess it self.num_train_steps = None self.num_train_steps = int( len(self.dataset.train_examples) / self.opt.train_batch_size / self.opt.gradient_accumulation_steps * self.opt.num_train_epochs) self.opt.label_size = len(self.dataset.label_list) args.output_dim = len(self.dataset.label_list) print("label size: {}".format(args.output_dim)) # 初始化模型 print("initialize model ...") if args.model_class == BertForSequenceClassification: self.model = BertForSequenceClassification( bert_config, len(self.dataset.label_list)) else: self.model = model_classes[args.model_name](bert_config, args) if args.init_checkpoint is not None: self.model.bert.load_state_dict( torch.load(args.init_checkpoint, map_location='cpu')) if args.fp16: self.model.half() # 冻结参数 # for name, p in self.model.named_parameters(): # if name.startswith('bert.encoder.layer.11') or name.startswith('bert.encoder.layer.10') or name.startswith('bert.encoder.layer.9') or name.startswith('bert.encoder.layer.8'): # 冻结最后一层 # p.requires_grad = False # 计算模型的参数个数 n_trainable_params, n_nontrainable_params = 0, 0 for p in self.model.parameters(): n_params = torch.prod(torch.tensor( p.shape)) # torch.prod()表示计算所有元素的乘积 if p.requires_grad: # 是否需要求梯度 n_trainable_params += n_params else: n_nontrainable_params += n_params print('n_trainable_params: {0}, n_nontrainable_params: {1}'.format( n_trainable_params, n_nontrainable_params)) self.model.to(args.device) # 并行化 if self.opt.n_gpu > 1: self.model = torch.nn.DataParallel(self.model, device_ids=self.opt.gpu_id) # Prepare optimizer if args.fp16: self.param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in self.model.named_parameters()] elif args.optimize_on_cpu: self.param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in self.model.named_parameters()] else: self.param_optimizer = list(self.model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in self.param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in self.param_optimizer if n in no_decay], 'weight_decay_rate': 0.0 }] self.optimizer = BERTAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=self.num_train_steps) # 配置自己模型的优化器 # [p for pname, p in self.param_optimizer if not pname.startswith('module.bert')] # self.optimizer_me = torch.optim.Adam( # [{'params': [p for pname, p in self.param_optimizer if not pname.startswith('module.bert')]}], lr=0.001, # weight_decay=0) # self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer_me, mode='max', # patience=3) # 3个epoch后,所监测的值停止增加时自动调整学习率 self.global_step = 0 # 初始化全局步数为 0 self.max_test_acc = 0 self.max_test_f1 = 0 def do_train(self): # 训练模型 # for _ in 
trange(int(args.num_train_epochs), desc="Epoch"): for i_epoch in range(int(args.num_train_epochs)): print('>' * 100) print('epoch: ', i_epoch) tr_loss = 0 train_accuracy = 0 nb_tr_examples, nb_tr_steps = 0, 0 y_pred = [] y_true = [] self.model.train() # 让模型处于训练状态,因为每跑完一个epoch就会处于测试状态 for step, batch in enumerate( tqdm(self.dataset.train_dataloader, desc="Training")): # batch = tuple(t.to(self.opt.device) for t in batch) input_ids, input_mask, segment_ids, label_ids, \ input_t_ids, input_t_mask, segment_t_ids, \ input_without_t_ids, input_without_t_mask, segment_without_t_ids, \ input_left_t_ids, input_left_t_mask, segment_left_t_ids, \ input_right_t_ids, input_right_t_mask, segment_right_t_ids, \ input_left_ids, input_left_mask, segment_left_ids = batch input_ids = input_ids.to(self.opt.device) segment_ids = segment_ids.to(self.opt.device) input_mask = input_mask.to(self.opt.device) label_ids = label_ids.to(self.opt.device) if self.opt.model_class in [ BertForSequenceClassification, CNN ]: loss, logits = self.model(input_ids, segment_ids, input_mask, label_ids) else: input_t_ids = input_t_ids.to(self.opt.device) input_t_mask = input_t_mask.to(self.opt.device) segment_t_ids = segment_t_ids.to(self.opt.device) if self.opt.model_class == MemNet: input_without_t_ids = input_without_t_ids.to( self.opt.device) input_without_t_mask = input_without_t_mask.to( self.opt.device) segment_without_t_ids = segment_without_t_ids.to( self.opt.device) loss, logits = self.model(input_without_t_ids, segment_without_t_ids, input_without_t_mask, label_ids, input_t_ids, input_t_mask, segment_t_ids) elif self.opt.model_class in [Cabasc]: input_left_t_ids = input_left_t_ids.to(self.opt.device) input_left_t_mask = input_left_t_mask.to( self.opt.device) segment_left_t_ids = segment_left_t_ids.to( self.opt.device) input_right_t_ids = input_right_t_ids.to( self.opt.device) input_right_t_mask = input_right_t_mask.to( self.opt.device) segment_right_t_ids = segment_right_t_ids.to( self.opt.device) loss, logits = self.model( input_ids, segment_ids, input_mask, label_ids, input_t_ids, input_t_mask, segment_t_ids, input_left_t_ids, input_left_t_mask, segment_left_t_ids, input_right_t_ids, input_right_t_mask, segment_right_t_ids) elif self.opt.model_class in [ RAM, TNet_LF, MGAN, TT, MLP, TD_BERT, TD_BERT_QA, DTD_BERT ]: input_left_ids = input_left_ids.to(self.opt.device) input_left_mask = input_left_mask.to(self.opt.device) segment_left_ids = segment_left_ids.to(self.opt.device) loss, logits = self.model( input_ids, segment_ids, input_mask, label_ids, input_t_ids, input_t_mask, segment_t_ids, input_left_ids, input_left_mask, segment_left_ids) else: loss, logits = self.model(input_ids, segment_ids, input_mask, label_ids, input_t_ids, input_t_mask, segment_t_ids) if self.opt.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() loss.backward() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 # 计算准确率 logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_train_accuracy = accuracy(logits, label_ids) y_pred.extend(np.argmax(logits, axis=1)) y_true.extend(label_ids) train_accuracy += tmp_train_accuracy if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in self.model.parameters(): param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( self.param_optimizer, self.model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 self.model.zero_grad() continue self.optimizer.step() # self.optimizer_me.step() copy_optimizer_params_to_model( self.model.named_parameters(), self.param_optimizer) else: self.optimizer.step() # self.optimizer_me.step() self.model.zero_grad() self.global_step += 1 train_accuracy = train_accuracy / nb_tr_examples train_f1 = f1_score(y_true, y_pred, average='macro', labels=np.unique(y_true)) result = self.do_eval() # 每跑完一轮,测试一次 tr_loss = tr_loss / nb_tr_steps # self.scheduler.step(result['eval_accuracy']) # 监测验证集的精度 self.writer.add_scalar('train_loss', tr_loss, i_epoch) self.writer.add_scalar('train_accuracy', train_accuracy, i_epoch) self.writer.add_scalar('eval_accuracy', result['eval_accuracy'], i_epoch) self.writer.add_scalar('eval_loss', result['eval_loss'], i_epoch) # self.writer.add_scalar('lr', self.optimizer_me.param_groups[0]['lr'], i_epoch) print( "Results: train_acc: {0:.6f} | train_f1: {1:.6f} | train_loss: {2:.6f} | eval_accuracy: {3:.6f} | eval_loss: {4:.6f} | eval_f1: {5:.6f} | max_test_acc: {6:.6f} | max_test_f1: {7:.6f}" .format(train_accuracy, train_f1, tr_loss, result['eval_accuracy'], result['eval_loss'], result['eval_f1'], self.max_test_acc, self.max_test_f1)) def do_eval(self): # 测试准确率 self.model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 # confidence = [] y_pred = [] y_true = [] for batch in tqdm(self.dataset.eval_dataloader, desc="Evaluating"): # batch = tuple(t.to(self.opt.device) for t in batch) input_ids, input_mask, segment_ids, label_ids, \ input_t_ids, input_t_mask, segment_t_ids, \ input_without_t_ids, input_without_t_mask, segment_without_t_ids, \ input_left_t_ids, input_left_t_mask, segment_left_t_ids, \ input_right_t_ids, input_right_t_mask, segment_right_t_ids, \ input_left_ids, input_left_mask, segment_left_ids = batch input_ids = input_ids.to(self.opt.device) segment_ids = segment_ids.to(self.opt.device) input_mask = input_mask.to(self.opt.device) label_ids = label_ids.to(self.opt.device) with torch.no_grad(): # 不计算梯度 if self.opt.model_class in [ BertForSequenceClassification, CNN ]: loss, logits = self.model(input_ids, segment_ids, input_mask, label_ids) else: input_t_ids = input_t_ids.to(self.opt.device) input_t_mask = input_t_mask.to(self.opt.device) segment_t_ids = segment_t_ids.to(self.opt.device) if self.opt.model_class == MemNet: input_without_t_ids = input_without_t_ids.to( self.opt.device) input_without_t_mask = 
input_without_t_mask.to( self.opt.device) segment_without_t_ids = segment_without_t_ids.to( self.opt.device) loss, logits = self.model(input_without_t_ids, segment_without_t_ids, input_without_t_mask, label_ids, input_t_ids, input_t_mask, segment_t_ids) elif self.opt.model_class in [Cabasc]: input_left_t_ids = input_left_t_ids.to(self.opt.device) input_left_t_ids = input_left_t_ids.to(self.opt.device) input_left_t_mask = input_left_t_mask.to( self.opt.device) segment_left_t_ids = segment_left_t_ids.to( self.opt.device) input_right_t_ids = input_right_t_ids.to( self.opt.device) input_right_t_mask = input_right_t_mask.to( self.opt.device) segment_right_t_ids = segment_right_t_ids.to( self.opt.device) loss, logits = self.model( input_ids, segment_ids, input_mask, label_ids, input_t_ids, input_t_mask, segment_t_ids, input_left_t_ids, input_left_t_mask, segment_left_t_ids, input_right_t_ids, input_right_t_mask, segment_right_t_ids) elif self.opt.model_class in [ RAM, TNet_LF, MGAN, TT, MLP, TD_BERT, TD_BERT_QA, DTD_BERT ]: input_left_ids = input_left_ids.to(self.opt.device) input_left_mask = input_left_mask.to(self.opt.device) segment_left_ids = segment_left_ids.to(self.opt.device) loss, logits = self.model( input_ids, segment_ids, input_mask, label_ids, input_t_ids, input_t_mask, segment_t_ids, input_left_ids, input_left_mask, segment_left_ids) else: loss, logits = self.model(input_ids, segment_ids, input_mask, label_ids, input_t_ids, input_t_mask, segment_t_ids) # with torch.no_grad(): # 不计算梯度 # if self.opt.model_class in [BertForSequenceClassification, CNN]: # loss, logits = self.model(input_ids, segment_ids, input_mask, label_ids) # else: # loss, logits = self.model(input_ids, segment_ids, input_mask, labels=label_ids, # input_t_ids=input_t_ids, # input_t_mask=input_t_mask, segment_t_ids=segment_t_ids) # confidence.extend(torch.nn.Softmax(dim=1)(logits)[:, 1].tolist()) # 获取 positive 类的置信度 # loss = F.cross_entropy(logits, label_ids, size_average=False) # 计算mini-batch的loss总和 if self.opt.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) y_pred.extend(np.argmax(logits, axis=1)) y_true.extend(label_ids) # eval_loss += tmp_eval_loss.mean().item() eval_loss += loss.item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 # eval_loss = eval_loss / len(self.dataset.eval_examples) test_f1 = f1_score(y_true, y_pred, average='macro', labels=np.unique(y_true)) eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples if eval_accuracy > self.max_test_acc: self.max_test_acc = eval_accuracy if self.opt.do_predict: # 测试模式才保存模型 torch.save(self.model, self.opt.model_save_path) if test_f1 > self.max_test_f1: self.max_test_f1 = test_f1 result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'eval_f1': test_f1, } # output_eval_file = os.path.join(args.output_dir, "eval_results.txt") # with open(output_eval_file, "w") as writer: # logger.info("***** Eval results *****") # for key in sorted(result.keys()): # logger.info(" %s = %s", key, str(result[key])) # writer.write("%s = %s\n" % (key, str(result[key]))) # print("Eval results ==> eval_accuracy: {0}, eval_loss: {1}, max_test_acc: {2}".format( # result['eval_accuracy'], result['eval_loss'], self.max_test_acc)) return result def do_predict(self): # 加载保存的模型进行预测,获得准确率 # 读测试集的数据 # dataset = ReadData(self.opt) # 这个方法有点冗余了,读取了所有的数据,包括训练集 # Load model saved_model = torch.load(self.opt.model_save_path) saved_model.to(self.opt.device) saved_model.eval() nb_test_examples = 0 test_accuracy = 0 for batch in tqdm(self.dataset.eval_dataloader, desc="Testing"): batch = tuple(t.to(self.opt.device) for t in batch) input_ids, input_mask, segment_ids, label_ids, input_t_ids, input_t_mask, segment_t_ids = batch with torch.no_grad(): # Do not calculate gradient if self.opt.model_class in [ BertForSequenceClassification, CNN ]: _, logits = saved_model(input_ids, segment_ids, input_mask, label_ids) else: _, logits = saved_model(input_ids, segment_ids, input_mask, labels=label_ids, input_t_ids=input_t_ids, input_t_mask=input_t_mask, segment_t_ids=segment_t_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_test_accuracy = accuracy(logits, label_ids) test_accuracy += tmp_test_accuracy nb_test_examples += input_ids.size(0) test_accuracy = test_accuracy / nb_test_examples return test_accuracy def run(self): print('> training arguments:') for arg in vars(self.opt): print('>>> {0}: {1}'.format(arg, getattr(self.opt, arg))) self.do_train() print('>' * 100) if self.opt.do_predict: test_accuracy = self.do_predict() print("Test Set Accuracy: {}".format(test_accuracy)) print("Max validate Set Acc: {0}".format( self.max_test_acc)) # Output the final test accuracy self.writer.close() # return self.max_test_acc return self.max_test_f1
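# --- Illustrative sketch, not part of the original script --------------------
# In Instructor.__init__ the weight-decay grouping tests `n not in no_decay`,
# which compares the full parameter name (e.g. "bert.encoder...query.bias")
# against the three-element list, so in practice it never matches and every
# parameter receives weight decay. The later main() functions in this file use
# substring matching instead; a minimal sketch of that form, assuming a
# standard PyTorch model with named parameters:
import torch

def group_parameters_for_weight_decay(model, weight_decay_rate=0.01):
    """Group parameters so bias/gamma/beta tensors are excluded from weight decay."""
    no_decay = ['bias', 'gamma', 'beta']
    return [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': weight_decay_rate},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0},
    ]

# usage sketch:
# optimizer = BERTAdam(group_parameters_for_weight_decay(self.model),
#                      lr=args.learning_rate, warmup=args.warmup_proportion,
#                      t_total=self.num_train_steps)
# ------------------------------------------------------------------------------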
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default='data/semeval2015/three_joint/TO/', type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--output_dir", default='results/semeval2015/three_joint/TO/my_result', type=str, required=True, help="The output directory where the model checkpoints will be written." ) parser.add_argument( "--vocab_file", default='uncased_L-12_H-768_A-12/vocab.txt', type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument( "--bert_config_file", default='uncased_L-12_H-768_A-12/bert_config.json', type=str, required=True, help= "The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument( "--init_checkpoint", default='uncased_L-12_H-768_A-12/pytorch_model.bin', type=str, required=True, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument( "--tokenize_method", default='word_split', type=str, required=True, choices=["prefix_match", "unk_replace", "word_split"], help= "how to solve the unknow words, max prefix match or replace with [UNK] or split to some words" ) parser.add_argument("--use_crf", default=True, required=True, action='store_true', help="Whether to use CRF after Bert sequence_output") ## Other parameters parser.add_argument("--eval_test", default=True, action='store_true', help="Whether to run eval on the test set.") parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--train_batch_size", default=24, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=30.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--accumulate_gradients", type=int, default=1, help= "Number of steps to accumulate gradient on (divide the batch_size and accumulate)" ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." 
) args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.accumulate_gradients < 1: raise ValueError( "Invalid accumulate_gradients parameter: {}, should be >= 1". format(args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(args.max_seq_length, bert_config.max_position_embeddings)) processor = Semeval_Processor() label_list = processor.get_labels() ner_label_list = processor.get_ner_labels( args.data_dir) # BIO or TO tags for ner entity tokenizer = tokenization.FullTokenizer( vocab_file=args.vocab_file, tokenize_method=args.tokenize_method, do_lower_case=args.do_lower_case) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) # training set train_examples = None num_train_steps = None train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) train_features, _, _ = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, ner_label_list, args.tokenize_method) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_ner_label_ids = torch.tensor([f.ner_label_ids for f in train_features], dtype=torch.long) all_ner_mask = torch.tensor([f.ner_mask for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_ner_label_ids, all_ner_mask) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) # test set if args.eval_test: test_examples = processor.get_test_examples(args.data_dir) test_features, test_tokens, all_b_tokens = convert_examples_to_features( test_examples, label_list, args.max_seq_length, tokenizer, ner_label_list, args.tokenize_method) print("test_tokens") print(test_tokens) all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in 
test_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long) all_ner_label_ids = torch.tensor( [f.ner_label_ids for f in test_features], dtype=torch.long) all_ner_mask = torch.tensor([f.ner_mask for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_ner_label_ids, all_ner_mask) test_dataloader = DataLoader(test_data, batch_size=args.eval_batch_size, shuffle=False) # model and optimizer if args.use_crf: model = BertForTABSAJoint_CRF(bert_config, len(label_list), len(ner_label_list)) else: model = BertForTABSAJoint(bert_config, len(label_list), len(ner_label_list), args.max_seq_length) if args.init_checkpoint is not None: model.bert.load_state_dict( torch.load(args.init_checkpoint, map_location='cpu')) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) # train output_log_file = os.path.join(args.output_dir, "log.txt") print("output_log_file=", output_log_file) with open(output_log_file, "w") as writer: if args.eval_test: writer.write( "epoch\tglobal_step\tloss\ttest_loss\ttest_accuracy\n") else: writer.write("epoch\tglobal_step\tloss\n") global_step = 0 epoch = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): epoch += 1 model.train() tr_loss = 0 tr_ner_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, ner_label_ids, ner_mask = batch loss, ner_loss, _, _ = model(input_ids, segment_ids, input_mask, label_ids, ner_label_ids, ner_mask) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
ner_loss = ner_loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps ner_loss = ner_loss / args.gradient_accumulation_steps loss.backward(retain_graph=True) ner_loss.backward() tr_loss += loss.item() tr_ner_loss += ner_loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients model.zero_grad() global_step += 1 torch.save(model.state_dict(), os.path.join(args.output_dir, f'model_ep_{epoch}.bin')) # eval_test if args.eval_test: model.eval() test_loss, test_accuracy = 0, 0 ner_test_loss = 0 nb_test_steps, nb_test_examples = 0, 0 with open( os.path.join(args.output_dir, "test_ep_" + str(epoch) + ".txt"), "w") as f_test: f_test.write( 'yes_not\tyes_not_pre\tsentence\ttrue_ner\tpredict_ner\n') batch_index = 0 for input_ids, input_mask, segment_ids, label_ids, ner_label_ids, ner_mask in test_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) ner_label_ids = ner_label_ids.to(device) ner_mask = ner_mask.to(device) # test_tokens is the origin word in sentences [batch_size, sequence_length] ner_test_tokens = test_tokens[batch_index * args.eval_batch_size: (batch_index + 1) * args.eval_batch_size] ner_b_tokens = all_b_tokens[batch_index * args.eval_batch_size: (batch_index + 1) * args.eval_batch_size] batch_index += 1 with torch.no_grad(): tmp_test_loss, tmp_ner_test_loss, logits, ner_predict = model( input_ids, segment_ids, input_mask, label_ids, ner_label_ids, ner_mask) # category & polarity logits = F.softmax(logits, dim=-1) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() outputs = np.argmax(logits, axis=1) if args.use_crf: # CRF ner_logits = ner_predict else: # softmax ner_logits = torch.argmax(F.log_softmax(ner_predict, dim=2), dim=2) ner_logits = ner_logits.detach().cpu().numpy() ner_label_ids = ner_label_ids.to('cpu').numpy() ner_mask = ner_mask.to('cpu').numpy() print("outputs") print(outputs) print("ner_label_list") print(ner_label_list) print("ner_test_tokens") print(ner_test_tokens) print("ner_b_tokens") print(ner_b_tokens) print("label_ids") print(label_ids) print("ner_logits") print(ner_logits) for output_i in range(len(outputs)): # category & polarity f_test.write(str(label_ids[output_i])) f_test.write('\t') f_test.write(str(outputs[output_i])) f_test.write('\t') # sentence & ner labels sentence_clean = [] aspect_sentiment_clean = [] label_true = [] label_pre = [] sentence_len = len(ner_test_tokens[output_i]) aspect_sentiment_len = len(ner_b_tokens[output_i]) for i in range(aspect_sentiment_len): if not ner_b_tokens[output_i][i].startswith('##'): aspect_sentiment_clean.append( ner_b_tokens[output_i][i]) for i in range(sentence_len): if not ner_test_tokens[output_i][i].startswith( '##'): sentence_clean.append( ner_test_tokens[output_i][i]) label_true.append( ner_label_list[ner_label_ids[output_i][i]]) label_pre.append( ner_label_list[ner_logits[output_i][i]]) f_test.write(' '.join(sentence_clean)) f_test.write('\t') f_test.write(' '.join(aspect_sentiment_clean)) f_test.write('\t') f_test.write(' '.join(label_true)) f_test.write("\t") f_test.write(' '.join(label_pre)) f_test.write("\n") tmp_test_accuracy = np.sum(outputs == label_ids) test_loss += tmp_test_loss.mean().item() ner_test_loss += tmp_ner_test_loss.mean().item() test_accuracy += tmp_test_accuracy nb_test_examples += input_ids.size(0) 
                    nb_test_steps += 1

            test_loss = test_loss / nb_test_steps
            ner_test_loss = ner_test_loss / nb_test_steps
            test_accuracy = test_accuracy / nb_test_examples

        result = collections.OrderedDict()
        if args.eval_test:
            result = {'epoch': epoch,
                      'global_step': global_step,
                      'loss': tr_loss / nb_tr_steps,
                      'test_loss': test_loss,
                      'ner_test_loss': ner_test_loss,
                      'test_accuracy': test_accuracy}
        else:
            result = {'epoch': epoch,
                      'global_step': global_step,
                      'loss': tr_loss / nb_tr_steps,
                      'ner_loss': tr_ner_loss / nb_tr_steps}

        logger.info("***** Eval results *****")
        with open(output_log_file, "a+") as writer:
            for key in result.keys():
                logger.info(" %s = %s\n", key, str(result[key]))
                writer.write("%s\t" % (str(result[key])))
            writer.write("\n")
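# --- Illustrative sketch, not part of the original script --------------------
# The training loop above backpropagates the classification loss with
# retain_graph=True and then the NER loss in a second pass. Because gradients
# of a sum are the sums of gradients, a single backward on the summed loss
# accumulates the same gradients without keeping the graph alive twice; a
# minimal sketch (function name is hypothetical):
import torch

def combined_backward(loss, ner_loss):
    """Backpropagate both losses in one pass and return the detached total."""
    total = loss + ner_loss
    total.backward()
    return total.detach()
# ------------------------------------------------------------------------------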
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument("--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument("--do_lower_case", default=False, action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--gpu_ids", type=str, default="0", help="select one gpu to use") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumualte before performing a backward/update pass.") parser.add_argument('--optimize_on_cpu', default=False,action='store_true', help="Whether to perform optimization and keep the optimizer averages on CPU") parser.add_argument('--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=128, help='Loss scaling, positive power of 2 values can improve fp16 convergence.') ################## atec parser.add_argument('--do_submit', default=False, action='store_true', help="submit to results to atec cloud") parser.add_argument('--train_devset', default=False, action='store_true', help="") parser.add_argument("--test_in_file", default=None, type=str, help="") parser.add_argument("--test_out_file", default=None, type=str, help="") args = parser.parse_args() if args.local_rank == -1 and not args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") #device = torch.device("cuda", args.gpu_id) n_gpu = len(args.gpu_ids.split(','))#torch.cuda.device_count() elif args.no_cuda: device = torch.device('cpu') n_gpu = 0 else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info("16-bits training currently not supported in distributed training") args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) #if not args.do_train and not args.do_eval: # raise ValueError("At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format( args.max_seq_length, bert_config.max_position_embeddings)) #if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) #, exist_ok=True) processor = AtecProcessor() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer( vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) 
train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) if args.train_devset: eval_examples = processor.get_dev_examples(args.data_dir) train_examples += eval_examples num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model logger.info('build model') model = BertForSequenceClassification(bert_config, len(label_list)) #model = BertSiameseModel(bert_config, len(label_list)) if args.init_checkpoint is not None: try: # just model.bert model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')) except RuntimeError as e: # all model import re new_state_dict = collections.OrderedDict() state_dict = torch.load(args.init_checkpoint, map_location='cpu') for key in state_dict.keys(): new_key = re.sub("module\.", "", key) new_state_dict[new_key] = state_dict[key] model.load_state_dict(new_state_dict) if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model, device_ids=[int(x) for x in args.gpu_ids.split(',')]) global_step = 0 tr_loss=None if args.do_train: # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if n in no_decay], 'weight_decay_rate': 0.0} ] optimizer = BERTAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) from tqdm import tqdm, trange from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) # train_features = convert_examples_to_siamese_features( # train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # all_tokens_a = torch.tensor([f.tokens_a for f in train_features], dtype=torch.long) # all_types_a = torch.tensor([f.types_a for f in train_features], dtype=torch.long) # all_mask_a = torch.tensor([f.mask_a for f in train_features], dtype=torch.long) # all_tokens_b = torch.tensor([f.tokens_b for f in train_features], dtype=torch.long) # all_types_b = torch.tensor([f.types_b for f in train_features], dtype=torch.long) # all_mask_b = 
torch.tensor([f.mask_b for f in train_features], dtype=torch.long) # all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) # train_data = TensorDataset(all_tokens_a, all_types_a, all_mask_a, # all_tokens_b, all_types_b, all_mask_b, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(train_dataloader, desc="Iteration") as pbar: for step, batch in enumerate(pbar): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, logits = model(input_ids, segment_ids, input_mask, label_ids) # tokens_a, types_a, mask_a, tokens_b, types_b, mask_b, label_ids = batch # loss, logits = model(tokens_a, types_a, mask_a, tokens_b, types_b, # mask_b, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() preds = th.argmax(logits, dim=1) acc = accuracy_score(label_ids.detach().cpu().numpy(), preds.detach().cpu().numpy()) pbar.set_postfix(loss=loss.item(), acc=acc.item()) #nb_tr_examples += input_ids.size(0) nb_tr_examples += label_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling") args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model(model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 torch.save(model.state_dict(), os.path.join(args.output_dir, "pytorch_model.bin")) if args.do_eval: from tqdm import tqdm, trange from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) # eval_features = convert_examples_to_siamese_features( # eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # all_tokens_a = torch.tensor([f.tokens_a for f in eval_features], 
dtype=torch.long) # all_types_a = torch.tensor([f.types_a for f in eval_features], dtype=torch.long) # all_mask_a = torch.tensor([f.mask_a for f in eval_features], dtype=torch.long) # all_tokens_b = torch.tensor([f.tokens_b for f in eval_features], dtype=torch.long) # all_types_b = torch.tensor([f.types_b for f in eval_features], dtype=torch.long) # all_mask_b = torch.tensor([f.mask_b for f in eval_features], dtype=torch.long) # all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) # eval_data = TensorDataset(all_tokens_a, all_types_a, all_mask_a, # all_tokens_b, all_types_b, all_mask_b, all_label_ids) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true = [] y_pred = [] for batch in tqdm(eval_dataloader): # tokens_a, types_a, mask_a, tokens_b, types_b, mask_b, label_ids = batch input_ids, input_mask, segment_ids, label_ids = batch input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) # tmp_eval_loss, logits = model(tokens_a, types_a, mask_a, # tokens_b, types_b, mask_b, label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() preds = np.argmax(logits, axis=1) y_true.extend(label_ids) y_pred.extend(preds) tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy #nb_eval_examples += input_ids.size(0) nb_eval_examples += label_ids.size # np.array nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'precision': precision_score(y_true, y_pred), 'recall': recall_score(y_true, y_pred), 'f1': f1_score(y_true, y_pred)} if tr_loss is not None: result['loss'] = tr_loss/nb_tr_steps output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: logger.info("***** Eval results *****") writer.write("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if args.do_submit: eval_examples = processor.get_test_examples(args.test_in_file) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) # eval_features = convert_examples_to_siamese_features( # eval_examples, label_list, args.max_seq_length, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) #eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # all_tokens_a = torch.tensor([f.tokens_a for f in eval_features], dtype=torch.long) # all_types_a = torch.tensor([f.types_a for f in eval_features], dtype=torch.long) # all_mask_a = torch.tensor([f.mask_a for f in eval_features], dtype=torch.long) # all_tokens_b = 
torch.tensor([f.tokens_b for f in eval_features], dtype=torch.long) # all_types_b = torch.tensor([f.types_b for f in eval_features], dtype=torch.long) # all_mask_b = torch.tensor([f.mask_b for f in eval_features], dtype=torch.long) # all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) # eval_data = TensorDataset(all_tokens_a, all_types_a, all_mask_a, # all_tokens_b, all_types_b, all_mask_b, all_label_ids) #eval_sampler = SequentialSampler(eval_data) #eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() y_pred = [] batch_size = args.eval_batch_size for i in range(0, len(all_label_ids), batch_size): input_ids = all_input_ids[i:i+batch_size] input_mask = all_input_mask[i:i+batch_size] segment_ids = all_segment_ids[i:i+batch_size] label_ids = all_label_ids[i:i+batch_size] # tokens_a = all_tokens_a[i:i+batch_size] # types_a = all_types_a[i:i+batch_size] # mask_a = all_mask_a[i:i+batch_size] # tokens_b = all_tokens_b[i:i+batch_size] # types_b = all_types_b[i:i+batch_size] # mask_b = all_mask_b[i:i+batch_size] with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask) # logits = model(tokens_a, types_a, mask_a, # tokens_b, types_b, mask_b) logits = logits.detach().cpu().numpy() preds = np.argmax(logits, axis=1) y_pred.extend(preds) with open(args.test_out_file, "w") as writer: for i, y in enumerate(y_pred): writer.write('{}\t{}\n'.format(i+1, y))
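# --- Illustrative sketch, not part of the original script --------------------
# This main() loads --init_checkpoint in a try/except: the fallback uses
# re.sub("module\.", "", key) because checkpoints saved from a
# torch.nn.DataParallel-wrapped model carry a "module." prefix on every state
# dict key. A minimal sketch of that fallback (function name is hypothetical):
import collections
import torch

def load_checkpoint_strip_module(model, path):
    """Load a checkpoint, stripping the 'module.' prefix left by DataParallel if present."""
    state_dict = torch.load(path, map_location='cpu')
    cleaned = collections.OrderedDict(
        (k[len('module.'):] if k.startswith('module.') else k, v)
        for k, v in state_dict.items())
    model.load_state_dict(cleaned)
# ------------------------------------------------------------------------------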
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument("--do_lower_case", default=False, action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--discr", default=False, action='store_true', help="Whether to do discriminative fine-tuning.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--accumulate_gradients", type=int, default=1, help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumualte before performing a backward/update pass.") args = parser.parse_args() processors = { "all":AllProcessor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.accumulate_gradients < 1: raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format( args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format( args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer( vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) model = BertForSequenceClassification(bert_config, len(label_list)) if args.init_checkpoint is not None: model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')) # torch.save(model.bert.state_dict(),"pytorch_model.bin") model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ['bias', 'gamma', 'beta'] if args.discr: group1=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.'] group2=['layer.6.','layer.7.','layer.8.','layer.9.','layer.10.'] group3=['layer.11.'] 
group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.'] optimizer_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay_rate': 0.01}, {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay_rate': 0.01, 'lr': args.learning_rate/1.5}, {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay_rate': 0.01, 'lr': args.learning_rate}, {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay_rate': 0.01, 'lr': args.learning_rate*1.5}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay_rate': 0.0}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay_rate': 0.0, 'lr': args.learning_rate/1.5}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay_rate': 0.0, 'lr': args.learning_rate}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay_rate': 0.0, 'lr': args.learning_rate*1.5}, ] else: optimizer_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 eval_examples = processor.get_dev_examples(args.data_dir) eval_features_1, eval_features_2, eval_features_3, eval_features_4 = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in eval_features_1], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features_1], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features_1], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features_1], dtype=torch.long) all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in eval_features_1], dtype=torch.long) eval_data_1 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids) eval_dataloader_1 = DataLoader(eval_data_1, batch_size=args.eval_batch_size, shuffle=False) all_input_ids = torch.tensor([f.input_ids for f in eval_features_2], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features_2], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features_2], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features_2], dtype=torch.long) all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in eval_features_2], dtype=torch.long) eval_data_2 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids) eval_dataloader_2 = DataLoader(eval_data_2, batch_size=args.eval_batch_size, shuffle=False) all_input_ids = torch.tensor([f.input_ids for f 
in eval_features_3], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features_3], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features_3], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features_3], dtype=torch.long) all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in eval_features_3], dtype=torch.long) eval_data_3 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids) eval_dataloader_3 = DataLoader(eval_data_3, batch_size=args.eval_batch_size, shuffle=False) all_input_ids = torch.tensor([f.input_ids for f in eval_features_4], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features_4], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features_4], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features_4], dtype=torch.long) all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in eval_features_4], dtype=torch.long) eval_data_4 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids) eval_dataloader_4 = DataLoader(eval_data_4, batch_size=args.eval_batch_size, shuffle=False) if args.do_train: train_features_1, train_features_2, train_features_3, train_features_4 = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features_1], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features_1], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features_1], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features_1], dtype=torch.long) all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in train_features_1], dtype=torch.long) train_data_1 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids) if args.local_rank == -1: train_sampler_1 = RandomSampler(train_data_1) else: train_sampler_1 = DistributedSampler(train_data_1) train_dataloader_1 = DataLoader(train_data_1, sampler=train_sampler_1, batch_size=args.train_batch_size) all_input_ids = torch.tensor([f.input_ids for f in train_features_2], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features_2], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features_2], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features_2], dtype=torch.long) all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in train_features_2], dtype=torch.long) train_data_2 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids) if args.local_rank == -1: train_sampler_2 = RandomSampler(train_data_2) else: train_sampler_2 = DistributedSampler(train_data_2) train_dataloader_2 = DataLoader(train_data_2, sampler=train_sampler_2, batch_size=args.train_batch_size) all_input_ids = torch.tensor([f.input_ids for f in train_features_3], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features_3], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features_3], dtype=torch.long) 
all_label_ids = torch.tensor([f.label_id for f in train_features_3], dtype=torch.long) all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in train_features_3], dtype=torch.long) train_data_3 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids) if args.local_rank == -1: train_sampler_3 = RandomSampler(train_data_3) else: train_sampler_3 = DistributedSampler(train_data_3) train_dataloader_3 = DataLoader(train_data_3, sampler=train_sampler_3, batch_size=args.train_batch_size) all_input_ids = torch.tensor([f.input_ids for f in train_features_4], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features_4], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features_4], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features_4], dtype=torch.long) all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in train_features_4], dtype=torch.long) train_data_4 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids) if args.local_rank == -1: train_sampler_4 = RandomSampler(train_data_4) else: train_sampler_4 = DistributedSampler(train_data_4) train_dataloader_4 = DataLoader(train_data_4, sampler=train_sampler_4, batch_size=args.train_batch_size) print("len(train_features_1)=",len(train_features_1)) print("len(train_features_2)=",len(train_features_2)) print("len(train_features_3)=",len(train_features_3)) print("len(train_features_4)=",len(train_features_4)) a=[] for i in range(int(len(train_features_1)/args.train_batch_size)): a.append(1) for i in range(int(len(train_features_2)/args.train_batch_size)): a.append(2) for i in range(int(len(train_features_3)/args.train_batch_size)): a.append(3) for i in range(int(len(train_features_4)/args.train_batch_size)): a.append(4) print("len(a)=",len(a)) random.shuffle(a) print("a[:20]=",a[:20]) epoch=0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): random.shuffle(a) print("a[:20]=",a[:20]) epoch+=1 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, number in enumerate((tqdm(a, desc="Iteration"))): if number==1:batch=train_dataloader_1.__iter__().__next__() if number==2:batch=train_dataloader_2.__iter__().__next__() if number==3:batch=train_dataloader_3.__iter__().__next__() if number==4:batch=train_dataloader_4.__iter__().__next__() batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, dataset_label_id = batch loss, _ = model(input_ids, segment_ids, input_mask, label_ids, dataset_label_id) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients model.zero_grad() global_step += 1 torch.save(model.module.bert.state_dict(), os.path.join(args.output_dir,'pytorch_model'+str(epoch)+'.bin')) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 with open(os.path.join(args.output_dir, "results_imdb_ep_"+str(epoch)+".txt"),"w") as f: for input_ids, input_mask, segment_ids, label_ids, dataset_label_id in eval_dataloader_1: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, dataset_label_id) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() outputs = np.argmax(logits, axis=1) for output in outputs: f.write(str(output)+"\n") tmp_eval_accuracy=np.sum(outputs == label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples eval_loss_imdb = eval_loss eval_accuracy_imdb = eval_accuracy eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 with open(os.path.join(args.output_dir, "results_yelp_p_ep_"+str(epoch)+".txt"),"w") as f: for input_ids, input_mask, segment_ids, label_ids, dataset_label_id in eval_dataloader_2: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, dataset_label_id) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() outputs = np.argmax(logits, axis=1) for output in outputs: f.write(str(output)+"\n") tmp_eval_accuracy=np.sum(outputs == label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples eval_loss_yelp_p = eval_loss eval_accuracy_yelp_p = eval_accuracy eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 with open(os.path.join(args.output_dir, "results_ag_ep_"+str(epoch)+".txt"),"w") as f: for input_ids, input_mask, segment_ids, label_ids, dataset_label_id in eval_dataloader_3: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, dataset_label_id) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() outputs = np.argmax(logits, axis=1) for output in outputs: f.write(str(output)+"\n") tmp_eval_accuracy=np.sum(outputs == label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples eval_loss_ag = eval_loss eval_accuracy_ag = eval_accuracy eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 with open(os.path.join(args.output_dir, 
"results_dbpedia_ep_"+str(epoch)+".txt"),"w") as f: for input_ids, input_mask, segment_ids, label_ids, dataset_label_id in eval_dataloader_4: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, dataset_label_id) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() outputs = np.argmax(logits, axis=1) for output in outputs: f.write(str(output)+"\n") tmp_eval_accuracy=np.sum(outputs == label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples eval_loss_dbpedia = eval_loss eval_accuracy_dbpedia = eval_accuracy result = {'eval_loss_imdb': eval_loss_imdb, 'eval_accuracy_imdb': eval_accuracy_imdb, 'eval_loss_yelp_p': eval_loss_yelp_p, 'eval_accuracy_yelp_p': eval_accuracy_yelp_p, 'eval_loss_ag': eval_loss_ag, 'eval_accuracy_ag': eval_accuracy_ag, 'eval_loss_dbpedia': eval_loss_dbpedia, 'eval_accuracy_dbpedia': eval_accuracy_dbpedia, 'global_step': global_step, 'loss': tr_loss/nb_tr_steps} output_eval_file = os.path.join(args.output_dir, "eval_results_ep_"+str(epoch)+".txt") print("output_eval_file=",output_eval_file) with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--task_name", default=None, type=str, required=True, choices=["sentihood_single", "sentihood_NLI_M", "sentihood_QA_M", \ "sentihood_NLI_B", "sentihood_QA_B", "semeval_single", \ "semeval_NLI_M", "semeval_QA_M", "semeval_NLI_B", "semeval_QA_B"], help="The name of the task to train.") parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument( "--bert_config_file", default=None, type=str, required=True, help= "The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) parser.add_argument( "--init_checkpoint", default=None, type=str, required=True, help="Initial checkpoint (usually from a pre-trained BERT model).") ## Other parameters parser.add_argument("--eval_test", default=False, action='store_true', help="Whether to run eval on the test set.") parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=4, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--accumulate_gradients", type=int, default=1, help= "Number of steps to accumulate gradient on (divide the batch_size and accumulate)" ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." 
) args = parser.parse_args() #--task_name sentihood_NLI_M \ #--data_dir data/sentihood/bert-pair/ \ #--vocab_file uncased_L-12_H-768_A-12/vocab.txt \ #--bert_config_file uncased_L-12_H-768_A-12/bert_config.json \ #--init_checkpoint uncased_L-12_H-768_A-12/pytorch_model.bin \ #--eval_test \ #--do_lower_case \ #--max_seq_length 128 \ #--train_batch_size 2 \ #--learning_rate 2e-5 \ #--num_train_epochs 6.0 \ #--output_dir senti_results/sentihood/NLI_M \ #--seed 42 if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.accumulate_gradients < 1: raise ValueError( "Invalid accumulate_gradients parameter: {}, should be >= 1". format(args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) #bert_config = BertConfig.from_json_file(args.bert_config_file) #if args.max_seq_length > bert_config.max_position_embeddings: # raise ValueError( # "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format( # args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) # prepare dataloaders processors = { "sentihood_single": Sentihood_single_Processor, "sentihood_NLI_M": Sentihood_NLI_M_Processor, "sentihood_QA_M": Sentihood_QA_M_Processor, "sentihood_NLI_B": Sentihood_NLI_B_Processor, "sentihood_QA_B": Sentihood_QA_B_Processor, "semeval_single": Semeval_single_Processor, "semeval_NLI_M": Semeval_NLI_M_Processor, "semeval_QA_M": Semeval_QA_M_Processor, "semeval_NLI_B": Semeval_NLI_B_Processor, "semeval_QA_B": Semeval_QA_B_Processor, } processor = processors[args.task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) # training set train_examples = None num_train_steps = None train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) train_features = convert_examples_to_features_sentihood( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) # num_train_steps has already been multiplied by the number of epochs/iterations all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all__detection_label_ids = torch.tensor( [f.label_exist_id for f in train_features], dtype=torch.long) all_aspects = torch.tensor([f.aspect for f in train_features],
dtype=torch.long) all_pos = torch.tensor([f.my_pos for f in train_features], dtype=torch.long) all_dis = torch.tensor([f.dis for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all__detection_label_ids, all_aspects, all_pos, all_dis) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) # test set if args.eval_test: test_examples = processor.get_test_examples(args.data_dir) test_features = convert_examples_to_features_sentihood( test_examples, label_list, args.max_seq_length, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long) all_detection_label_ids = torch.tensor( [f.label_exist_id for f in test_features], dtype=torch.long) all_aspects = torch.tensor([f.aspect for f in test_features], dtype=torch.long) all_pos = torch.tensor([f.my_pos for f in test_features], dtype=torch.long) all_dis = torch.tensor([f.dis for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_detection_label_ids, all_aspects, all_pos, all_dis) test_dataloader = DataLoader(test_data, batch_size=args.eval_batch_size, shuffle=False) # model and optimizer model = BertForSequenceClassification(len(label_list)) # model = torch.load("model_data/attention_add_model") #if args.init_checkpoint is not None: # model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [ { 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.01 }, #xiaobuxing { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.0 } ] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) #optimizer = transformers.AdamW(optimizer_parameters) # train output_log_file = os.path.join(args.output_dir, "log.txt") print("output_log_file=", output_log_file) with open(output_log_file, "w") as writer: if args.eval_test: writer.write( "epoch\tglobal_step\tloss\ttest_loss\ttest_accuracy\n") else: writer.write("epoch\tglobal_step\tloss\n") global_step = 0 epoch = 0 # model = torch.load("model_data/attention_add_model6") #40 for _ in trange(int(args.num_train_epochs), desc="Epoch"): epoch += 1 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) # print() input_ids, input_mask, segment_ids, label_ids, exist_ids, all_aspects, pos, dis = batch loss, _, _ = model(input_ids, segment_ids, input_mask, label_ids, exist_ids, all_aspects, pos, dis) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients model.zero_grad() global_step += 1 # break torch.save(model, "model_data/attention_add_model" + str(epoch)) #eval_test if args.eval_test: model.eval() test_loss, test_accuracy, senti_test_accuracy = 0, 0, 0 nb_test_steps, nb_test_examples, senti_nb_test_examples = 0, 0, 0 with open( os.path.join(args.output_dir, "test_ep_" + str(epoch) + ".txt"), "w") as f_test: for input_ids, input_mask, segment_ids, label_ids, exist_ids, all_aspects, pos, dis in test_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) exist_ids = exist_ids.to(device) all_aspects = all_aspects.to(device) dis = dis.to(device) pos = pos.to(device) with torch.no_grad(): tmp_test_loss, detect_logits, sent_logits = model( input_ids, segment_ids, input_mask, label_ids, exist_ids, all_aspects, pos, dis) detect_logits = F.softmax(detect_logits, dim=-1) detect_logits = detect_logits.detach().cpu().numpy() exist_ids = exist_ids.to('cpu').numpy() outputs = np.argmax(detect_logits, axis=1) for output_i in range(len(outputs)): f_test.write(str(outputs[output_i])) for ou in detect_logits[output_i]: f_test.write(" " + str(ou)) f_test.write("\n") tmp_test_accuracy = np.sum(outputs == exist_ids) test_loss += tmp_test_loss.mean().item() test_accuracy += tmp_test_accuracy sent_logits = F.softmax(sent_logits, dim=-1) sent_logits = sent_logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() senti_outputs = np.argmax(sent_logits, axis=1) for output_i in range(len(senti_outputs)): f_test.write(str(senti_outputs[output_i])) for ou in sent_logits[output_i]: f_test.write(" " + str(ou)) f_test.write("\n") senti_tmp_test_accuracy = np.sum( senti_outputs == label_ids) senti_test_accuracy += senti_tmp_test_accuracy nb_test_examples += input_ids.size(0) senti_nb_test_examples += np.sum(exist_ids) nb_test_steps += 1 test_loss = test_loss / nb_test_steps test_accuracy = test_accuracy / nb_test_examples senti_accuracy = senti_test_accuracy / senti_nb_test_examples result = collections.OrderedDict() if args.eval_test: result = { 'epoch': epoch, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps, 'test_loss': test_loss, 'test_accuracy': test_accuracy, 'senti_test_accuracy': senti_accuracy, } else: result = { 'epoch': epoch, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps } logger.info("***** Eval results *****") with open(output_log_file, "a+") as writer: for key in result.keys(): logger.info(" %s = %s\n", key, str(result[key])) writer.write("%s\t" % (str(result[key]))) writer.write("\n")
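# NOTE (editor sketch): `torch.save(model, "model_data/attention_add_model" + str(epoch))`
# above pickles the entire nn.Module (including the DataParallel/DistributedDataParallel
# wrapper when one is used), which ties the checkpoint to the exact class layout and device
# setup. A more portable pattern, assuming the same model and an output directory variable,
# is to save only the state_dict of the unwrapped model; the helper and file name here are
# illustrative, not part of the original script.
import os
import torch

def save_checkpoint(model, output_dir, epoch):
    to_save = model.module if hasattr(model, "module") else model  # unwrap (D)DP if present
    path = os.path.join(output_dir, "attention_add_model_ep{}.bin".format(epoch))
    torch.save(to_save.state_dict(), path)
    return path
# To resume, rebuild the model with the same constructor arguments and then:
#     model.load_state_dict(torch.load(path, map_location="cpu"))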
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument('--mode', type=str, default='train') parser.add_argument('--pause', type=int, default=0) parser.add_argument('--iteration', type=str, default='1') parser.add_argument('--fs', type=str, default='local', help='must be `local`. Do not change.') # Data paths parser.add_argument('--data_dir', default='data/', type=str) parser.add_argument("--train_file", default='train-v1.1.json', type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument("--predict_file", default='dev-v1.1.json', type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument('--gt_file', default='dev-v1.1.json', type=str, help='ground truth file needed for evaluation.') # Metadata paths parser.add_argument('--metadata_dir', default='metadata/', type=str) parser.add_argument("--vocab_file", default='vocab.txt', type=str, help="The vocabulary file that the BERT model was trained on.") parser.add_argument("--bert_model_option", default='large_uncased', type=str, help="model architecture option. [large_uncased] or [base_uncased]") parser.add_argument("--bert_config_file", default='bert_config.json', type=str, help="The config json file corresponding to the pre-trained BERT model. " "This specifies the model architecture.") parser.add_argument("--init_checkpoint", default='pytorch_model.bin', type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") # Output and load paths parser.add_argument("--output_dir", default='out/', type=str, help="The output directory where the model checkpoints will be written.") parser.add_argument("--index_file", default='index.hdf5', type=str, help="index output file.") parser.add_argument("--question_emb_file", default='question.hdf5', type=str, help="question output file.") parser.add_argument('--load_dir', default='out/', type=str) # Local paths (if we want to run cmd) parser.add_argument('--eval_script', default='evaluate-v1.1.py', type=str) # Do's parser.add_argument("--do_load", default=False, action='store_true', help='Do load. If eval, do load automatically') parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_train_filter", default=False, action='store_true', help='Train filter or not.') parser.add_argument("--do_train_sparse", default=False, action='store_true', help='Train sparse or not.') parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument('--do_eval', default=False, action='store_true') parser.add_argument('--do_embed_question', default=False, action='store_true') parser.add_argument('--do_index', default=False, action='store_true') parser.add_argument('--do_serve', default=False, action='store_true') # Model options: if you change these, you need to train again parser.add_argument("--do_case", default=False, action='store_true', help="Whether to lower case the input text. Should be True for uncased " "models and False for cased models.") parser.add_argument('--phrase_size', default=961, type=int) parser.add_argument('--metric', default='ip', type=str, help='ip | l2') parser.add_argument("--use_sparse", default=False, action='store_true') # GPU and memory related options parser.add_argument("--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. 
Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--doc_stride", default=128, type=int, help="When splitting up a long document into chunks, how much stride to take between chunks.") parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--train_batch_size", default=12, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=16, type=int, help="Total batch size for predictions.") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--optimize_on_cpu', default=False, action='store_true', help="Whether to perform optimization and keep the optimizer averages on CPU") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") # Training options: only effective during training parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--num_train_filter_epochs", default=1.0, type=float, help="Total number of training epochs for filter to perform.") parser.add_argument("--num_train_sparse_epochs", default=3.0, type=float, help="Total number of training epochs for sparse to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--iterations_per_loop", default=1000, type=int, help="How many steps to make in each estimator call.") # Prediction options: only effective during prediction parser.add_argument("--n_best_size", default=20, type=int, help="The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument("--max_answer_length", default=30, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") # Index Options parser.add_argument('--dtype', default='float32', type=str) parser.add_argument('--filter_threshold', default=-1e9, type=float) parser.add_argument('--compression_offset', default=-2, type=float) parser.add_argument('--compression_scale', default=20, type=float) parser.add_argument('--split_by_para', default=False, action='store_true') # Serve Options parser.add_argument('--port', default=9009, type=int) # Others parser.add_argument('--parallel', default=False, action='store_true') parser.add_argument("--verbose_logging", default=False, action='store_true', help="If true, all of the warnings related to data processing will be printed. 
" "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--draft', default=False, action='store_true') parser.add_argument('--draft_num_examples', type=int, default=12) args = parser.parse_args() # Filesystem routines if args.fs == 'local': class Processor(object): def __init__(self, path): self._save = None self._load = None self._path = path def bind(self, save, load): self._save = save self._load = load def save(self, checkpoint=None, save_fn=None, **kwargs): path = os.path.join(self._path, str(checkpoint)) if save_fn is None: self._save(path, **kwargs) else: save_fn(path, **kwargs) def load(self, checkpoint, load_fn=None, session=None, **kwargs): assert self._path == session path = os.path.join(self._path, str(checkpoint), 'model.pt') if load_fn is None: self._load(path, **kwargs) else: load_fn(path, **kwargs) processor = Processor(args.load_dir) else: raise ValueError(args.fs) if not args.do_train: args.do_load = True # Configure paths args.train_file = os.path.join(args.data_dir, args.train_file) args.predict_file = os.path.join(args.data_dir, args.predict_file) args.gt_file = os.path.join(args.data_dir, args.gt_file) args.bert_config_file = os.path.join(args.metadata_dir, args.bert_config_file.replace(".json", "") + "_" + args.bert_model_option + ".json") args.init_checkpoint = os.path.join(args.metadata_dir, args.init_checkpoint.replace(".bin", "") + "_" + args.bert_model_option + ".bin") args.vocab_file = os.path.join(args.metadata_dir, args.vocab_file) args.index_file = os.path.join(args.output_dir, args.index_file) # Multi-GPU stuff if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) # Seed for reproducibility random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory () already exists and is not empty.") pass else: os.makedirs(args.output_dir, exist_ok=True) tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=not args.do_case) model = BertPhraseModel( bert_config, phrase_size=args.phrase_size, metric=args.metric, use_sparse=args.use_sparse ) print('Number of model parameters:', sum(p.numel() for p in model.parameters())) if not args.do_load and args.init_checkpoint is not None: state_dict = torch.load(args.init_checkpoint, map_location='cpu') # If below: for Korean BERT compatibility if 
next(iter(state_dict)).startswith('bert.'): state_dict = {key[len('bert.'):]: val for key, val in state_dict.items()} state_dict = {key: val for key, val in state_dict.items() if key in model.encoder.bert_model.state_dict()} model.encoder.bert.load_state_dict(state_dict) if args.fp16: model.half() if not args.optimize_on_cpu: model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) elif args.parallel or n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_load: bind_model(processor, model) processor.load(args.iteration, session=args.load_dir) if args.do_train: train_examples = read_squad_examples( input_file=args.train_file, is_training=True, draft=args.draft, draft_num_examples=args.draft_num_examples) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [ {'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01}, {'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0} ] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) bind_model(processor, model, optimizer) global_step = 0 train_features, train_features_ = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) train_features = inject_noise_to_features_list(train_features, clamp=True, replace=True, shuffle=True) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) all_input_ids_ = torch.tensor([f.input_ids for f in train_features_], dtype=torch.long) all_input_mask_ = torch.tensor([f.input_mask for f in train_features_], dtype=torch.long) if args.fp16: (all_input_ids, all_input_mask, all_start_positions, all_end_positions) = tuple(t.half() for t in (all_input_ids, all_input_mask, all_start_positions, all_end_positions)) all_input_ids_, all_input_mask_ = tuple(t.half() for t in (all_input_ids_, all_input_mask_)) train_data = TensorDataset(all_input_ids, all_input_mask, all_input_ids_, all_input_mask_, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for epoch in range(int(args.num_train_epochs)): for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch %d" % (epoch + 1))): batch = tuple(t.to(device) for t in batch) (input_ids, input_mask, input_ids_, input_mask_, start_positions, end_positions) = batch loss, _ = model(input_ids, input_mask, input_ids_, input_mask_, start_positions, end_positions) if n_gpu > 1: 
loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.optimize_on_cpu: model.to('cpu') optimizer.step() # We have accumulated enought gradients model.zero_grad() if args.optimize_on_cpu: model.to(device) global_step += 1 processor.save(epoch + 1) if args.do_train_filter: train_examples = read_squad_examples( input_file=args.train_file, is_training=True, draft=args.draft, draft_num_examples=args.draft_num_examples) num_train_steps = int( len( train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_filter_epochs) if args.parallel or n_gpu > 1: optimizer = Adam(model.module.filter.parameters()) else: optimizer = Adam(model.filter.parameters()) bind_model(processor, model, optimizer) global_step = 0 train_features, train_features_ = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) logger.info("***** Running filter training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) all_input_ids_ = torch.tensor([f.input_ids for f in train_features_], dtype=torch.long) all_input_mask_ = torch.tensor([f.input_mask for f in train_features_], dtype=torch.long) if args.fp16: (all_input_ids, all_input_mask, all_start_positions, all_end_positions) = tuple(t.half() for t in (all_input_ids, all_input_mask, all_start_positions, all_end_positions)) all_input_ids_, all_input_mask_ = tuple(t.half() for t in (all_input_ids_, all_input_mask_)) train_data = TensorDataset(all_input_ids, all_input_mask, all_input_ids_, all_input_mask_, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for epoch in range(int(args.num_train_filter_epochs)): for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch %d" % (epoch + 1))): batch = tuple(t.to(device) for t in batch) (input_ids, input_mask, input_ids_, input_mask_, start_positions, end_positions) = batch _, loss = model(input_ids, input_mask, input_ids_, input_mask_, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.optimize_on_cpu: model.to('cpu') optimizer.step() # We have accumulated enought gradients model.zero_grad() if args.optimize_on_cpu: model.to(device) global_step += 1 processor.save(epoch + 1) if args.do_train_sparse: train_examples = read_squad_examples( input_file=args.train_file, is_training=True, draft=args.draft, draft_num_examples=args.draft_num_examples) num_train_steps = int( len( train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_sparse_epochs) ''' if args.parallel or n_gpu > 1: optimizer = Adam(model.module.sparse_layer.parameters()) else: optimizer = Adam(model.sparse_layer.parameters()) ''' no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [ {'params': [p for n, p in model.named_parameters() if (n not in no_decay) and ('filter' not in n)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in model.named_parameters() if (n in no_decay) and ('filter' not in n)], 'weight_decay_rate': 0.0} ] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) bind_model(processor, model, optimizer) global_step = 0 train_features, train_features_ = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) logger.info("***** Running sparse training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) all_input_ids_ = torch.tensor([f.input_ids for f in train_features_], dtype=torch.long) all_input_mask_ = torch.tensor([f.input_mask for f in train_features_], dtype=torch.long) if args.fp16: (all_input_ids, all_input_mask, all_start_positions, all_end_positions) = tuple(t.half() for t in (all_input_ids, all_input_mask, all_start_positions, all_end_positions)) all_input_ids_, all_input_mask_ = tuple(t.half() for t in (all_input_ids_, all_input_mask_)) train_data = TensorDataset(all_input_ids, all_input_mask, all_input_ids_, all_input_mask_, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for epoch in range(int(args.num_train_sparse_epochs)): for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch %d" % (epoch + 1))): batch = tuple(t.to(device) for t in batch) (input_ids, input_mask, input_ids_, input_mask_, start_positions, end_positions) = batch loss, _ = model(input_ids, input_mask, input_ids_, input_mask_, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.optimize_on_cpu: model.to('cpu') optimizer.step() # We have accumulated enought gradients model.zero_grad() if args.optimize_on_cpu: model.to(device) global_step += 1 processor.save(epoch + 1) if args.do_predict: eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, draft=args.draft, draft_num_examples=args.draft_num_examples) eval_features, query_eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_input_ids_ = torch.tensor([f.input_ids for f in query_eval_features], dtype=torch.long) all_input_mask_ = torch.tensor([f.input_mask for f in query_eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) if args.fp16: (all_input_ids, all_input_mask, all_example_index) = tuple(t.half() for t in (all_input_ids, all_input_mask, all_example_index)) all_input_ids_, all_input_mask_ = tuple(t.half() for t in (all_input_ids_, all_input_mask_)) eval_data = TensorDataset(all_input_ids, all_input_mask, all_input_ids_, all_input_mask_, all_example_index) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() logger.info("Start evaluating") def get_results(): for (input_ids, input_mask, input_ids_, input_mask_, example_indices) in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) input_ids_ = input_ids_.to(device) input_mask_ = input_mask_.to(device) with torch.no_grad(): batch_all_logits, bs, be = model(input_ids, input_mask, input_ids_, input_mask_) for i, example_index in enumerate(example_indices): all_logits = batch_all_logits[i].detach().cpu().numpy() filter_start_logits = bs[i].detach().cpu().numpy() filter_end_logits = be[i].detach().cpu().numpy() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) yield RawResult(unique_id=unique_id, all_logits=all_logits, filter_start_logits=filter_start_logits, filter_end_logits=filter_end_logits) output_prediction_file = os.path.join(args.output_dir, "predictions.json") write_predictions(eval_examples, eval_features, get_results(), args.max_answer_length, not args.do_case, output_prediction_file, args.verbose_logging, args.filter_threshold) if args.do_eval: command = "python %s %s %s" % (args.eval_script, args.gt_file, output_prediction_file) import subprocess process = subprocess.Popen(command.split(), stdout=subprocess.PIPE) output, error = process.communicate() if args.do_embed_question: question_examples = read_squad_examples( question_only=True, input_file=args.predict_file, is_training=False, draft=args.draft, draft_num_examples=args.draft_num_examples) query_eval_features = convert_questions_to_features( 
examples=question_examples, tokenizer=tokenizer, max_query_length=args.max_query_length) question_dataloader = convert_question_features_to_dataloader(query_eval_features, args.fp16, args.local_rank, args.predict_batch_size) model.eval() logger.info("Start embedding") question_results = get_question_results_(question_examples, query_eval_features, question_dataloader, device, model) path = os.path.join(args.output_dir, args.question_emb_file) print('Writing %s' % path) write_question_results(question_results, query_eval_features, path) if args.do_index: if ':' not in args.predict_file: predict_files = [args.predict_file] offsets = [0] else: dirname = os.path.dirname(args.predict_file) basename = os.path.basename(args.predict_file) start, end = list(map(int, basename.split(':'))) # skip files if possible if os.path.exists(args.index_file): with h5py.File(args.index_file, 'r') as f: dids = list(map(int, f.keys())) start = int(max(dids) / 1000) print('%s exists; starting from %d' % (args.index_file, start)) names = [str(i).zfill(4) for i in range(start, end)] predict_files = [os.path.join(dirname, name) for name in names] offsets = [int(each) * 1000 for each in names] for offset, predict_file in zip(offsets, predict_files): try: context_examples = read_squad_examples( context_only=True, input_file=predict_file, is_training=False, draft=args.draft, draft_num_examples=args.draft_num_examples) for example in context_examples: example.doc_idx += offset context_features = convert_documents_to_features( examples=context_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride) logger.info("***** Running indexing on %s *****" % predict_file) logger.info(" Num orig examples = %d", len(context_examples)) logger.info(" Num split examples = %d", len(context_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in context_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in context_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) if args.fp16: all_input_ids, all_input_mask, all_example_index = tuple( t.half() for t in (all_input_ids, all_input_mask, all_example_index)) context_data = TensorDataset(all_input_ids, all_input_mask, all_example_index) if args.local_rank == -1: context_sampler = SequentialSampler(context_data) else: context_sampler = DistributedSampler(context_data) context_dataloader = DataLoader(context_data, sampler=context_sampler, batch_size=args.predict_batch_size) model.eval() logger.info("Start indexing") def get_context_results(): for (input_ids, input_mask, example_indices) in context_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) with torch.no_grad(): batch_start, batch_end, batch_span_logits, bs, be, batch_sparse = model(input_ids, input_mask) for i, example_index in enumerate(example_indices): start = batch_start[i].detach().cpu().numpy().astype(args.dtype) end = batch_end[i].detach().cpu().numpy().astype(args.dtype) sparse = None if batch_sparse is not None: sparse = batch_sparse[i].detach().cpu().numpy().astype(args.dtype) span_logits = batch_span_logits[i].detach().cpu().numpy().astype(args.dtype) filter_start_logits = bs[i].detach().cpu().numpy().astype(args.dtype) filter_end_logits = be[i].detach().cpu().numpy().astype(args.dtype) context_feature = context_features[example_index.item()] unique_id = int(context_feature.unique_id) yield 
ContextResult(unique_id=unique_id, start=start, end=end, span_logits=span_logits, filter_start_logits=filter_start_logits, filter_end_logits=filter_end_logits, sparse=sparse) t0 = time() write_hdf5(context_examples, context_features, get_context_results(), args.max_answer_length, not args.do_case, args.index_file, args.filter_threshold, args.verbose_logging, offset=args.compression_offset, scale=args.compression_scale, split_by_para=args.split_by_para, use_sparse=args.use_sparse) print('%s: %.1f mins' % (predict_file, (time() - t0) / 60)) except Exception as e: with open(os.path.join(args.output_dir, 'error_files.txt'), 'a') as fp: fp.write('error file: %s\n' % predict_file) fp.write('error message: %s\n' % str(e)) if args.do_serve: def get(text): question_examples = [SquadExample(qas_id='serve', question_text=text)] query_eval_features = convert_questions_to_features( examples=question_examples, tokenizer=tokenizer, max_query_length=16) question_dataloader = convert_question_features_to_dataloader(query_eval_features, args.fp16, args.local_rank, args.predict_batch_size) model.eval() question_results = get_question_results_(question_examples, query_eval_features, question_dataloader, device, model) question_result = next(iter(question_results)) out = question_result.start.tolist(), question_result.end.tolist(), question_result.span_logit.tolist() return out serve(get, args.port)
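# NOTE (editor sketch): the init_checkpoint loading earlier in this script strips an optional
# 'bert.' prefix from the checkpoint keys and keeps only the keys the target module knows
# about before calling load_state_dict. A small reusable version of that pattern is sketched
# here; the helper name and the strict=False / missing-keys reporting are illustrative
# additions, not part of the original script.
import torch

def load_partial_state_dict(module, checkpoint_path, prefix="bert."):
    state_dict = torch.load(checkpoint_path, map_location="cpu")
    # strip a wrapper prefix, e.g. 'bert.encoder.layer.0....' -> 'encoder.layer.0....'
    if next(iter(state_dict)).startswith(prefix):
        state_dict = {key[len(prefix):]: value for key, value in state_dict.items()}
    # keep only the keys the target module actually has
    own_keys = set(module.state_dict().keys())
    filtered = {key: value for key, value in state_dict.items() if key in own_keys}
    module.load_state_dict(filtered, strict=False)
    return own_keys - set(filtered.keys())  # parameters left at their initial values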
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--fine_tune_data_1_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--fine_tune_data_2_dir", default=None, type=str, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--eval_data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_config_file", default=None, type=str, required=True, help= "The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument( "--do_lower_case", default=False, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--discr", default=False, action='store_true', help="Whether to do discriminative fine-tuning.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--accumulate_gradients", type=int, default=1, help= "Number of steps to accumulate gradient on (divide the batch_size and accumulate)" ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." 
) parser.add_argument("--frozen_bert", default=False, action='store_true', help="frozen the gradient of bert encoder") parser.add_argument( '--layers', type=int, nargs='+', default=[-2], help="choose the layers that used for downstream tasks, " "-2 means use pooled output, -1 means all layer," "else means the detail layers. default is -2") parser.add_argument('--num_datas', default=None, type=int, help="the number of data examples") parser.add_argument('--num_test_datas', default=None, type=int, help="the number of data examples") parser.add_argument('--pooling_type', default=None, type=str, choices=[None, 'mean', 'max']) parser.add_argument( '--trunc_medium', type=int, default=-2, help="choose the trunc ways, -2 means choose the first seq_len tokens, " "-1 means choose the last seq_len tokens, " "0 means choose the first (seq_len // 2) and the last(seq_len // 2). " "other positive numbers k mean the first k tokens " "and the last (seq_len - k) tokens") args = parser.parse_args() processors = { "ag": AGNewsProcessor, "ag_sep": AGNewsProcessor_sep, "ag_sep_aug": AGNewsProcessor_sep_aug, "binary": BinaryProcessor, "imdb": IMDBProcessor, "imdb_t_m": IMDBProcessor_trunc_medium, "imdb_sep": IMDBProcessor_sep, "imdb_sep_aug": IMDBProcessor_sep_aug, #"yelp_p": Yelp_p_Processor, #"yelp_f": Yelp_f_Processor, #"yahoo": Yahoo_Processor, "trec": Trec_Processor, #"dbpedia":Dbpedia_Processor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.accumulate_gradients < 1: raise ValueError( "Invalid accumulate_gradients parameter: {}, should be >= 1". 
format(args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) model = BertForSequenceClassification(bert_config, len(label_list), args.layers, pooling=args.pooling_type) if args.init_checkpoint is not None: model.bert.load_state_dict( torch.load(args.init_checkpoint, map_location='cpu')) if args.frozen_bert: for p in model.bert.parameters(): p.requires_grad = False model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ['bias', 'gamma', 'beta'] if args.discr: group1 = ['layer.0.', 'layer.1.', 'layer.2.', 'layer.3.'] group2 = ['layer.4.', 'layer.5.', 'layer.6.', 'layer.7.'] group3 = ['layer.8.', 'layer.9.', 'layer.10.', 'layer.11.'] group_all = [ 'layer.0.', 'layer.1.', 'layer.2.', 'layer.3.', 'layer.4.', 'layer.5.', 'layer.6.', 'layer.7.', 'layer.8.', 'layer.9.', 'layer.10.', 'layer.11.' 
] optimizer_parameters = [ { 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and not any( nd in n for nd in group_all) ], 'weight_decay_rate': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1) ], 'weight_decay_rate': 0.01, 'lr': args.learning_rate / 2.6 }, { 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2) ], 'weight_decay_rate': 0.01, 'lr': args.learning_rate }, { 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3) ], 'weight_decay_rate': 0.01, 'lr': args.learning_rate * 2.6 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all) ], 'weight_decay_rate': 0.0 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1) ], 'weight_decay_rate': 0.0, 'lr': args.learning_rate / 2.6 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2) ], 'weight_decay_rate': 0.0, 'lr': args.learning_rate }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3) ], 'weight_decay_rate': 0.0, 'lr': args.learning_rate * 2.6 }, ] else: optimizer_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.0 }] data_dir = args.fine_tune_data_1_dir global_step = 0 print("Initiate Training Data 1") train_examples = None num_train_steps = None train_examples = processor.get_train_examples(data_dir, data_num=args.num_datas) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, trunc_medium=args.trunc_medium) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) print("Train 1") epoch = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): epoch += 1 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, 
label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients model.zero_grad() global_step += 1 if args.fine_tune_data_2_dir: data_dir = args.fine_tune_data_2_dir print("Initiate Training Data 2") train_examples = None num_train_steps = None train_examples = processor.get_train_examples(data_dir, data_num=args.num_datas) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) optimizer_2 = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, trunc_medium=args.trunc_medium) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader_2 = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) print("Train 2") epoch = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): epoch += 1 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader_2, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer_2.step() # We have accumulated enought gradients model.zero_grad() global_step += 1 print("Initiate Eval Data") data_dir = args.eval_data_dir eval_examples = processor.get_dev_examples(data_dir, data_num=args.num_test_datas) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, trunc_medium=args.trunc_medium) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size, shuffle=False) print("Eval Data") epoch = 0 model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 with open(os.path.join(args.output_dir, "results_data.txt"), "w") as f: for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluate"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() outputs = np.argmax(logits, axis=1) for output in outputs: f.write(str(output) + "\n") tmp_eval_accuracy = np.sum(outputs == label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy} output_eval_file = os.path.join(args.output_dir, "eval_data_results.txt") print("output_eval_file=", output_eval_file) with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
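# NOTE (editor sketch): the `--discr` branch earlier in this script enumerates eight parameter
# groups by hand (three layer groups x {decay, no-decay} plus the remaining parameters) to give
# lower BERT layers a smaller learning rate and higher layers a larger one. The same layer-wise
# learning-rate scheme (boundaries at layers 0-3 / 4-7 / 8-11 and the 2.6 factor mirror the
# code above) can be built programmatically; the helper name below is illustrative.
from collections import defaultdict

def build_discriminative_groups(model, base_lr, no_decay=('bias', 'gamma', 'beta')):
    low = ['layer.%d.' % i for i in range(0, 4)]
    high = ['layer.%d.' % i for i in range(8, 12)]
    buckets = defaultdict(list)  # (lr, weight_decay_rate) -> list of parameters
    for name, param in model.named_parameters():
        if any(g in name for g in low):
            lr = base_lr / 2.6
        elif any(g in name for g in high):
            lr = base_lr * 2.6
        else:  # middle layers and all non-encoder parameters
            lr = base_lr
        wd = 0.0 if any(nd in name for nd in no_decay) else 0.01
        buckets[(lr, wd)].append(param)
    return [{'params': params, 'lr': lr, 'weight_decay_rate': wd}
            for (lr, wd), params in buckets.items()]
# Usage (mirrors the optimizer construction above):
#     optimizer = BERTAdam(build_discriminative_groups(model, args.learning_rate),
#                          lr=args.learning_rate, warmup=args.warmup_proportion,
#                          t_total=num_train_steps)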
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--bert_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained BERT model. " "This specifies the model architecture.") parser.add_argument("--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument("--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument("--init_checkpoint", default=None, type=str, help="Init from checkpoint") parser.add_argument("--init_full_model", default=None, type=str, help="Initial full model") parser.add_argument("--do_lower_case", default=True, action='store_true', help="Whether to lower case the input text. Should be True for uncased " "models and False for cased models.") parser.add_argument("--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--doc_stride", default=128, type=int, help="When splitting up a long document into chunks, how much stride to take between chunks.") parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.0, type=float, help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument("--save_checkpoints_steps", default=2000, type=int, help="How often to save the model checkpoint") parser.add_argument("--iterations_per_loop", default=1000, type=int, help="How many steps to make in each estimator call.") parser.add_argument("--n_best_size", default=3, type=int, help="The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument("--max_answer_length", default=100, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--use_history", default=False, action='store_true', help="Use history features") parser.add_argument("--verbose_logging", default=False, action='store_true', help="If true, all of the warnings related to data processing will be printed. 
" "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=1, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--optimize_on_cpu', default=False, action='store_true', help="Whether to perform optimization and keep the optimizer averages on CPU") parser.add_argument('--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=128, help='Loss scaling, positive power of 2 values can improve fp16 convergence.') parser.add_argument('--dry_run', action='store_true', default=False, help='Don\'t load model, just load data') args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info("16-bits training currently not supported in distributed training") args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError("At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified.") bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): print("Warning: output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) tokenizer = tokenization.FullTokenizer( vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = bdu.read_coqa_examples( input_file=args.train_file, is_training=True) real_train_example_len = sum(len(ex['questions']) for ex in train_examples) num_train_steps = int( real_train_example_len / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model if not args.dry_run: if 
args.init_full_model is not None: model_state_dict = torch.load(args.init_full_model) model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict) else: model = BertForQuestionAnswering(bert_config, use_history=args.use_history) if args.init_checkpoint is not None: model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')) if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] optimizer = BERTAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 if args.do_train: train_features = bdu.convert_coqa_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", real_train_example_len) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) all_f_history = torch.tensor([f.f_history for f in train_features], dtype=torch.uint8) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions, all_f_history) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"): running_loss = [] for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions, f_history = batch # Convert to float here. f_history_32 = f_history.float() loss = model(input_ids, segment_ids, input_mask, start_positions=start_positions, end_positions=end_positions, f_history=f_history_32, debug=True) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps running_loss.append(loss.item()) if step % 40 == 0: logger.info("epoch {} step {}: avg loss {}".format(epoch_i, step, sum(running_loss) / len(running_loss))) running_loss = [] loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling") args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model(model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 if global_step % (args.save_checkpoints_steps // args.train_batch_size) == 0: model_name = os.path.join(args.output_dir, 'model-{}.pth'.format(global_step)) # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself print("Step {}: saving model to {}".format(global_step, model_name)) torch.save(model_to_save.state_dict(), model_name) model_name = os.path.join(args.output_dir, 'model-{}.pth'.format(global_step)) print("Step {}: saving model to {}".format(global_step, model_name)) model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself torch.save(model_to_save.state_dict(), model_name) if args.do_predict: eval_examples = bdu.read_coqa_examples( input_file=args.predict_file, is_training=False) eval_features = bdu.convert_coqa_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) all_f_history = torch.tensor([f.f_history for f in eval_features], dtype=torch.uint8) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index, all_f_history) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices, f_history in tqdm(eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) f_history = f_history.to(device) with torch.no_grad(): f_history_32 = 
f_history.float() batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask, f_history=f_history_32) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append(RawResult(unique_id=unique_id, coqa_id=eval_feature.coqa_id, turn_id=eval_feature.turn_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") bdu.write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
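# --- Illustrative sketch (not part of the original script) ---
# bdu.write_predictions (defined elsewhere) turns the per-token start/end logits collected
# in all_results into text answers. A much-simplified sketch of the core idea: score every
# candidate span with start_logit + end_logit, keep only spans with start <= end and a
# bounded length, and return the best one. `max_answer_length` mirrors the CLI flag above;
# the helper itself is hypothetical and ignores n-best lists, null answers, and detokenization.
def best_span(start_logits, end_logits, max_answer_length=30):
    best, best_score = (0, 0), float("-inf")
    for start, start_logit in enumerate(start_logits):
        end_limit = min(start + max_answer_length, len(end_logits))
        for end in range(start, end_limit):
            score = start_logit + end_logits[end]
            if score > best_score:
                best_score, best = score, (start, end)
    return best, best_score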
def main(args): if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.accumulate_gradients < 1: raise ValueError( "Invalid accumulate_gradients parameter: {}, should be >= 1". format(args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) bert_config_file = "distilbert_config.json" bert_config = BertConfig.from_json_file(bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(args.max_seq_length, bert_config.max_position_embeddings)) # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # shutil.rmtree(args.output_dir) # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) # prepare dataloaders processor = Sentihood_QA_M_Processor() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) # training set num_train_steps = None train_data, len_train = get_data(processor, label_list, tokenizer, args.data_dir, 'train') if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_steps = int(len_train / args.train_batch_size * args.num_train_epochs) logger.info("***** Running training *****") logger.info(" Num Training examples = %d", len_train) logger.info(" Training Batch size = %d", args.train_batch_size) logger.info(" Training Num steps = %d", num_train_steps) # test set if args.eval_test: test_data, len_test = get_data(processor, label_list, tokenizer, args.data_dir, 'test') test_dataloader = DataLoader(test_data, batch_size=args.eval_batch_size, shuffle=False) logger.info(" Num Test examples = %d", len_test) # model and optimizer #model = BertForSequenceClassification(bert_config, len(label_list)) #if args.init_checkpoint is not None: # model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')) model = BertBinaryClassifier(len(label_list)) if args.init_checkpoint: logger.info("Loading model from checkpoint %s", args.init_checkpoint) model.load_state_dict( torch.load(args.init_checkpoint, map_location='cpu')) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.0 }] optimizer = BERTAdam(optimizer_parameters, 
lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) #optimizer = Adam(model.parameters(), lr=args.learning_rate) # train timestamp = time.strftime('%b-%d-%Y_%H%M', time.localtime()) output_log_file = os.path.join(args.output_dir, "log_{}.txt".format(timestamp)) print("output_log_file=", output_log_file) with open(output_log_file, "w") as writer: if args.eval_test: writer.write( "epoch\tglobal_step\ttrain_loss\ttrain_accuracy\ttest_loss\ttest_accuracy\n" ) else: writer.write("epoch\tglobal_step\ttrain_loss\ttrain_accuracy\n") global_step = 0 for epoch_num in range(int(args.num_train_epochs)): model.train() train_loss, train_accuracy, nb_train_steps, nb_train_examples = 0, 0, 0, 0 for step, batch_data in enumerate( tqdm(train_dataloader, desc="Iteration")): #token_ids, masks, labels = tuple(t.to(device) for t in batch_data) batch = tuple(t.to(device) for t in batch_data) input_ids, input_mask, label_ids = batch outputs = model(input_ids, input_mask, label_ids) #logits is the proba loss, logits = outputs[:2] logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() outputs = np.argmax(logits, axis=1) train_accuracy += np.sum(outputs == label_ids) train_loss += loss.item() loss.backward() nb_train_examples += input_ids.size(0) nb_train_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients model.zero_grad() global_step += 1 if (step + 1) % 50 == 0: logger.info( "Epoch = %d, Batch = %d, Batch loss = %f, Avg loss (per batch) = %f", epoch_num + 1, (step + 1), loss.item(), train_loss / (step + 1)) if global_step % 200 == 0: logger.info("Creating a checkpoint.") model.eval().cpu() ckpt_model_filename = "ckpt_epoch_" + str( epoch_num + 1) + "_examples_" + str(global_step) + ".pth" ckpt_model_path = os.path.join(args.output_dir, ckpt_model_filename) torch.save(model.state_dict(), ckpt_model_path) model.to(device) train_loss /= nb_train_steps train_accuracy /= nb_train_examples logger.info( "After %f epoch, Training loss = %f, Training accuracy = %f", epoch_num + 1, train_loss, train_accuracy) # eval_test if args.eval_test: model.eval() test_loss, test_accuracy = 0, 0 nb_test_steps, nb_test_examples = 0, 0 with open( os.path.join(args.output_dir, "test_ep_" + str(epoch_num + 1) + ".txt"), "w") as f_test: for input_ids, input_mask, label_ids in test_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) label_ids = label_ids.to(device) with torch.no_grad(): outputs = model(input_ids, input_mask, label_ids) tmp_test_loss = outputs[0] logits = outputs[1] logits = logits.detach().cpu().numpy() logits = softmax(logits, axis=1) label_ids = label_ids.to('cpu').numpy() outputs = np.argmax(logits, axis=1) for output_i in range(len(outputs)): f_test.write(str(outputs[output_i])) for ou in logits[output_i]: f_test.write(" " + str(ou)) f_test.write("\n") tmp_test_accuracy = np.sum(outputs == label_ids) test_loss += tmp_test_loss.mean().item() test_accuracy += tmp_test_accuracy nb_test_examples += input_ids.size(0) nb_test_steps += 1 test_loss = test_loss / nb_test_steps test_accuracy = test_accuracy / nb_test_examples result = collections.OrderedDict() if args.eval_test: result = { 'epoch': epoch_num + 1, 'global_step': global_step, 'train_loss': train_loss, 'train_accuracy': train_accuracy, 'test_loss': test_loss, 'test_accuracy': test_accuracy } else: result = { 'epoch': epoch_num + 1, 'global_step': global_step, 'train_loss': train_loss, 'train_accuracy': train_accuracy } 
logger.info("***** Eval results *****") with open(output_log_file, "a+") as writer: print("Final Classifier Results: ", result) for key in result.keys(): logger.info(" %s = %s\n", key, str(result[key])) writer.write("%s\t" % (str(result[key]))) writer.write("\n") model.eval().cpu() timestamp = time.strftime('%b-%d-%Y_%H%M', time.localtime()) save_model_filename = "epoch_" + str( args.num_train_epochs) + "_" + timestamp + ".model" save_model_path = os.path.join(args.output_dir, save_model_filename) torch.save(model.state_dict(), save_model_path)
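# --- Illustrative sketch (not part of the original script) ---
# The evaluation loop above accumulates the per-batch loss and the number of correct
# argmax predictions, then divides by the number of batches / examples. The same
# bookkeeping in isolation; `logits_batches`, `labels_batches`, and `batch_losses` are
# hypothetical lists of per-batch numpy arrays and floats, not variables from the script.
import numpy as np

def aggregate_eval(logits_batches, labels_batches, batch_losses):
    correct, total = 0, 0
    for logits, labels in zip(logits_batches, labels_batches):
        preds = np.argmax(logits, axis=1)
        correct += int(np.sum(preds == labels))
        total += labels.shape[0]
    return {
        "eval_loss": float(np.mean(batch_losses)),  # mean of per-batch losses
        "eval_accuracy": correct / total,           # micro-averaged accuracy
    }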
def main(): args = argsInfo() processors = { "ag": AGNewsProcessor, "ag_sep": AGNewsProcessor_sep, "ag_sep_aug": AGNewsProcessor_sep_aug, "imdb": IMDBProcessor, "imdb_sep": IMDBProcessor_sep, "imdb_sep_aug": IMDBProcessor_sep_aug, "yelp_p": Yelp_p_Processor, "yelp_f": Yelp_f_Processor, "yahoo": Yahoo_Processor, "trec": Trec_Processor, "dbpedia": Dbpedia_Processor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.accumulate_gradients < 1: raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format( args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format( args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer( vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) model = BertForSequenceClassification(bert_config, len(label_list)) if args.init_checkpoint is not None: model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ['bias', 'gamma', 'beta'] if args.discr: group1 = ['layer.0.', 'layer.1.', 'layer.2.', 'layer.3.'] group2 = ['layer.4.', 'layer.5.', 'layer.6.', 'layer.7.'] group3 = ['layer.8.', 'layer.9.', 'layer.10.', 'layer.11.'] group_all = ['layer.0.', 'layer.1.', 'layer.2.', 'layer.3.', 'layer.4.', 'layer.5.', 'layer.6.', 'layer.7.', 'layer.8.', 'layer.9.', 'layer.10.', 'layer.11.'] optimizer_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], 'weight_decay_rate': 0.01, 'lr': args.learning_rate / 2.6}, {'params': [p for n, p in model.named_parameters() if 
not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], 'weight_decay_rate': 0.01, 'lr': args.learning_rate}, {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], 'weight_decay_rate': 0.01, 'lr': args.learning_rate * 2.6}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], 'weight_decay_rate': 0.0}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], 'weight_decay_rate': 0.0, 'lr': args.learning_rate / 2.6}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], 'weight_decay_rate': 0.0, 'lr': args.learning_rate}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], 'weight_decay_rate': 0.0, 'lr': args.learning_rate * 2.6}, ] else: optimizer_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size, shuffle=False) if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) epoch = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): epoch += 1 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients # print("middle=",optimizer.get_lr()) # print("len(middle)=",len(optimizer.get_lr())) model.zero_grad() global_step += 1 model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 with open(os.path.join(args.output_dir, "results_ep_" + str(epoch) + ".txt"), "w") as f: for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() outputs = np.argmax(logits, axis=1) for output in outputs: f.write(str(output) + "\n") tmp_eval_accuracy = np.sum(outputs == label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps} output_eval_file = os.path.join(args.output_dir, "eval_results_ep_" + str(epoch) + ".txt") print("output_eval_file=", output_eval_file) with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
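# --- Illustrative sketch (not part of the original script) ---
# When --discr is set, the script above gives earlier encoder layers a smaller learning
# rate (lr/2.6, lr, lr*2.6 across the three layer bands) and keeps the usual no-decay
# treatment for bias/gamma/beta. A compact sketch of that grouping; the band prefixes and
# the 2.6 factor follow the code above, but the helper itself is hypothetical and omits the
# catch-all group for parameters outside layers 0-11 (embeddings, pooler, classifier).
def layerwise_lr_groups(named_params, base_lr, no_decay=('bias', 'gamma', 'beta')):
    # named_params: a list of (name, parameter) pairs, e.g. list(model.named_parameters())
    bands = [
        (('layer.0.', 'layer.1.', 'layer.2.', 'layer.3.'), base_lr / 2.6),
        (('layer.4.', 'layer.5.', 'layer.6.', 'layer.7.'), base_lr),
        (('layer.8.', 'layer.9.', 'layer.10.', 'layer.11.'), base_lr * 2.6),
    ]
    groups = []
    for prefixes, lr in bands:
        for wants_decay, wd in ((True, 0.01), (False, 0.0)):
            params = [p for n, p in named_params
                      if any(pref in n for pref in prefixes)
                      and (not any(nd in n for nd in no_decay)) == wants_decay]
            if params:
                groups.append({'params': params, 'lr': lr, 'weight_decay_rate': wd})
    return groups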