def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--output_dir", default='output', type=str,
                        help="The output directory where the model checkpoints and predictions will be written.")
    parser.add_argument("--checkpoint", default='pretrain_ckpt/bert_small_ckpt.bin', type=str,
                        help="Path to the pre-trained BERT checkpoint.")
    parser.add_argument("--model_config", default='data/bert_small.json', type=str)

    # Other parameters
    parser.add_argument("--train_file", default='data/KorQuAD_v1.0_train.json', type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to "
                             "take between chunks.")
    parser.add_argument("--max_query_length", default=96, type=int,
                        help="The maximum number of tokens for the question. Questions longer than "
                             "this will be truncated to this length.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=8.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the "
                             "nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed "
                             "because the start and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--fp16", action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--fp16_opt_level", type=str, default='O2',
                        help="For fp16: Apex AMP optimization level selected in "
                             "['O0', 'O1', 'O2', 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--null_score_diff_threshold", type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold, predict null.")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}, 16-bits training: {}".format(device, n_gpu, args.fp16))

    # Fix all random seeds for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer('data/ko_vocab_32k.txt',
                              max_len=args.max_seq_length,
                              do_basic_tokenize=True)

    # Prepare model
    config = Config.from_json_file(args.model_config)
    model = QuestionAnswering(config)
    model.bert.load_state_dict(torch.load(args.checkpoint))
    num_params = count_parameters(model)
    logger.info("Total Parameter: %d" % num_params)
    model.to(device)
    model = torch.nn.DataParallel(model)

    # Cache the converted features so repeated runs can skip tokenization.
    cached_train_features_file = args.train_file + '_{0}_{1}_{2}'.format(
        str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
    train_examples = read_squad_examples(input_file=args.train_file,
                                         is_training=True,
                                         version_2_with_negative=False)
    try:
        with open(cached_train_features_file, "rb") as reader:
            train_features = pickle.load(reader)
    except (IOError, EOFError, pickle.UnpicklingError):
        # No (valid) cache yet: build the features and write them out.
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("  Saving train features into cached file %s", cached_train_features_file)
        with open(cached_train_features_file, "wb") as writer:
            pickle.dump(train_features, writer)

    num_train_optimization_steps = int(
        len(train_features) / args.train_batch_size) * args.num_train_epochs

    # Prepare optimizer: no weight decay on biases and LayerNorm weights.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(
        optimizer,
        warmup_steps=num_train_optimization_steps * args.warmup_proportion,
        t_total=num_train_optimization_steps)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    logger.info("***** Running training *****")
    logger.info("  Num orig examples = %d", len(train_examples))
    logger.info("  Num split examples = %d", len(train_features))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    num_train_step = num_train_optimization_steps

    # Pre-computed inputs, gold start/stop labels, and teacher start/end
    # distributions (for distillation), dumped to .npy files by a separate step.
    input_ids = np.load('input_ids2.npy')
    input_mask = np.load('input_mask.npy')
    input_segments = np.load('input_segments.npy')
    start_prob = np.load('start_prob.npy')
    end_prob = np.load('end_prob.npy')
    start_label = np.load('input_start.npy')
    stop_label = np.load('input_stop.npy')

    paragraph = torch.tensor(input_ids, dtype=torch.long).cuda()
    paragraph_mask = torch.tensor(input_mask, dtype=torch.long).cuda()
    paragraph_segments = torch.tensor(input_segments, dtype=torch.long).cuda()
    start_prob = torch.tensor(start_prob, dtype=torch.float32).cuda()
    end_prob = torch.tensor(end_prob, dtype=torch.float32).cuda()
    start_label = torch.tensor(start_label, dtype=torch.long).cuda()
    stop_label = torch.tensor(stop_label, dtype=torch.long).cuda()

    train_data = TensorDataset(paragraph, paragraph_mask, paragraph_segments,
                               start_label, stop_label, start_prob, end_prob)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    model.train()
    global_step = 0
    epoch = 0
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        iter_bar = tqdm(train_dataloader,
                        desc="Train(XX Epoch) Step(XX/XX) (Mean loss=X.X) (loss=X.X)")
        tr_step, total_loss, mean_loss = 0, 0., 0.
        for step, batch in enumerate(iter_bar):
            if n_gpu == 1:
                batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering itself
            input_ids, input_mask, segment_ids, start_positions, end_positions, \
                start_probs, end_probs = batch
            loss = model(input_ids, segment_ids, input_mask,
                         start_positions, end_positions, start_probs, end_probs)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            optimizer.step()
            scheduler.step()  # update the learning rate schedule after the optimizer step
            optimizer.zero_grad()
            global_step += 1

            tr_step += 1
            total_loss += loss.item()  # accumulate as a float so the graph is freed
            mean_loss = total_loss / tr_step
            iter_bar.set_description("Train Step(%d / %d) (Mean loss=%5.5f) (loss=%5.5f)"
                                     % (global_step, num_train_step, mean_loss, loss.item()))

        logger.info("** ** * Saving file * ** **")
        model_checkpoint = "korquad_%d.bin" % epoch
        logger.info(model_checkpoint)
        output_model_file = os.path.join(args.output_dir, model_checkpoint)
        # The model is wrapped in DataParallel above, so unwrap it before saving.
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(), output_model_file)
        epoch += 1
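
# NOTE (illustration): the loop above passes both hard labels
# (start_positions/end_positions) and teacher distributions
# (start_probs/end_probs) into the model, which returns a single scalar loss,
# so the distillation objective itself lives inside QuestionAnswering.forward
# and is not shown here. The sketch below is only a plausible shape for such an
# objective -- an interpolation of hard cross-entropy and soft KL terms;
# `qa_distill_loss` and `distill_alpha` are hypothetical names, not part of
# this codebase.
import torch.nn.functional as F

def qa_distill_loss(start_logits, end_logits, start_positions, end_positions,
                    start_probs, end_probs, distill_alpha=0.5):
    # Hard-label cross-entropy on the gold start/end indices.
    hard = (F.cross_entropy(start_logits, start_positions)
            + F.cross_entropy(end_logits, end_positions)) / 2.0
    # Soft-label KL divergence against the teacher's start/end distributions;
    # kl_div expects log-probabilities as input and probabilities as target.
    soft = (F.kl_div(F.log_softmax(start_logits, dim=-1), start_probs, reduction='batchmean')
            + F.kl_div(F.log_softmax(end_logits, dim=-1), end_probs, reduction='batchmean')) / 2.0
    return distill_alpha * soft + (1.0 - distill_alpha) * hard
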
def train(self):
    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    task = self.args.task
    tb_writer = SummaryWriter(log_dir='./runs/' + task + "/" + current_time + self.args.prefix,
                              comment=self.args.prefix)

    vocabs, lexical_mapping = self._build_model()
    train_data = DataLoader(self.args, vocabs, lexical_mapping, self.args.train_data,
                            self.args.batch_size, for_train=True)
    dev_data = DataLoader(self.args, vocabs, lexical_mapping, self.args.dev_data,
                          self.args.batch_size, for_train=False)
    test_data = DataLoader(self.args, vocabs, lexical_mapping, self.args.test_data,
                           self.args.batch_size, for_train='Eval')
    train_data.set_unk_rate(self.args.unk_rate)

    # Write parameter names, values and shapes out for inspection.
    with open('./param.txt', 'w') as f:
        for name, param in self.model.named_parameters():
            f.write('name:' + name + "\n")
            f.write(str(param))
            f.write('size:' + str(param.size()) + '\n')

    # Group parameters so biases and LayerNorm weights can be excluded from
    # weight decay (decay is currently disabled for both groups).
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.},
        {'params': [p for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    gradient_accumulation_steps = 1
    t_total = len(train_data) // gradient_accumulation_steps * self.args.epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.lr, eps=self.args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=self.args.warmup_steps,
                                     t_total=t_total)
    self.model.zero_grad()
    set_seed(42, self.args.gpus)
    batches_acm, loss_acm = 0, 0

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Task: %s", self.args.task)
    logger.info("  Num examples = %d", len(train_data))
    logger.info("  Num Epochs = %d", self.args.epochs)
    logger.info("  Total optimization steps = %d", t_total)
    logger.info("  Running Language Model = %s", self.args.lm_model)
    logger.info("  Running Model = %s", self.args.encoder_type)

    best_acc = 0
    best_model_wts = copy.deepcopy(self.model.state_dict())
    total_steps = 0
    train_iterator = trange(int(self.args.epochs), desc="Epoch")
    # Initialize the early-stopping object.
    early_stopping = EarlyStopping(patience=self.args.patience, verbose=True)

    for epoch in train_iterator:
        epoch_iterator = tqdm(train_data, desc="Iteration")
        running_loss = 0.0
        running_corrects = 0
        batch_count = self.args.batch_multiplier
        for step, batch in enumerate(epoch_iterator):
            self.model.train()  # turn on train mode
            batch = move_to_cuda(batch, self.device)
            logits, labels, ans_ids = self.model(batch, train=True)
            logits_for_pred = logits.clone().detach()
            loss = self.criterion(logits, labels)
            loss_value = loss.item()

            pred_values, pred_indices = torch.max(logits_for_pred, 1)
            labels = labels.tolist()
            pred = pred_indices.tolist()
            corrects = [i for i, j in zip(labels, pred) if i == j]

            # Statistics
            running_loss += loss_value
            running_corrects += len(corrects)

            # Manual gradient accumulation: take one optimizer step for every
            # `batch_multiplier` backward passes.
            if batch_count == 0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                total_steps += 1
                optimizer.zero_grad()
                self.model.zero_grad()
                batch_count = self.args.batch_multiplier

            loss_acm += loss_value
            loss.backward()
            batch_count -= 1

            if (batches_acm % (self.args.batch_multiplier * self.args.batch_size) == 0) \
                    and (batches_acm != 0) and (step != 0):
                logger.info('Train Epoch %d, Batch %d, loss %.3f, Accuracy %.3f',
                            epoch, batches_acm, loss_acm / batches_acm,
                            running_corrects / (self.args.batch_size * step))
                tb_writer.add_scalar('Training_loss', loss_acm / batches_acm, batches_acm)
                tb_writer.add_scalar('Training_Accuracy',
                                     running_corrects / (self.args.batch_size * step),
                                     batches_acm)
                torch.cuda.empty_cache()
            batches_acm += 1

        epoch_loss = running_loss / (step + 1)  # mean loss over this epoch's batches
        epoch_acc = running_corrects / len(train_data)
        print('{} Loss: {:.4f} Acc: {:.4f}'.format(epoch, epoch_loss, epoch_acc))
        tb_writer.add_scalar('Training_Epoch_loss', epoch_loss, epoch)
        tb_writer.add_scalar('Training_Epoch_Accuracy', epoch_acc, epoch)

        # Evaluate on the development set.
        eval_epoch_acc, eval_epoch_loss = self._run_evaluate(dev_data, epoch, write_answer=False)
        print('Overall_Dev Acc: {:.4f}'.format(eval_epoch_acc))
        tb_writer.add_scalar('Dev_Epoch_Accuracy', eval_epoch_acc, epoch)

        # Evaluate on the test set.
        test_epoch_acc, test_epoch_loss = self._run_evaluate(test_data, epoch, write_answer=True)
        print('Overall_Test Acc: {:.4f}'.format(test_epoch_acc))
        tb_writer.add_scalar('Test_Epoch_Accuracy', test_epoch_acc, epoch)

        # Keep only the weights with the best accuracy on the dev set.
        if eval_epoch_acc > best_acc:
            best_acc = eval_epoch_acc
            best_model_wts = copy.deepcopy(self.model.state_dict())

        # early_stopping needs the validation loss to check whether it has
        # decreased; if it has, it makes a checkpoint of the current model.
        early_stopping(eval_epoch_loss, self.model)
        if early_stopping.early_stop:
            print("Early stopping")
            break
        self.model.train()

    logger.info('Best val Acc: {:.4f}'.format(best_acc))
    torch.save({'args': self.save_args, 'model': best_model_wts},
               '%s/epoch%d_batch%d_model_best_%s' % (self.args.ckpt, self.args.epochs,
                                                     batches_acm, self.args.prefix))
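
# NOTE (illustration): the batch_count / batch_multiplier bookkeeping above is a
# hand-rolled form of gradient accumulation -- gradients from `batch_multiplier`
# mini-batches are summed before a single optimizer step, emulating a larger
# batch. Below is the same idea in its more common "(step + 1) % N" form; model,
# criterion, loader, optimizer and scheduler are assumed to exist, and
# `accum_steps` is a hypothetical stand-in for args.batch_multiplier. Unlike the
# loop above, this variant scales each loss so the accumulated gradient matches
# the mean over the effective large batch.
accum_steps = 4
optimizer.zero_grad()
for step, (inputs, labels) in enumerate(loader):
    loss = criterion(model(inputs), labels) / accum_steps
    loss.backward()  # gradients accumulate in param.grad across iterations
    if (step + 1) % accum_steps == 0:  # one parameter update per accum_steps batches
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
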
def main(args):
    if args.server_ip and args.server_port:
        # Distant debugging - see
        # https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        logger.info("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    args.data_dir = os.path.join(args.data_dir, args.task_name)
    args.output_dir = os.path.join(args.output_dir, args.task_name)
    logger.info("args = %s", args)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
    }
    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which takes care of synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
                         .format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        logger.info("Output directory already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        try:
            os.makedirs(args.output_dir)
        except OSError:
            logger.info("Could not create output directory %s", args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)
    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Use BERT to augment the training examples.
    ori_train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_dev_examples(args.data_dir)
    test_examples = processor.get_test_examples(args.data_dir)
    if args.double_ori == 0:
        num_train_optimization_steps = int(
            len(ori_train_examples) / args.train_batch_size
            / args.gradient_accumulation_steps) * args.num_train_epochs
    else:
        num_train_optimization_steps = int(
            len(ori_train_examples) * 2 / args.train_batch_size
            / args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = (num_train_optimization_steps
                                        // torch.distributed.get_world_size())

    config_class, tokenizer_class = (RobertaConfig, RobertaTokenizer)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.use_saved == 1:
        bert_saved_dir = args.ckpt
        if args.co_training:
            model_class = RobertaForNSP_co
            model = model_class.from_pretrained(bert_saved_dir, args=args)
        elif args.only_bert:
            model_class = RobertaForSequenceClassification
            model = model_class.from_pretrained(bert_saved_dir)
            tokenizer = tokenizer_class.from_pretrained(bert_saved_dir)
        else:
            model_class = RobertaForNSPAug
            model = model_class.from_pretrained(bert_saved_dir, args=args)
    else:
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=task_name,
            cache_dir=args.cache_dir if args.cache_dir else None)
        if args.only_bert:
            model_class = RobertaForSequenceClassification
            model = model_class.from_pretrained(
                args.model_name_or_path,
                from_tf=bool('.ckpt' in args.model_name_or_path),
                config=config,
                cache_dir=args.cache_dir if args.cache_dir else None)
        else:
            model_class = RobertaForNSPAug
            model = model_class.from_pretrained(
                args.model_name_or_path,
                from_tf=bool('.ckpt' in args.model_name_or_path),
                config=config,
                cache_dir=args.cache_dir if args.cache_dir else None,
                args=args)

    model.cuda()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Parameter count in millions.
    cnt = sum(np.prod(v.size()) for name, v in model.named_parameters()) / 1e6
    logger.info("Number of parameters (M): %s", str(cnt))

    if args.do_first_eval:
        args.do_train = False
        res_file = os.path.join(args.output_dir, "first_test.tsv")
        eval_loss, eval_seq_loss, eval_aug_loss, eval_res, eval_aug_accuracy, res_parts = \
            do_evaluate(args, processor, label_list, tokenizer, model, 0, output_mode,
                        num_labels, task_name, eval_examples, type="dev")
        eval_res.update(res_parts)
        for key in sorted(eval_res.keys()):
            logger.info("first evaluation: %s = %s", key, str(eval_res[key]))

        idx, preds = do_test(args, label_list, task_name, processor, tokenizer, output_mode, model)
        dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
        dataframe.to_csv(res_file, index=False, sep='\t')
        logger.info("  Num test length = %d", idx)
        logger.info("  Done ")

        # Write MNLI-mismatched test results.
        if task_name == "mnli":
            res_file = os.path.join(args.output_dir, "first_test_mm.tsv")
            idx, preds = do_test(args, label_list, task_name, processor, tokenizer,
                                 output_mode, model, do_mm=True)
            dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
            dataframe.to_csv(res_file, index=False, sep='\t')
            logger.info("  Num test length = %d", idx)
            logger.info("  Done write mm")

    if args.do_train:
        # Prepare optimizer: no weight decay on biases and LayerNorm weights.
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(args.warmup_rate * num_train_optimization_steps),
            num_training_steps=num_train_optimization_steps)

        global_step = 0
        best_val_acc = 0.0
        first_time = time.time()

        logger.info("***** Running training *****")
        logger.info("  Num original examples = %d", len(ori_train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        aug_ratio = 0.0
        aug_seed = np.random.randint(0, 1000)
        for epoch in range(int(args.num_train_epochs)):
            if args.only_bert:
                train_features = convert_examples_to_features(
                    ori_train_examples, label_list, args.max_seq_length, tokenizer,
                    num_show=args.num_show, output_mode=output_mode, args=args,
                    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                    do_roberta=1)
            else:
                # Re-augment the training set from the current model checkpoint,
                # growing the augmentation ratio a little each epoch.
                logger.info("epoch=%d, aug_ratio = %f, aug_seed=%d", epoch, aug_ratio, aug_seed)
                train_examples = Aug_each_ckpt(
                    ori_train_examples, label_list, model, tokenizer, args=args,
                    num_show=args.num_show, output_mode=output_mode, seed=aug_seed,
                    aug_ratio=aug_ratio, use_bert=False, do_roberta=1, ssa_roberta=1,
                    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0])
                if aug_ratio + args.aug_ratio_each < 1.0:
                    aug_ratio += args.aug_ratio_each
                    aug_seed += 1
                train_features = convert_examples_to_features(
                    train_examples, label_list, args.max_seq_length, tokenizer,
                    num_show=args.num_show, output_mode=output_mode, args=args,
                    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                    do_roberta=1)
                logger.info("Done convert features")

            all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
            if output_mode == "classification":
                all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
            elif output_mode == "regression":
                all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
            token_real_label = torch.tensor([f.token_real_label for f in train_features],
                                            dtype=torch.long)
            train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                       all_label_ids, token_real_label)
            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
            else:
                train_sampler = DistributedSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                          batch_size=args.train_batch_size)

            logger.info("begin training")
            tr_loss, tr_seq_loss, tr_aug_loss, train_seq_accuracy, train_aug_accuracy = 0, 0, 0, 0, 0
            nb_tr_examples, nb_tr_steps, nb_tr_tokens = 0, 0, 0
            preds = []
            all_labels = []
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.cuda() for t in batch)
                input_ids, input_mask, segment_ids, label_ids, token_real_label = batch
                if args.only_bert:
                    outputs = model(input_ids, input_mask)
                    seq_logits = outputs[0]
                else:
                    seq_logits, aug_logits, aug_loss = model(input_ids, input_mask, labels=None,
                                                             token_real_label=token_real_label)
                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    seq_loss = loss_fct(seq_logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    seq_loss = loss_fct(seq_logits.view(-1), label_ids.view(-1))

                token_real_label = token_real_label.detach().cpu().numpy()

                # Weighted sum of the sequence loss and the augmentation loss.
                w = args.aug_loss_weight
                if args.only_bert:
                    loss = seq_loss
                else:
                    loss = (1 - w) * seq_loss + w * aug_loss

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                # Very large max_norm: this mostly just measures the gradient norm.
                total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 10000.0)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                tr_seq_loss += seq_loss.mean().item()
                seq_logits = seq_logits.detach().cpu().numpy()
                label_ids = label_ids.detach().cpu().numpy()
                if len(preds) == 0:
                    preds.append(seq_logits)
                    all_labels.append(label_ids)
                else:
                    preds[0] = np.append(preds[0], seq_logits, axis=0)
                    all_labels[0] = np.append(all_labels[0], label_ids, axis=0)

                if args.only_bert == 0:
                    aug_logits = aug_logits.detach().cpu().numpy()
                    tmp_train_aug_accuracy, tmp_tokens = accuracy(aug_logits, token_real_label,
                                                                  type="aug")
                    train_aug_accuracy += tmp_train_aug_accuracy
                    nb_tr_tokens += tmp_tokens
                    tr_aug_loss += aug_loss.mean().item()

                if global_step % 20 == 0:
                    loss = tr_loss / nb_tr_steps
                    seq_loss = tr_seq_loss / nb_tr_steps
                    aug_loss = tr_aug_loss / nb_tr_steps
                    tmp_pred = preds[0]
                    tmp_labels = all_labels[0]
                    if output_mode == "classification":
                        tmp_pred = np.argmax(tmp_pred, axis=1)
                    elif output_mode == "regression":
                        tmp_pred = np.squeeze(tmp_pred)
                    res = accuracy(tmp_pred, tmp_labels, task_name=task_name)
                    if nb_tr_tokens != 0:
                        aug_avg = train_aug_accuracy / nb_tr_tokens
                    else:
                        aug_avg = 0.0
                    log_string = ""
                    log_string += "epoch={:<5d}".format(epoch)
                    log_string += " step={:<9d}".format(global_step)
                    log_string += " total_loss={:<9.7f}".format(loss)
                    log_string += " seq_loss={:<9.7f}".format(seq_loss)
                    log_string += " aug_loss={:<9.7f}".format(aug_loss)
                    log_string += " lr={:<9.7f}".format(scheduler.get_lr()[0])
                    log_string += " |g|={:<9.7f}".format(total_norm)
                    # log_string += " tr_seq_acc={:<9.7f}".format(seq_avg)
                    log_string += " tr_aug_acc={:<9.7f}".format(aug_avg)
                    log_string += " mins={:<9.2f}".format(float(time.time() - first_time) / 60)
                    for key in sorted(res.keys()):
                        log_string += " " + key + "= " + str(res[key])
                    logger.info(log_string)

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1

            train_loss = tr_loss / nb_tr_steps

            if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0) \
                    and epoch % 1 == 0:
                eval_loss, eval_seq_loss, eval_aug_loss, eval_res, eval_aug_accuracy, res_parts = \
                    do_evaluate(args, processor, label_list, tokenizer, model, epoch, output_mode,
                                num_labels, task_name, eval_examples, type="dev")
                if "acc" in eval_res:
                    tmp_acc = eval_res["acc"]
                elif "mcc" in eval_res:
                    tmp_acc = eval_res["mcc"]
                else:
                    tmp_acc = eval_res["corr"]

                if tmp_acc >= best_val_acc:
                    best_val_acc = tmp_acc
                    dev_test = "dev"

                    # Only save the model itself.
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_dir = os.path.join(args.output_dir, "dev_" + str(tmp_acc))
                    if not os.path.exists(output_model_dir):
                        os.makedirs(output_model_dir)
                    model_to_save.save_pretrained(output_model_dir)
                    tokenizer.save_pretrained(output_model_dir)
                    output_model_file = os.path.join(output_model_dir, 'pytorch_model.bin')
                    torch.save(model_to_save.state_dict(), output_model_file)

                    result = {'eval_total_loss': eval_loss,
                              'eval_seq_loss': eval_seq_loss,
                              'eval_aug_loss': eval_aug_loss,
                              'eval_aug_accuracy': eval_aug_accuracy,
                              'global_step': global_step,
                              'train_loss': train_loss,
                              'best_epoch': epoch,
                              'train_batch_size': args.train_batch_size,
                              'args': args}
                    result.update(eval_res)
                    result.update(res_parts)

                    output_eval_file = os.path.join(args.output_dir,
                                                    dev_test + "_results_" + str(tmp_acc) + ".txt")
                    with open(output_eval_file, "w") as writer:
                        logger.info("***** Test results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))

                    # Write test results.
                    if args.do_test:
                        res_file = os.path.join(args.output_dir, "test_" + str(tmp_acc) + ".tsv")
                        idx, preds = do_test(args, label_list, task_name, processor, tokenizer,
                                             output_mode, model)
                        dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
                        dataframe.to_csv(res_file, index=False, sep='\t')
                        logger.info("  Num test length = %d", idx)
                        logger.info("  Done ")

                        # Write MNLI-mismatched test results.
                        if task_name == "mnli":
                            res_file = os.path.join(args.output_dir,
                                                    "mm_roberta_results_b_" + str(tmp_acc) + ".tsv")
                            idx, preds = do_test(args, label_list, task_name, processor, tokenizer,
                                                 output_mode, model, do_mm=True)
                            dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
                            dataframe.to_csv(res_file, index=False, sep='\t')
                            logger.info("  Num test length = %d", idx)
                            logger.info("  Done write mm")
                else:
                    logger.info("  tmp_val_acc = %f", tmp_acc)
def main():
    parser = argparse.ArgumentParser()

    # Required Parameters
    parser.add_argument("--output_dir", default='output', type=str,
                        help="The output directory where the model checkpoints and predictions will be written.")
    parser.add_argument("--checkpoint", default='pretrain_ckpt/bert_small_ckpt.bin', type=str,
                        help="Path to the pre-trained checkpoint.")
    parser.add_argument("--resume_checkpoint", action='store_true',
                        help="Resume training from a checkpoint.")
    parser.add_argument("--log_dir", default='./runs', type=str)

    # Other Parameters
    parser.add_argument("--train_feature", default='./rsc/train_features.hdf5', type=str,
                        help="HDF5 feature file for post-training.")
    parser.add_argument("--train_batch_size", default=16, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=4.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--num_workers", default=8, type=int,
                        help="Number of DataLoader worker processes.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--fp16", action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--fp16_opt_level", type=str, default='O2',
                        help="For fp16: Apex AMP optimization level selected in "
                             "['O0', 'O1', 'O2', 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    args = parser.parse_args()

    summary_writer = SummaryWriter(args.log_dir)
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}, 16-bits training: {}".format(device, n_gpu, args.fp16))

    # Fix all random seeds for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Prepare model
    model = SampleCNN()
    # Multi-GPU Setting
    # if n_gpu > 1:
    #     model = nn.DataParallel(model)
    num_params = count_parameters(model)
    logger.info("Total Parameter: %d" % num_params)
    model.to(device)

    post_training_dataset = SpeechDataset('./rsc/train.hdf5')
    num_train_optimization_steps = int(
        len(post_training_dataset) / args.train_batch_size) * args.num_train_epochs

    # Prepare optimizer: no weight decay on biases and LayerNorm weights.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(
        optimizer,
        warmup_steps=num_train_optimization_steps * args.warmup_proportion,
        t_total=num_train_optimization_steps)

    # KL divergence against soft labels; expects log-probabilities as input.
    loss_fn = nn.KLDivLoss(reduction='batchmean').to(device)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    logger.info("***** Running training *****")
    logger.info("  Num orig examples = %d", len(post_training_dataset))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    num_train_step = num_train_optimization_steps

    train_dataloader = DataLoader(post_training_dataset,
                                  batch_size=args.train_batch_size,
                                  num_workers=args.num_workers,
                                  pin_memory=True)

    model.train()
    global_step = 0
    epoch = 0
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        iter_bar = tqdm(train_dataloader,
                        desc="Train(XX Epoch) Step(XX/XX) (Mean loss=X.X) (loss=X.X)")
        tr_step, total_loss, mean_loss = 0, 0., 0.
        for step, batch in enumerate(iter_bar):
            feature = batch['feature'].float().to(device)
            label = batch['label'].float().to(device)
            output = model(feature)
            # loss = -F.kl_div(output, label, reduction='batchmean')
            loss = loss_fn(output, label)
            # if n_gpu > 1:
            #     loss = loss.mean()
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            optimizer.step()
            scheduler.step()  # update the learning rate schedule after the optimizer step
            optimizer.zero_grad()
            global_step += 1

            tr_step += 1
            total_loss += loss.item()  # accumulate as a float so the graph is freed
            mean_loss = total_loss / tr_step
            iter_bar.set_description("Train Step(%d / %d) (Mean loss=%5.5f) (loss=%5.5f)"
                                     % (global_step, num_train_step, mean_loss, loss.item()))

            if global_step % 100 == 0:
                print('output ', output)
                summary_writer.add_scalar('Train/Total_Mean_Loss', mean_loss, global_step)
                summary_writer.add_scalar('Train/Total_Loss', loss.item(), global_step)

        logger.info("***** Saving file *****")
        if args.resume_checkpoint:
            model_checkpoint = "pt_bert_from_checkpoint_%d.bin" % epoch
        else:
            model_checkpoint = "pt_scnn_%d.bin" % epoch
        logger.info(model_checkpoint)
        output_model_file = os.path.join(args.output_dir, model_checkpoint)
        # if n_gpu > 1:
        #     torch.save(model.module.state_dict(), output_model_file)
        # else:
        torch.save(model.state_dict(), output_model_file)
        epoch += 1
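
# NOTE (illustration): nn.KLDivLoss(reduction='batchmean') used above expects
# its *input* to be log-probabilities and its *target* to be probabilities, so
# SampleCNN's forward is assumed to end in a log_softmax. A standalone sanity
# check of that contract:
import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(4, 10)                      # raw model outputs
log_probs = F.log_softmax(logits, dim=-1)        # what KLDivLoss expects as input
target = F.softmax(torch.randn(4, 10), dim=-1)   # soft labels: a valid distribution
loss = nn.KLDivLoss(reduction='batchmean')(log_probs, target)
print(loss.item())  # non-negative scalar, averaged over the batch
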