def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--xlnet_model", default=None, type=str, required=True, help="Either one of the two: 'xlnet-large-cased', 'xlnet-base-cased'.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after SentencePiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_steps", default=100, type=int, help= "Number of steps of linear learning rate warmup to perform.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of update steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() processor = dreamProcessor() label_list = processor.get_labels() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") os.makedirs(args.output_dir, exist_ok=True) ## only use cased model tokenizer = XLNetTokenizer.from_pretrained(args.xlnet_model, do_lower_case=False) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / n_class / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) ## prepare model model = XLNetForSequenceClassification.from_pretrained( args.xlnet_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_choices=3) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.weight'] ## note: no weight decay according to XLNet paper optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: # Adam Epsilon fixed at 1e-6 according to XLNet paper optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) warmup_steps = args.warmup_steps scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total) global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) input_ids = [] input_mask = [] segment_ids = [] label_id = [] for f in train_features: input_ids.append([]) input_mask.append([]) segment_ids.append([]) for i in range(n_class): ## put three input sequences tgt input_ids[-1].append(f[i].input_ids) input_mask[-1].append(f[i].input_mask) segment_ids[-1].append(f[i].segment_ids) label_id.append([f[0].label_id]) all_input_ids = torch.tensor(input_ids, dtype=torch.long) all_input_mask = torch.tensor(input_mask, dtype=torch.long) all_segment_ids = torch.tensor(segment_ids, dtype=torch.long) all_label_ids = torch.tensor(label_id, dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for ep in range(int(args.num_train_epochs)): max_score = 0 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _, _ = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, n_class=n_class) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enough gradients scheduler.step() model.zero_grad() global_step += 1 if step % 800 == 0: logger.info("Training loss: {}, global step: {}".format( tr_loss / nb_tr_steps, global_step)) if args.do_eval: eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running Dev Evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) input_ids = [] input_mask = [] segment_ids = [] label_id = [] for f in eval_features: input_ids.append([]) input_mask.append([]) segment_ids.append([]) for i in range(n_class): input_ids[-1].append(f[i].input_ids) input_mask[-1].append(f[i].input_mask) segment_ids[-1].append(f[i].segment_ids) label_id.append([f[0].label_id]) all_input_ids = torch.tensor(input_ids, dtype=torch.long) all_input_mask = torch.tensor(input_mask, dtype=torch.long) all_segment_ids = torch.tensor(segment_ids, dtype=torch.long) all_label_ids = torch.tensor(label_id, dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 logits_all = [] for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits, _ = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, n_class=n_class) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() for i in range(len(logits)): logits_all += [logits[i]] tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1)) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples if args.do_train: result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps } else: result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy } output_eval_file = os.path.join(args.output_dir, "eval_results_test.txt") with open(output_eval_file, "a+") as writer: logger.info(" Epoch: %d", (ep + 1)) logger.info("***** Eval results *****") writer.write(" Epoch: " + str(ep + 1)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # output_eval_file = os.path.join(args.output_dir, "logits_test.txt") # with open(output_eval_file, "w") as f: # for i in range(len(logits_all)): # for j in range(len(logits_all[i])): # f.write(str(logits_all[i][j])) # if j == len(logits_all[i])-1: # f.write("\n") # else: # f.write(" ") if eval_accuracy > max_score: max_score = eval_accuracy ## save trained model model_to_save = model.module if hasattr( model,
'module') else model # Only save the model itself output_model_file = os.path.join( args.output_dir, "pytorch_model_{}epoch.bin".format(ep + 1)) torch.save(model_to_save.state_dict(), output_model_file) else: ## save trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model itself output_model_file = os.path.join( args.output_dir, "pytorch_model_{}epoch.bin".format(ep + 1)) torch.save(model_to_save.state_dict(), output_model_file)
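# --- Usage sketch (not part of the original script) ---------------------------
# A hedged example of how main() above might be invoked from the command line;
# the script name `run_dream_xlnet.py` and the paths are placeholders, only the
# flags themselves come from the argparse definitions at the top of main():
#
#   python run_dream_xlnet.py \
#       --data_dir data/dream \
#       --xlnet_model xlnet-base-cased \
#       --output_dir out/dream_xlnet \
#       --do_train --do_eval \
#       --train_batch_size 8 --gradient_accumulation_steps 4 \
#       --learning_rate 5e-5 --num_train_epochs 3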
def train(args, train_dataset, model, tokenizer): """ Train the model """ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch") set_seed( args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'align_mask': batch[2], 'labels': batch[4] } inputs['token_type_ids'] = batch[3] outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: if args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): with open( os.path.join(args.output_dir, "{}.txt".format(key)), 'a+') as w: w.write("%d\t%f\n" % (global_step, value)) with open(os.path.join(args.output_dir, "loss.txt"), 'a+') as w: w.write( "%d\t%f\n" % (global_step, (tr_loss - logging_loss) / args.logging_steps)) logging_loss = tr_loss if args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training 
model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break return global_step, tr_loss / global_step
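# --- Usage sketch (illustration only; field values are placeholders) ----------
# train() above reads a handful of attributes from `args`; a minimal namespace
# that satisfies it, assuming the surrounding script builds something similar:
#
#   from argparse import Namespace
#   args = Namespace(
#       device="cuda", n_gpu=1, seed=42,
#       per_gpu_train_batch_size=8, gradient_accumulation_steps=1,
#       max_steps=-1, num_train_epochs=3,
#       learning_rate=5e-5, weight_decay=0.0, adam_epsilon=1e-8,
#       warmup_steps=0, max_grad_norm=1.0,
#       logging_steps=50, save_steps=500, evaluate_during_training=False,
#       output_dir="out")
#   global_step, avg_loss = train(args, train_dataset, model, tokenizer)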
def train(args, train_dataset, model, tokenizer): """ Train the model """ tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs warm_up_steps = int(args.warmup_steps * t_total) save_steps = int(args.save_steps * t_total) # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] a = [] b = [] c = [] d = [] optimizer_grouped_parameters = [] for n, p in model.named_parameters(): if 'classifier' in n or 'linear_r' in n or 'linear_g' in n: if any(nd in n for nd in no_decay): a.append(p) else: b.append(p) else: if any(nd in n for nd in no_decay): c.append(p) else: d.append(p) optimizer_grouped_parameters.append({ "params": a, "weight_decay": 0, "lr": 2e-3 }) optimizer_grouped_parameters.append({ "params": b, "weight_decay": args.weight_decay, "lr": 2e-3 }) optimizer_grouped_parameters.append({"params": c, "weight_decay": 0}) optimizer_grouped_parameters.append({ "params": d, "weight_decay": args.weight_decay }) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warm_up_steps, t_total=t_total) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch") set_seed( args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'align_mask': batch[2], 'labels': batch[4] } inputs['token_type_ids'] = None outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics tb_writer.add_scalar('lr_n', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('lr_o', scheduler.get_lr()[2], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if save_steps > 0 and global_step % save_steps == 0: # Save model checkpoint if args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) output_dir = os.path.join( args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break tb_writer.close() return global_step, tr_loss / global_step
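# --- Note on the four optimizer parameter groups above (illustrative) ---------
# Groups `a`/`b` collect the task head ('classifier', 'linear_r', 'linear_g')
# and get a fixed 2e-3 learning rate, while groups `c`/`d` keep the pretrained
# encoder at args.learning_rate; `a`/`c` additionally drop weight decay for bias
# and LayerNorm weights. A quick sanity check of the split after AdamW is built
# (a sketch, assuming the same `optimizer` object as above):
#
#   for i, group in enumerate(optimizer.param_groups):
#       n_params = sum(p.numel() for p in group["params"])
#       print(i, group["lr"], group["weight_decay"], n_params)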
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--output_dir", default='output', type=str, help= "The output directory where the model checkpoints and predictions will be written." ) parser.add_argument("--checkpoint", default='pretrain_ckpt/bert_small_ckpt.bin', type=str, help="checkpoint") parser.add_argument("--model_config", default='data/bert_small.json', type=str) # Other parameters parser.add_argument("--train_file", default='data/KorQuAD_v1.0_train.json', type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=96, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=8.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp16_opt_level', type=str, default='O2', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument( '--null_score_diff_threshold', type=float, default=0.0, help= "If null_score - best_non_null is greater than the threshold predict null." 
) args = parser.parse_args() device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {} n_gpu: {}, 16-bits training: {}".format( device, n_gpu, args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer('data/ko_vocab_32k.txt', max_len=args.max_seq_length, do_basic_tokenize=True) # Prepare model config = Config.from_json_file(args.model_config) model = QuestionAnswering(config) model.bert.load_state_dict(torch.load(args.checkpoint)) num_params = count_parameters(model) logger.info("Total Parameter: %d" % num_params) model.to(device) model = torch.nn.DataParallel(model) cached_train_features_file = args.train_file + '_{0}_{1}_{2}'.format( str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_examples = read_squad_examples(input_file=args.train_file, is_training=True, version_2_with_negative=False) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) num_train_optimization_steps = int( len(train_features) / args.train_batch_size) * args.num_train_epochs # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule( optimizer, warmup_steps=num_train_optimization_steps * 0.1, t_total=num_train_optimization_steps) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) num_train_step = num_train_optimization_steps input_ids = np.load('input_ids2.npy') input_mask = np.load('input_mask.npy') input_segments = np.load('input_segments.npy') start_prob = np.load('start_prob.npy') end_prob = np.load('end_prob.npy') start_label = np.load('input_start.npy') stop_label = np.load('input_stop.npy') """ for i in range(1000): print(input_ids[i]) print(max(start_prob[i])) print(sum(start_prob[i])) input() """ paragraph = torch.tensor(input_ids.astype( np.int64)).type(dtype=torch.long).cuda() paragraph_mask = torch.tensor(input_mask.astype( np.int64)).type(dtype=torch.long).cuda() paragraph_segments = torch.tensor(input_segments.astype( np.int64)).type(dtype=torch.long).cuda() start_prob = torch.tensor(start_prob.astype( np.float32)).type(dtype=torch.float32).cuda() end_prob = torch.tensor(end_prob.astype( np.float32)).type(dtype=torch.float32).cuda() start_label = torch.tensor(start_label.astype( np.int64)).type(dtype=torch.long).cuda() stop_label = torch.tensor(stop_label.astype( np.int64)).type(dtype=torch.long).cuda() train_data = TensorDataset(paragraph, paragraph_mask, paragraph_segments, start_label, stop_label, start_prob, end_prob) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() global_step = 0 epoch = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): iter_bar = tqdm( train_dataloader, desc="Train(XX Epoch) Step(XX/XX) (Mean loss=X.X) (loss=X.X)") tr_step, total_loss, mean_loss = 0, 0., 0. for step, batch in enumerate(iter_bar): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering itself input_ids, input_mask, segment_ids, start_positions, end_positions, start_probs, end_probs = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions, start_probs, end_probs) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 tr_step += 1 total_loss += loss.item() mean_loss = total_loss / tr_step iter_bar.set_description( "Train Step(%d / %d) (Mean loss=%5.5f) (loss=%5.5f)" % (global_step, num_train_step, mean_loss, loss.item())) logger.info("** ** * Saving file * ** **") model_checkpoint = "korquad_%d.bin" % (epoch) logger.info(model_checkpoint) output_model_file = os.path.join(args.output_dir, model_checkpoint) if n_gpu > 1: torch.save(model.module.state_dict(), output_model_file) else: torch.save(model.state_dict(), output_model_file) epoch += 1
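# --- Input sanity-check sketch (assumptions; shapes are not stated above) -----
# main() above expects pre-computed arrays on disk (teacher start/end
# distributions plus the usual BERT inputs). The shapes below are an assumption
# inferred from how the tensors are consumed, offered only as a quick check:
#
#   import numpy as np
#   ids = np.load('input_ids2.npy')   # assumed (N, max_seq_length), int token ids
#   sp = np.load('start_prob.npy')    # assumed (N, max_seq_length), float teacher dist.
#   st = np.load('input_start.npy')   # assumed (N,), int start positions
#   assert ids.shape[0] == sp.shape[0] == st.shape[0], "example counts must match"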
def train(self): from datetime import datetime current_time = datetime.now().strftime('%b%d_%H-%M-%S') task = self.args.task tb_writer = SummaryWriter(log_dir='./runs/' + task + "/" + current_time + self.args.prefix, comment=self.args.prefix) vocabs, lexical_mapping = self._build_model() train_data = DataLoader(self.args, vocabs, lexical_mapping, self.args.train_data, self.args.batch_size, for_train=True) dev_data = DataLoader(self.args, vocabs, lexical_mapping, self.args.dev_data, self.args.batch_size, for_train=False) test_data = DataLoader(self.args, vocabs, lexical_mapping, self.args.test_data, self.args.batch_size, for_train='Eval') train_data.set_unk_rate(self.args.unk_rate) # WRITE PARAMETERS with open('./' + 'param' + '.txt', 'w') as f: for name, param in self.model.named_parameters(): f.writelines('name:' + name + "\n") f.writelines(str(param)) f.writelines('size:' + str(param.size()) + '\n') no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0. }, { 'params': [ p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] gradient_accumulation_steps = 1 t_total = len( train_data) // gradient_accumulation_steps * self.args.epochs optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.lr, eps=self.args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.args.warmup_steps, t_total=t_total) self.model.zero_grad() set_seed(42, self.args.gpus) batches_acm, loss_acm = 0, 0 # Train! logger.info("***** Running training *****") logger.info(" Task: %s", self.args.task) logger.info(" Num examples = %d", len(train_data)) logger.info(" Num Epochs = %d", self.args.epochs) logger.info(" Total optimization steps = %d", t_total) logger.info(" Running Language Model = %s", self.args.lm_model) logger.info(" Running Model = %s", self.args.encoder_type) best_acc = 0 best_model_wts = copy.deepcopy(self.model.state_dict()) total_steps = 0 train_iterator = trange(int(self.args.epochs), desc="Epoch") # initialize the early_stopping object early_stopping = EarlyStopping(patience=self.args.patience, verbose=True) for _ in train_iterator: epoch_iterator = tqdm(train_data, desc="Iteration") running_loss = 0.0 running_corrects = 0 batch_count = self.args.batch_multiplier # Turn on the train mode for step, batch in enumerate(epoch_iterator): self.model.train() batch = move_to_cuda(batch, self.device) logits, labels, ans_ids = self.model(batch, train=True) logits_for_pred = logits.clone().detach() loss = self.criterion(logits, labels) loss_value = loss.item() pred_values, pred_indices = torch.max(logits_for_pred, 1) labels = labels.tolist() pred = pred_indices.tolist() corrects = [i for i, j in zip(labels, pred) if i == j] # Statistics running_loss += loss.item() running_corrects += len(corrects) if batch_count == 0: torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0) optimizer.step() scheduler.step() total_steps += 1 optimizer.zero_grad() self.model.zero_grad() batch_count = self.args.batch_multiplier loss_acm += loss_value loss.backward() batch_count -= 1 if (batches_acm % (self.args.batch_multiplier * self.args.batch_size) == 0) & (batches_acm != 0) & (step != 0): logger.info( 'Train Epoch %d, Batch %d, loss %.3f, Accuracy %.3f', _, batches_acm, loss_acm / batches_acm, running_corrects / (self.args.batch_size * step)) tb_writer.add_scalar('Training_loss', loss_acm / batches_acm, batches_acm) 
tb_writer.add_scalar( 'Training_Accuracy', running_corrects / (self.args.batch_size * step)) torch.cuda.empty_cache() batches_acm += 1 epoch_loss = running_loss / batches_acm epoch_acc = running_corrects / len(train_data) print('{} Loss: {:.4f} Acc: {:.4f}'.format(_, epoch_loss, epoch_acc)) tb_writer.add_scalar('Training_Epoch_loss', epoch_loss, _) tb_writer.add_scalar('Training_Epoch_Accuracy', epoch_acc, _) # Evaluate on Development Set eval_epoch_acc, eval_epoch_loss = self._run_evaluate( dev_data, _, write_answer=False) print('Overall_Dev Acc: {:.4f}'.format(eval_epoch_acc)) tb_writer.add_scalar('Dev_Epoch_Accuracy', eval_epoch_acc, _) ################################## # Evaluate on Test Set test_epoch_acc, test_epoch_loss = self._run_evaluate( test_data, _, write_answer=True) print('Overall_Test Acc: {:.4f}'.format(test_epoch_acc)) tb_writer.add_scalar('Test_Epoch_Accuracy', test_epoch_acc, _) # Save only best accuracy model on dev set if eval_epoch_acc > best_acc: best_acc = eval_epoch_acc best_model_wts = copy.deepcopy(self.model.state_dict()) # early_stopping needs the validation loss to check if it has decreased, # and if it has, it will make a checkpoint of the current model early_stopping(epoch_acc, self.model) if early_stopping.early_stop: print("Early stopping") break self.model.train() logger.info('Best val Acc: {:.4f}'.format(best_acc)) torch.save( { 'args': self.save_args, 'model': best_model_wts }, '%s/epoch%d_batch%d_model_best_%s' % (self.args.ckpt, self.args.epochs, batches_acm, self.args.prefix))
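# --- Note on the batch_multiplier accumulation above (illustrative sketch) ----
# An optimizer/scheduler step is taken only when batch_count reaches 0, so the
# effective batch size is roughly args.batch_size * args.batch_multiplier; note
# that the accumulated loss is not divided by the multiplier, so gradients are
# summed rather than averaged. A stripped-down rendering of the same idea, with
# hypothetical names, shown only to make the control flow explicit:
#
#   accum_left = batch_multiplier
#   for batch in loader:
#       loss = criterion(model(batch), labels)
#       loss.backward()                      # gradients accumulate in .grad
#       accum_left -= 1
#       if accum_left == 0:
#           torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#           optimizer.step(); scheduler.step(); optimizer.zero_grad()
#           accum_left = batch_multiplier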
def main(): parser = argparse.ArgumentParser() # Required Parameters parser.add_argument( "--output_dir", default='output', type=str, help= "The output directory where the model checkpoints and predictions will be written." ) parser.add_argument("--checkpoint", default='pretrain_ckpt/bert_small_ckpt.bin', type=str, help="checkpoint") parser.add_argument("--resume_checkpoint", default=False, type=bool, help="resume") parser.add_argument('--log_dir', default='./runs', type=str) # Other Parameters parser.add_argument("--train_feature", default='./rsc/train_features.hdf5', type=str, help="SQuAD corpus for post-training.") parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=4.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " "of training.") parser.add_argument("--num_workers", default=8, type=int, help="Proportion of workers of DataLoader") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp16_opt_level', type=str, default='O2', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
"See details at https://nvidia.github.io/apex/amp.html") args = parser.parse_args() summary_writer = SummaryWriter(args.log_dir) device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {} n_gpu: {}, 16-bits training: {}".format( device, n_gpu, args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Prepare model model = SampleCNN() # Multi-GPU Setting # if n_gpu > 1: # model = nn.DataParallel(model) num_params = count_parameters(model) logger.info("Total Parameter: %d" % num_params) model.to(device) post_training_dataset = SpeechDataset('./rsc/train.hdf5') num_train_optimization_steps = int( len(post_training_dataset) / args.train_batch_size) * args.num_train_epochs # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule( optimizer, warmup_steps=num_train_optimization_steps * 0.1, t_total=num_train_optimization_steps) loss_fn = nn.KLDivLoss(reduction='batchmean').to(device) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(post_training_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) num_train_step = num_train_optimization_steps train_dataloader = DataLoader(post_training_dataset, batch_size=args.train_batch_size, num_workers=16, pin_memory=True) model.train() global_step = 0 epoch = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): iter_bar = tqdm( train_dataloader, desc="Train(XX Epoch) Step(XX/XX) (Mean loss=X.X) (loss=X.X)") tr_step, total_loss, mean_loss = 0, 0., 0. 
for step, batch in enumerate(iter_bar): feature = batch['feature'].float().to(device) label = batch['label'].float().to(device) output = model(feature) # loss = -F.kl_div(output, label, reduction='batchmean') loss = loss_fn(output, label) # if n_gpu > 1: # loss = loss.mean() if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 tr_step += 1 total_loss += loss mean_loss = total_loss / tr_step iter_bar.set_description( "Train Step(%d / %d) (Mean loss=%5.5f) (loss=%5.5f)" % (global_step, num_train_step, mean_loss, loss.item())) if global_step % 100 == 0: print('output ', output) summary_writer.add_scalar('Train/Total_Mean_Loss', mean_loss, global_step) summary_writer.add_scalar('Train/Total_Loss', loss.item(), global_step) logger.info("***** Saving file *****") if args.resume_checkpoint: model_checkpoint = "pt_bert_from_checkpoint_%d.bin" % (epoch) else: model_checkpoint = "pt_scnn_%d.bin" % (epoch) logger.info(model_checkpoint) output_model_file = os.path.join(args.output_dir, model_checkpoint) # if n_gpu > 1: # torch.save(model.module.state_dict(), output_model_file) # else: torch.save(model.state_dict(), output_model_file) epoch += 1
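# --- Note on the KLDivLoss above (sketch; tensors below are hypothetical) -----
# nn.KLDivLoss(reduction='batchmean') expects its *input* to be log-probabilities
# and its *target* to be probabilities, so model(feature) above is assumed to end
# in a log_softmax. Minimal illustration of the call:
#
#   import torch
#   import torch.nn.functional as F
#   student_logits = torch.randn(4, 10)                        # hypothetical batch
#   teacher_probs = torch.softmax(torch.randn(4, 10), dim=-1)  # hypothetical teacher
#   kl = torch.nn.KLDivLoss(reduction='batchmean')
#   loss = kl(F.log_softmax(student_logits, dim=-1), teacher_probs)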
def train(args, train_dataset, model, tokenizer): """ Train the model """ tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs warm_up_steps = int(args.warmup_steps * t_total) save_steps = int(args.save_steps * t_total) # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warm_up_steps, t_total=t_total) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch") set_seed( args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3] } inputs[ 'token_type_ids'] = None # XLM, DistilBERT and RoBERTa don't use segment_ids outputs = model(**inputs) loss = outputs[0] label_ids = torch.nn.functional.one_hot(batch[3]).float() tsa_start = 0.5 tsa_threshold = get_tsa_threshold("exp_schedule", global_step, t_total, tsa_start, end=1) larger_than_threshold = torch.exp(-loss) > tsa_threshold loss_mask = torch.ones_like(label_ids) * ( 1 - larger_than_threshold.float()) loss = torch.sum(loss * loss_mask) / torch.max( torch.sum(loss_mask), torch.tensor(1.0).cuda()) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / 
args.logging_steps, global_step) logging_loss = tr_loss if save_steps > 0 and global_step % save_steps == 0: # Save model checkpoint if args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) output_dir = os.path.join( args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break tb_writer.close() return global_step, tr_loss / global_step
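# --- TSA threshold sketch (assumption; the repo's get_tsa_threshold may differ) ---
# The masking above implements training signal annealing: examples the model
# already predicts with probability above a growing threshold are dropped from
# the loss, which assumes outputs[0] is a per-example (unreduced) loss. A common
# "exp_schedule" shape, shown here only as a hedged reference:
#
#   import math
#   def exp_tsa_threshold(global_step, t_total, start, end, scale=5.0):
#       progress = global_step / max(1, t_total)
#       return math.exp((progress - 1.0) * scale) * (end - start) + start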