def main(args):
    #
    # PART 1
    #
    torch.manual_seed(args.seed)

    model = build_model(args).cuda()
    one_ll = next(model.children()).weight
    optimizer = FusedAdam(model.parameters())
    ASP.init_model_for_pruning(model, args.pattern,
                               verbosity=args.verbosity,
                               whitelist=args.whitelist,
                               allow_recompute_mask=args.allow_recompute_mask)
    ASP.init_optimizer_for_pruning(optimizer)

    step = 0

    # train for a few steps with dense weights
    print("DENSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_dense_steps)

    # simulate sparsity by inserting zeros into existing dense weights
    ASP.compute_sparse_masks()

    # train for a few steps with sparse weights
    print("SPARSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps)

    torch.save({
        'step': step,
        'verbosity': args.verbosity,
        'seed2': args.seed2,
        'pattern': args.pattern,
        'whitelist': args.whitelist,
        'allow_recompute_mask': args.allow_recompute_mask,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, args.checkpoint_path)
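# A minimal sketch of what the train_loop helper used above might look like; the
# real implementation lives elsewhere in this repo. The args fields referenced here
# (batch_size, input_features, output_features) are hypothetical stand-ins for a
# real dataloader and objective.
def _train_loop_sketch(args, model, optimizer, step, num_steps):
    for _ in range(num_steps):
        optimizer.zero_grad()
        # toy regression batch; replace with real data in practice
        x = torch.randn(args.batch_size, args.input_features).cuda()
        y = torch.randn(args.batch_size, args.output_features).cuda()
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss.backward()
        optimizer.step()
        step += 1
    return step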
def main(): print("IN NEW MAIN XD\n") parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--input_dir", default=None, type=str, required=True, help="The input data dir. Should contain .hdf5 files for the task.") parser.add_argument("--config_file", default=None, type=str, required=True, help="The BERT model config") parser.add_argument( "--bert_model", default="bert-large-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--max_predictions_per_seq", default=80, type=int, help="The maximum total of masked tokens in input sequence") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--max_steps", default=1000, type=float, help="Total number of training steps to perform.") parser.add_argument( "--warmup_proportion", default=0.01, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0.0, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' ) parser.add_argument('--log_freq', type=float, default=50.0, help='frequency of logging loss.') parser.add_argument('--checkpoint_activations', default=False, action='store_true', help="Whether to use gradient checkpointing") parser.add_argument("--resume_from_checkpoint", default=False, action='store_true', help="Whether to resume training from checkpoint.") parser.add_argument('--resume_step', type=int, default=-1, help="Step to resume training from.") parser.add_argument( '--num_steps_per_checkpoint', type=int, default=2000, help="Number of update steps until a model checkpoint is saved to disk." 
) args = parser.parse_args() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) assert (torch.cuda.is_available()) if args.local_rank == -1: device = torch.device("cuda") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) if args.train_batch_size % args.gradient_accumulation_steps != 0: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible" .format(args.gradient_accumulation_steps, args.train_batch_size)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps if not args.resume_from_checkpoint and os.path.exists( args.output_dir) and (os.listdir(args.output_dir) and os.listdir( args.output_dir) != ['logfile.txt']): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not args.resume_from_checkpoint: os.makedirs(args.output_dir, exist_ok=True) # Prepare model config = BertConfig.from_json_file(args.config_file) model = BertForPreTraining(config) if not args.resume_from_checkpoint: global_step = 0 else: if args.resume_step == -1: model_names = [ f for f in os.listdir(args.output_dir) if f.endswith(".pt") ] args.resume_step = max([ int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names ]) global_step = args.resume_step checkpoint = torch.load(os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)), map_location="cpu") model.load_state_dict(checkpoint['model'], strict=False) print("resume step from ", args.resume_step) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: optimizer = FusedAdam( optimizer_grouped_parameters, lr=args.learning_rate, #warmup=args.warmup_proportion, #t_total=args.max_steps, bias_correction=False, weight_decay=0.01, max_grad_norm=1.0) if args.loss_scale == 0: # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale="dynamic") else: # optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale) scheduler = LinearWarmUpScheduler(optimizer, warmup=args.warmup_proportion, total_steps=args.max_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=args.max_steps) if args.resume_from_checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) # , strict=False) if args.local_rank != -1: model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) files = [ os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if 
os.path.isfile(os.path.join(args.input_dir, f)) ] files.sort() num_files = len(files) logger.info("***** Running training *****") # logger.info(" Num examples = %d", len(train_data)) logger.info(" Batch size = %d", args.train_batch_size) print(" LR = ", args.learning_rate) model.train() print("Training. . .") most_recent_ckpts_paths = [] print("Training. . .") tr_loss = 0.0 # total added training loss average_loss = 0.0 # averaged loss every args.log_freq steps epoch = 0 training_steps = 0 while True: if not args.resume_from_checkpoint: random.shuffle(files) f_start_id = 0 else: f_start_id = checkpoint['files'][0] files = checkpoint['files'][1:] args.resume_from_checkpoint = False for f_id in range(f_start_id, len(files)): data_file = files[f_id] logger.info("file no %s file %s" % (f_id, data_file)) train_data = pretraining_dataset( input_file=data_file, max_pred_length=args.max_predictions_per_seq) if args.local_rank == -1: train_sampler = RandomSampler(train_data) train_dataloader = DataLoader( train_data, sampler=train_sampler, batch_size=args.train_batch_size * n_gpu, num_workers=4, pin_memory=True) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=4, pin_memory=True) for step, batch in enumerate( tqdm(train_dataloader, desc="File Iteration")): training_steps += 1 batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch #\ loss = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels, checkpoint_activations=args.checkpoint_activations) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: # optimizer.backward(loss) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss average_loss += loss.item() if training_steps % args.gradient_accumulation_steps == 0: if args.fp16: scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if training_steps == 1 * args.gradient_accumulation_steps: logger.info( "Step:{} Average Loss = {} Step Loss = {} LR {}". format(global_step, average_loss, loss.item(), optimizer.param_groups[0]['lr'])) if training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0: logger.info( "Step:{} Average Loss = {} Step Loss = {} LR {}". 
format(global_step, average_loss / args.log_freq, loss.item(), optimizer.param_groups[0]['lr'])) average_loss = 0 if global_step >= args.max_steps or training_steps % ( args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0: if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)): # Save a trained model logger.info( "** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_save_file = os.path.join( args.output_dir, "ckpt_{}.pt".format(global_step)) torch.save( { 'model': model_to_save.state_dict(), 'optimizer': optimizer.state_dict(), 'files': [f_id] + files }, output_save_file) most_recent_ckpts_paths.append(output_save_file) if len(most_recent_ckpts_paths) > 3: ckpt_to_be_removed = most_recent_ckpts_paths.pop(0) os.remove(ckpt_to_be_removed) if global_step >= args.max_steps: tr_loss = tr_loss * args.gradient_accumulation_steps / training_steps if (torch.distributed.is_initialized()): tr_loss /= torch.distributed.get_world_size() torch.distributed.all_reduce(tr_loss) logger.info("Total Steps:{} Final Loss = {}".format( training_steps, tr_loss.item())) return del train_dataloader del train_sampler del train_data #for obj in gc.get_objects(): # if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)): # del obj torch.cuda.empty_cache() epoch += 1
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file", default=None, type=str, required=True,
                        help="The input train corpus.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                        "Sequences longer than this will be truncated, and sequences shorter \n"
                        "than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory", action='store_true',
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--hybrid_attention", action='store_true', help="Whether to use hybrid attention")
    parser.add_argument("--continue_training", action='store_true', help="Continue training from a checkpoint")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.continue_training:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    # train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_file)
        train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length,
                                    corpus_lines=None, on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                              bias_correction=False, max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                             warmup=args.warmup_proportion, t_total=num_train_optimization_steps)

    if args.hybrid_attention:
        max_seq_length = args.max_seq_length
        attention_mask = torch.ones(12, max_seq_length, max_seq_length, dtype=torch.long)
        # left attention
        attention_mask[:2, :, :] = torch.tril(torch.ones(max_seq_length, max_seq_length, dtype=torch.long))
        # right attention
        attention_mask[2:4, :, :] = torch.triu(torch.ones(max_seq_length, max_seq_length, dtype=torch.long))
        # local attention, window size = 3
        attention_mask[4:6, :, :] = torch.triu(
            torch.tril(torch.ones(max_seq_length, max_seq_length, dtype=torch.long), 1), -1)
        attention_mask = torch.cat([attention_mask.unsqueeze(0) for _ in range(8)])
        attention_mask = attention_mask.to(device)
    else:
        attention_mask = None

    global_step = 0
    epoch_start = 0
    if args.do_train:
        if args.continue_training:
            # if a checkpoint file exists, find the last checkpoint
            if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
                all_cp = os.listdir(args.output_dir)
                steps = [int(re.search(r'_\d+', cp).group()[1:]) for cp in all_cp if re.search(r'_\d+', cp)]
                if len(steps) == 0:
                    raise ValueError("No existing checkpoint. Please do not use --continue_training.")
                max_step = max(steps)
                # load checkpoint
                checkpoint = torch.load(os.path.join(args.output_dir, 'checkpoints_' + str(max_step) + '.pt'))
                logger.info("***** Loading checkpoint *****")
                logger.info("  Num steps = %d", checkpoint['global_step'])
                logger.info("  Num epoch = %d", checkpoint['epoch'])
                logger.info("  Loss = %f, %f", checkpoint['loss'], checkpoint['loss_now'])
                model.module.load_state_dict(checkpoint['model'])
                optimizer.load_state_dict(checkpoint['optimizer'])
                global_step = checkpoint['global_step']
                epoch_start = checkpoint['epoch']
                del checkpoint
            else:
                raise ValueError("No existing checkpoint. Please do not use --continue_training.")

        writer = SummaryWriter(log_dir=os.environ['HOME'])
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            # TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        tr_loss_1000 = 0
        for ep in trange(epoch_start, int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids, hybrid_mask=attention_mask)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                tr_loss_1000 += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                    # log the training loss every 1000 steps
                    if global_step % 1000 == 999:
                        writer.add_scalar('data/loss', tr_loss_1000 / 1000, global_step)
                        logger.info("training steps: %s", global_step)
                        logger.info("training loss per 1000: %s", tr_loss_1000 / 1000)
                        tr_loss_1000 = 0

                    # save a checkpoint every 10000 steps
                    if global_step % 10000 == 0:
                        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                        output_file = os.path.join(args.output_dir, "checkpoints_" + str(global_step) + ".pt")
                        checkpoint = {
                            'model': model_to_save.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'epoch': ep,
                            'global_step': global_step,
                            'loss': tr_loss / nb_tr_steps,
                            'loss_now': tr_loss_1000,
                        }
                        if args.do_train:
                            torch.save(checkpoint, output_file)

            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
            output_model_file = os.path.join(args.output_dir, "pytorch_model.bin_" + str(ep))
            if args.do_train:
                torch.save(model_to_save.state_dict(), output_model_file)
            logger.info("training loss: %s", tr_loss / nb_tr_steps)

        # Save a trained model
        logger.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        if args.do_train:
            torch.save(model_to_save.state_dict(), output_model_file)
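# Reference sketch only: the warmup_linear schedule used by the fp16 branches above
# is imported from the optimization utilities, not defined in this file. The classic
# pytorch_pretrained_bert variant (linear warmup, then linear decay) looks roughly
# like this; the imported implementation is authoritative.
def _warmup_linear_reference(x, warmup=0.002):
    # x is training progress in [0, 1]; warmup is the warmup proportion
    if x < warmup:
        return x / warmup
    return 1.0 - x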
def run_train(args, hparams):
    if args.seed is not None:
        print("Setting numpy random seed to {}...".format(args.seed))
        np.random.seed(args.seed)

    seed_from_numpy = np.random.randint(2147483648)
    print("Manual seed for pytorch:", seed_from_numpy)
    torch.manual_seed(seed_from_numpy)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed_from_numpy)

    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
    #     raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    # os.makedirs(args.output_dir, exist_ok=True)

    print("Initializing model...")
    load_path = args.load_path
    if load_path is not None:
        print(f"Loading parameters from {load_path}")
        info = torch_load(load_path)
        model = Zmodel.Jointmodel.from_spec(info['spec'], info['state_dict'])
        hparams = model.hparams
        Ptb_dataset = PTBDataset(hparams)
        Ptb_dataset.process_PTB(args)
    else:
        hparams.set_from_args(args)
        Ptb_dataset = PTBDataset(hparams)
        Ptb_dataset.process_PTB(args)
        model = Zmodel.Jointmodel(
            Ptb_dataset.tag_vocab,
            Ptb_dataset.word_vocab,
            Ptb_dataset.label_vocab,
            Ptb_dataset.char_vocab,
            Ptb_dataset.type_vocab,
            Ptb_dataset.srl_vocab,
            hparams,
        )

    print("Hyperparameters:")
    hparams.print()

    # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    # train_examples = None
    # num_train_steps = None
    print("Loading Train Dataset", args.train_file)
    Ptb_dataset.rand_dataset()
    # print(model.tokenizer.tokenize("Federal Paper Board sells paper and wood products ."))
    # max_seq_length = model.bert_max_len
    train_dataset = BERTDataset(args.pre_wiki_line, hparams, Ptb_dataset, args.train_file, model.tokenizer,
                                seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory)

    task_list = ['dev_synconst', 'dev_srlspan', 'dev_srldep',
                 'test_synconst', 'test_srlspan', 'test_srldep',
                 'brown_srlspan', 'brown_srldep']
    evaluator = EvalManyTask(device=1, hparams=hparams, ptb_dataset=Ptb_dataset, task_list=task_list,
                             bert_tokenizer=model.tokenizer, seq_len=args.eval_seq_length,
                             eval_batch_size=args.eval_batch_size, evalb_dir=args.evalb_dir,
                             model_path_base=args.save_model_path_base,
                             log_path="{}_log".format("models_log/" + hparams.model_name))

    num_train_steps = int(len(train_dataset) / args.train_batch_size /
                          args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    # model = BertForPreTraining.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                              bias_correction=False, max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                             warmup=args.warmup_proportion, t_total=num_train_steps)

    if load_path is not None:
        optimizer.load_state_dict(info['optimizer'])

    global_step = args.pre_step
    pre_step = args.pre_step
    # wiki_line = 0
    # while train_dataset.wiki_id < wiki_line:
    #     train_dataset.file.__next__().strip()
    #     train_dataset.wiki_id += 1

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    if args.local_rank == -1:
        train_sampler = RandomSampler(train_dataset)
    else:
        # TODO: check if this works with current data generator from disk that relies on file.__next__
        # (it doesn't return item back by index)
        train_sampler = DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    hparams.model_name = args.model_name
    print("This is ", hparams.model_name)
    start_time = time.time()

    def save_args(hparams):
        arg_path = "{}_log".format("models_log/" + hparams.model_name) + '.arg.json'
        kwargs = hparams.to_dict()
        json.dump({'kwargs': kwargs}, open(arg_path, 'w'), indent=4)

    save_args(hparams)

    # test_save_path = args.save_model_path_base + "_fortest"
    # torch.save({
    #     'spec': model_to_save.spec,
    #     'state_dict': model_to_save.state_dict(),
    #     'optimizer': optimizer.state_dict(),
    # }, test_save_path + ".pt")
    # evaluator.test_model_path = test_save_path

    cur_ptb_epoch = 0
    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        # save_model_path, is_save = evaluator.eval_multitask(start_time, cur_ptb_epoch)
        epoch_start_time = time.time()
        for step, batch in enumerate(train_dataloader):
            model.train()
            input_ids, origin_ids, input_mask, word_start_mask, word_end_mask, segment_ids, perm_mask, \
                target_mapping, lm_label_ids, lm_label_mask, is_next, \
                synconst_list, syndep_head_list, syndep_type_list, srlspan_str_list, srldep_str_list, is_ptb = batch
            # synconst_list, syndep_head_list, syndep_type_list, srlspan_str_list, srldep_str_list = gold_list

            dis_idx = [i for i in range(len(input_ids))]
            dis_idx = torch.tensor(dis_idx)
            batch = dis_idx, input_ids, origin_ids, input_mask, word_start_mask, word_end_mask, segment_ids, \
                perm_mask, target_mapping, lm_label_ids, lm_label_mask, is_next
            bert_data = tuple(t.to(device) for t in batch)

            sentences = []
            gold_syntree = []
            gold_srlspans = []
            gold_srldeps = []
            # for data_dict1 in dict1:
            for synconst, syndep_head_str, syndep_type_str, srlspan_str, srldep_str in zip(
                    synconst_list, syndep_head_list, syndep_type_list, srlspan_str_list, srldep_str_list):
                syndep_head = json.loads(syndep_head_str)
                syndep_type = json.loads(syndep_type_str)
                syntree = trees.load_trees(synconst, [[int(head) for head in syndep_head]], [syndep_type],
                                           strip_top=False)[0]
                sentences.append([(leaf.tag, leaf.word) for leaf in syntree.leaves()])
                gold_syntree.append(syntree.convert())

                srlspan = {}
                srlspan_dict = json.loads(srlspan_str)
                for pred_id, argus in srlspan_dict.items():
                    srlspan[int(pred_id)] = [(int(a[0]), int(a[1]), a[2]) for a in argus]

                srldep_dict = json.loads(srldep_str)
                srldep = {}
                if str(-1) in srldep_dict:
                    srldep = None
                else:
                    for pred_id, argus in srldep_dict.items():
                        srldep[int(pred_id)] = [(int(a[0]), a[1]) for a in argus]

                gold_srlspans.append(srlspan)
                gold_srldeps.append(srldep)

            if global_step < pre_step:
                if global_step % 1000 == 0:
                    print("global_step:", global_step)
                    print("pre_step:", pre_step)
                    print("Wiki line:", train_dataset.wiki_line)
                    print("total-elapsed {} ".format(format_elapsed(start_time)))
                global_step += 1
                cur_ptb_epoch = train_dataset.ptb_epoch
                continue

            bert_loss, task_loss = model(sentences=sentences, gold_trees=gold_syntree,
                                         gold_srlspans=gold_srlspans, gold_srldeps=gold_srldeps,
                                         bert_data=bert_data)
            if n_gpu > 1:
                bert_loss = bert_loss.sum()
                task_loss = task_loss.sum()

            loss = bert_loss + task_loss  # * 0.1
            loss = loss / len(synconst_list)
            bert_loss = bert_loss / len(synconst_list)
            task_loss = task_loss / len(synconst_list)

            total_loss = float(loss.data.cpu().numpy())
            if bert_loss > 0:
                bert_loss = float(bert_loss.data.cpu().numpy())
            if task_loss > 0:
                task_loss = float(task_loss.data.cpu().numpy())

            # grad_norm = torch.nn.utils.clip_grad_norm_(clippable_parameters, grad_clip_threshold)

            lr_this_step = args.learning_rate * warmup_linear(global_step / num_train_steps,
                                                              args.warmup_proportion)

            print(
                "epoch {:,} "
                "ptb-epoch {:,} "
                "batch {:,}/{:,} "
                "processed {:,} "
                "PTB line {:,} "
                "Wiki line {:,} "
                "total-loss {:.4f} "
                "bert-loss {:.4f} "
                "task-loss {:.4f} "
                "lr_this_step {:.12f} "
                "epoch-elapsed {} "
                "total-elapsed {}".format(
                    epoch,
                    cur_ptb_epoch,
                    global_step,
                    int(np.ceil(len(train_dataset) / args.train_batch_size)),
                    (global_step + 1) * args.train_batch_size,
                    train_dataset.ptb_cur_line,
                    train_dataset.wiki_line,
                    total_loss,
                    bert_loss,
                    task_loss,
                    lr_this_step,
                    format_elapsed(epoch_start_time),
                    format_elapsed(start_time),
                ))

            # if n_gpu > 1:
            #     loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = args.learning_rate * warmup_linear(global_step / num_train_steps,
                                                                  args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            # if train_dataset.ptb_epoch > cur_ptb_epoch:
            if global_step % args.pre_step_tosave == 0:
                cur_ptb_epoch = train_dataset.ptb_epoch
                save_path = "{}_gstep{}_wiki{}_loss={:.4f}.pt".format(
                    args.save_model_path_base, global_step, train_dataset.wiki_line, total_loss)
                model_to_save = model.module if hasattr(model, 'module') else model
                torch.save({
                    'spec': model_to_save.spec,
                    'state_dict': model_to_save.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, save_path)
                # evaluator.test_model_path = test_save_path
                #
                # save_model_path, is_save = evaluator.eval_multitask(start_time, cur_ptb_epoch)
                # if is_save:
                #     print("Saving new best model to {}...".format(save_model_path))
                #     torch.save({
                #         'spec': model_to_save.spec,
                #         'state_dict': model_to_save.state_dict(),
                #         'optimizer': optimizer.state_dict(),
                #     }, save_model_path + ".pt")

    # Save a trained model
    logger.info("** ** * Saving fine-tuned model ** ** * ")
    torch.save({
        'spec': model_to_save.spec,
        'state_dict': model_to_save.state_dict(),
        'optimizer': optimizer.state_dict(),
    }, args.save_model_path_base + ".pt")
def run_ner_w_args(args):
    if args.server_ip and args.server_port:
        # Distant debugging - see
        # https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #     raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        # train_examples = processor.get_train_examples(args.data_dir)
        train_examples = processor.get_dev_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    model = BertForNer.from_pretrained(args.bert_model, cache_dir=cache_dir, config_dir=args.config_dir,
                                       num_labels=num_labels, config=args.config)
    model_to_save = model.module if hasattr(model, 'module') else model
    # print(model_to_save.config, cache_dir)
    # print(args.config_dir, args.config)
    # exit()
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
            optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                                  bias_correction=False, max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                                 warmup=args.warmup_proportion, t_total=num_train_optimization_steps)

        # def resolve_opt(pre_model_path, optimizer):
        #     opt_path = os.path.join(args.bert_model, "opt.pth")
        #     if os.path.exists(opt_path):
        #         optimizer.load_state_dict(torch.load(opt_path))
        #     return optimizer
        # optimizer = resolve_opt(args.bert_model, optimizer)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_label_ids, all_valid_ids, all_lmask_ids)
        # train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.eval()

        def warmup_linear(progress, warmup):
            if progress < warmup:
                return progress / warmup
            return max((progress - 1.) / (warmup - 1.), 0.)

        attention_modules = {}
        attention_stats_list = []
        attention_stats = {}

        def get_activation(name):
            def hook(model, input, output):
                attention_stat = attention_data(model, *input)
                for key in attention_stat:
                    attention_stat[key] = attention_stat[key].detach().cpu()
                # for value in attention_stat.values():
                #     value = value.detach().cpu()
                print(input[0].shape)
                if attention_stats.get(name) is None:
                    attention_stats[name] = []
                attention_stats[name].append(attention_stat)
            return hook

        def get_attention_modules(model, prefix=''):
            for name, layer in model._modules.items():
                # If the module has children, recursively add the activation hook to the
                # sub-modules of the module.
                if name == 'self':
                    yield (prefix + '.' + name, layer)
                if len(layer._modules.items()) > 0:
                    yield from get_attention_modules(layer, prefix=prefix + '.' + name)

        import copy
        attention_modules = get_attention_modules(model)
        for name, module in attention_modules:
            module.register_forward_hook(get_activation(name))
            # break

        readable_seq = []
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
            for b in input_ids:
                tokens = tokenizer.convert_ids_to_tokens(b.cpu().numpy())
                readable_seq.append(tokens)
            # define a new function to compute loss values for both output_modes
            loss = model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask)
            # activations.append(copy.deepcopy(activation))
            # attention_stats['input_batch'] = batch
            attention_stats['batch_seq'] = readable_seq
            if step == 10:
                break

        import pickle
        outfile = open(os.path.join(args.output_dir, 'attentions'), 'wb')
        pickle.dump(attention_stats, outfile)
        outfile.close()
        exit()

        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask)
                # input_ids, input_mask, segment_ids, label_ids = batch
                # loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles
                        # this automatically
                        lr_this_step = args.learning_rate * \
                            warmup_linear(global_step / num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)

        # Save optimizer
        output_optimizer_file = os.path.join(args.output_dir, "opt.pth")
        torch.save(optimizer.state_dict(), output_optimizer_file)

        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        tokenizer.save_vocabulary(args.output_dir)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map,
        }
        json.dump(model_config, open(os.path.join(args.output_dir, "model_config.json"), "w"))
        # Load a trained model and config that you have fine-tuned
    else:
        # output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        # output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        # config = BertConfig(output_config_file)
        # model = BertForTokenClassification(config, num_labels=num_labels)
        # model.load_state_dict(torch.load(output_model_file))
        model = BertForNer.from_pretrained(args.bert_model, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    model.to(device)
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--file_path", default="data/conceptual_caption/", type=str,
                        help="The input train corpus.")
    parser.add_argument("--from_pretrained", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "roberta-base, roberta-large, ")
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, roberta-base")
    parser.add_argument("--output_dir", default="save", type=str,
                        # required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--config_file", type=str, default="config/bert_base_6layer_6conect.json",
                        help="The config file which specified the model details.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=36, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                        "Sequences longer than this will be truncated, and sequences shorter \n"
                        "than this will be padded.")
    parser.add_argument("--train_batch_size", default=512, type=int, help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=1e-4, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=10.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--start_epoch", default=0, type=float,
                        help="Epoch to start (or resume) training from.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--img_weight", default=1, type=float, help="weight for image loss")
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory", action="store_true",
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--do_lower_case", type=bool, default=True,
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--loss_scale", type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--dynamic_attention", action="store_true", help="whether to use dynamic attention.")
    parser.add_argument("--num_workers", type=int, default=25, help="Number of workers in the dataloader.")
    parser.add_argument("--save_name", default="", type=str, help="save name for training.")
    parser.add_argument("--baseline", action="store_true",
                        help="Whether to use the baseline model (single bert).")
    parser.add_argument("--freeze", default=-1, type=int,
                        help="till which layer of the textual stream of vilbert needs to be fixed.")
    parser.add_argument("--distributed", action="store_true",
                        help="whether to use chunks for parallel training.")
    parser.add_argument("--without_coattention", action="store_true", help="whether to disable co-attention.")
    parser.add_argument("--visual_target", default=0, type=int,
                        help="which target to use for visual branch. "
                        "0: soft label, "
                        "1: regress the feature, "
                        "2: NCE loss.")
    parser.add_argument("--objective", default=0, type=int,
                        help="which objective to use: "
                        "0: with ICA loss, "
                        "1: with ICA loss, for the not aligned pair, no masking objective, "
                        "2: without ICA loss, do not sample negative pair.")
    parser.add_argument("--num_negative", default=255, type=int, help="num of negative to use")
    parser.add_argument("--resume_file", default="", type=str, help="Resume from checkpoint")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")

    args = parser.parse_args()

    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from vilbert.basebert import BertForMultiModalPreTraining
    else:
        from vilbert.vilbert import BertForMultiModalPreTraining, BertConfig

    if args.save_name:
        prefix = "-" + args.save_name
    else:
        prefix = ""
    timeStamp = args.config_file.split("/")[1].split(".")[0] + prefix
    savePath = os.path.join(args.output_dir, timeStamp)

    bert_weight_name = json.load(open("config/" + args.from_pretrained + "_weight_name.json", "r"))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        if not os.path.exists(savePath):
            os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)
    if default_gpu:
        # save all the hidden parameters.
        with open(os.path.join(savePath, "command.txt"), "w") as f:
            print(args, file=f)  # Python 3.x
            print("\n", file=f)
            print(config, file=f)

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    cache = 5000
    if dist.is_available() and args.local_rank != -1:
        num_replicas = dist.get_world_size()
        args.train_batch_size = args.train_batch_size // num_replicas
        args.num_workers = args.num_workers // num_replicas
        cache = cache // num_replicas

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if "roberta" in args.bert_model:
        tokenizer = RobertaTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    num_train_optimization_steps = None
    train_dataset = ConceptCapLoaderTrain(
        args.file_path,
        tokenizer,
        args.bert_model,
        seq_len=args.max_seq_length,
        batch_size=args.train_batch_size,
        visual_target=args.visual_target,
        num_workers=args.num_workers,
        local_rank=args.local_rank,
        objective=args.objective,
        cache=cache,
    )
    validation_dataset = ConceptCapLoaderVal(
        args.file_path,
        tokenizer,
        args.bert_model,
        seq_len=args.max_seq_length,
        batch_size=args.train_batch_size,
        visual_target=args.visual_target,
        num_workers=2,
        objective=args.objective,
    )

    num_train_optimization_steps = int(
        train_dataset.num_dataset / args.train_batch_size / args.gradient_accumulation_steps
    ) * (args.num_train_epochs - args.start_epoch)

    task_names = ["Conceptual_Caption"]
    task_ids = ["TASK0"]
    task_num_iters = {"TASK0": train_dataset.num_dataset / args.train_batch_size}

    logdir = os.path.join("logs", timeStamp)
    if default_gpu:
        tbLogger = utils.tbLogger(
            logdir,
            savePath,
            task_names,
            task_ids,
            task_num_iters,
            args.gradient_accumulation_steps,
        )

    if args.visual_target == 0:
        config.v_target_size = 1601
        config.visual_target = args.visual_target
    else:
        config.v_target_size = 2048
        config.visual_target = args.visual_target

    if "roberta" in args.bert_model:
        config.model = "roberta"

    if args.freeze > config.t_biattention_id[0]:
        config.fixed_t_layer = config.t_biattention_id[0]

    if args.without_coattention:
        config.with_coattention = False

    if args.dynamic_attention:
        config.dynamic_attention = True

    if args.from_pretrained:
        model = BertForMultiModalPreTraining.from_pretrained(args.from_pretrained, config=config,
                                                             default_gpu=default_gpu)
    else:
        model = BertForMultiModalPreTraining(config)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    if args.freeze != -1:
        bert_weight_name_filtered = []
        for name in bert_weight_name:
            if "embeddings" in name:
                bert_weight_name_filtered.append(name)
            elif "encoder" in name:
                layer_num = name.split(".")[2]
                if int(layer_num) <= args.freeze:
                    bert_weight_name_filtered.append(name)

        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False

        if default_gpu:
            print("filtered weight")
            print(bert_weight_name_filtered)

    if not args.from_pretrained:
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             "weight_decay": 0.01},
            {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0},
        ]
    else:
        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if value.requires_grad:
                if key[12:] in bert_weight_name:
                    lr = args.learning_rate * 0.1
                else:
                    lr = args.learning_rate
                if any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [{"params": [value], "lr": lr, "weight_decay": 0.0}]
                if not any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [{"params": [value], "lr": lr, "weight_decay": 0.01}]
        if default_gpu:
            print(len(list(model.named_parameters())), len(optimizer_grouped_parameters))

    # set different parameters for the vision branch and the language branch.
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            bias_correction=False,
            max_grad_norm=1.0,
        )
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            eps=args.adam_epsilon,
            betas=(0.9, 0.98),
        )

    scheduler = WarmupLinearSchedule(
        optimizer,
        warmup_steps=args.warmup_proportion * num_train_optimization_steps,
        t_total=num_train_optimization_steps,
    )

    startIterID = 0
    global_step = 0

    if args.resume_file != "" and os.path.exists(args.resume_file):
        checkpoint = torch.load(args.resume_file, map_location="cpu")
        new_dict = {}
        for attr in checkpoint["model_state_dict"]:
            if attr.startswith("module."):
                new_dict[attr.replace("module.", "", 1)] = checkpoint["model_state_dict"][attr]
            else:
                new_dict[attr] = checkpoint["model_state_dict"][attr]
        model.load_state_dict(new_dict)
        scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        global_step = checkpoint["global_step"]
        del checkpoint

    model.cuda()

    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()

    if args.fp16:
        model.half()

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if default_gpu:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", train_dataset.num_dataset)
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

    for epochId in range(int(args.start_epoch), int(args.num_train_epochs)):
        model.train()
        for step, batch in enumerate(train_dataset):
            iterId = startIterID + step + (epochId * len(train_dataset))
            image_ids = batch[-1]
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1])
            input_ids, input_mask, segment_ids, lm_label_ids, is_next, image_feat, \
                image_loc, image_target, image_label, image_mask = batch

            if args.objective == 1:
                image_label = image_label * (is_next == 0).long().unsqueeze(1)
                image_label[image_label == 0] = -1
                lm_label_ids = lm_label_ids * (is_next == 0).long().unsqueeze(1)
                lm_label_ids[lm_label_ids == 0] = -1

            masked_loss_t, masked_loss_v, next_sentence_loss = model(
                input_ids,
                image_feat,
                image_loc,
                segment_ids,
                input_mask,
                image_mask,
                lm_label_ids,
                image_label,
                image_target,
                is_next,
            )

            if args.objective == 2:
                next_sentence_loss = next_sentence_loss * 0

            masked_loss_v = masked_loss_v * args.img_weight
            loss = masked_loss_t + masked_loss_v + next_sentence_loss

            if n_gpu > 1:
                loss = loss.mean()
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                next_sentence_loss = next_sentence_loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group["lr"] = lr_this_step

                scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                if default_gpu:
                    tbLogger.step_train_CC(
                        epochId,
                        iterId,
                        float(masked_loss_t),
                        float(masked_loss_v),
                        float(next_sentence_loss),
                        optimizer.param_groups[0]["lr"],
                        "TASK0",
                        "train",
                    )

            if (step % (20 * args.gradient_accumulation_steps) == 0 and step != 0 and default_gpu):
                tbLogger.showLossTrainCC()

        # Do the evaluation
        torch.set_grad_enabled(False)
        numBatches = len(validation_dataset)
        model.eval()
        for step, batch in enumerate(validation_dataset):
            image_ids = batch[-1]
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1])
            input_ids, input_mask, segment_ids, lm_label_ids, is_next, image_feat, \
                image_loc, image_target, image_label, image_mask = batch

            batch_size = input_ids.size(0)
            masked_loss_t, masked_loss_v, next_sentence_loss = model(
                input_ids,
                image_feat,
                image_loc,
                segment_ids,
                input_mask,
                image_mask,
                lm_label_ids,
                image_label,
                image_target,
                is_next,
            )

            masked_loss_v = masked_loss_v * args.img_weight
            loss = masked_loss_t + masked_loss_v + next_sentence_loss

            if n_gpu > 1:
                loss = loss.mean()
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                next_sentence_loss = next_sentence_loss.mean()

            if default_gpu:
                tbLogger.step_val_CC(
                    epochId,
                    float(masked_loss_t),
                    float(masked_loss_v),
                    float(next_sentence_loss),
                    "TASK0",
                    batch_size,
                    "val",
                )
                sys.stdout.write("%d / %d \r" % (step, numBatches))
                sys.stdout.flush()

        if default_gpu:
            ave_score = tbLogger.showLossValCC()

        torch.set_grad_enabled(True)

        if default_gpu:
            # Save a trained model
            logger.info("** ** * Saving fine-tuned model ** ** * ")
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Only save the model itself
            output_model_file = os.path.join(savePath, "pytorch_model_" + str(epochId) + ".bin")
            output_checkpoint = os.path.join(savePath, "pytorch_ckpt_" + str(epochId) + ".tar")
            torch.save(model_to_save.state_dict(), output_model_file)
            torch.save(
                {
                    "model_state_dict": model_to_save.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "scheduler_state_dict": scheduler.state_dict(),
                    "global_step": global_step,
                },
                output_checkpoint,
            )

    if default_gpu:
        tbLogger.txt_close()
class Seq2SeqTrainer: """ Seq2SeqTrainer """ def __init__(self, model, criterion, opt_config, print_freq=10, save_freq=1000, grad_clip=float('inf'), batch_first=False, save_info={}, save_path='.', train_iterations=0, checkpoint_filename='checkpoint%s.pth', keep_checkpoints=5, math='fp32', loss_scaling={}, cuda=True, distributed=False, distributed_overlap_allreduce=False, distributed_overlap_num_allreduce_streams=1, distributed_overlap_allreduce_messagesize=1e7, distributed_overlap_allreduce_communicators=None, intra_epoch_eval=0, prealloc_mode='always', iter_size=1, verbose=False): """ Constructor for the Seq2SeqTrainer. :param model: model to train :param criterion: criterion (loss function) :param opt_config: dictionary with options for the optimizer :param print_freq: prints short summary every 'print_freq' iterations :param save_freq: saves checkpoint every 'save_freq' iterations :param grad_clip: coefficient for gradient clipping :param batch_first: if True the model uses (batch,seq,feature) tensors, if false the model uses (seq, batch, feature) :param save_info: dict with additional state stored in each checkpoint :param save_path: path to the directiory for checkpoints :param train_iterations: total number of training iterations to execute :param checkpoint_filename: name of files with checkpoints :param keep_checkpoints: max number of checkpoints to keep :param math: arithmetic type :param loss_scaling: options for dynamic loss scaling :param cuda: if True use cuda, if False train on cpu :param distributed: if True run distributed training :param intra_epoch_eval: number of additional eval runs within each training epoch :param prealloc_mode: controls preallocation, choices=['off', 'once', 'always'] :param iter_size: number of iterations between weight updates :param verbose: enables verbose logging """ super(Seq2SeqTrainer, self).__init__() self.model = model self.criterion = criterion self.epoch = 0 self.save_info = save_info self.save_path = save_path self.save_freq = save_freq self.save_counter = 0 self.checkpoint_filename = checkpoint_filename self.checkpoint_counter = cycle(range(keep_checkpoints)) self.opt_config = opt_config self.cuda = cuda self.distributed = distributed self.print_freq = print_freq self.batch_first = batch_first self.verbose = verbose self.loss = None self.translator = None self.scheduler = None self.intra_epoch_eval = intra_epoch_eval self.iter_size = iter_size self.prealloc_mode = prealloc_mode self.preallocated = False self.retain_allreduce_buffers = True self.gradient_average = False if cuda: self.model = self.model.cuda() self.criterion = self.criterion.cuda() params = self.model.parameters() if math == 'fp16': self.model = self.model.half() if distributed: self.model = DDP( self.model, message_size=distributed_overlap_allreduce_messagesize, delay_allreduce=(not distributed_overlap_allreduce), retain_allreduce_buffers=self.retain_allreduce_buffers, gradient_average=self.gradient_average) self.fp_optimizer = Fp16Optimizer( self.model, grad_clip, loss_scale=loss_scaling['init_scale'], dls_upscale_interval=loss_scaling['upscale_interval']) params = [self.fp_optimizer.fp32_params] elif math == 'fp32': if distributed: self.model = DDP( self.model, message_size=distributed_overlap_allreduce_messagesize, delay_allreduce=(not distributed_overlap_allreduce)) self.fp_optimizer = Fp32Optimizer(self.model, grad_clip) # params = self.model.parameters() opt_name = opt_config.pop('optimizer') if opt_name == 'FusedAdam': if math == 'fp16' or math == 'fp32': 
self.optimizer = FusedAdam(params, **opt_config) else: self.optimizer = FusedAdam( params, use_mt=True, max_grad_norm=grad_clip, amp_scale_adjustment=get_world_size(), **opt_config) else: self.optimizer = torch.optim.__dict__[opt_name](params, **opt_config) if math == 'amp_fp16': self.model, self.optimizer = amp.initialize( self.model, self.optimizer, cast_model_outputs=torch.float16, keep_batchnorm_fp32=False, opt_level='O2') self.fp_optimizer = AMPOptimizer( self.model, grad_clip, loss_scale=loss_scaling['init_scale'], dls_upscale_interval=loss_scaling['upscale_interval']) if distributed: self.model = DDP( self.model, message_size=distributed_overlap_allreduce_messagesize, delay_allreduce=(not distributed_overlap_allreduce), num_allreduce_streams= distributed_overlap_num_allreduce_streams, allreduce_communicators= distributed_overlap_allreduce_communicators, retain_allreduce_buffers=self.retain_allreduce_buffers, gradient_average=self.gradient_average) logging.info(f'Using optimizer: {self.optimizer}') mlperf_print(key=mlperf_compliance.constants.OPT_BASE_LR, value=opt_config['lr']) def iterate(self, src, tgt, update=True, training=True): """ Performs one iteration of the training/validation. :param src: batch of examples from the source language :param tgt: batch of examples from the target language :param update: if True: optimizer does update of the weights :param training: if True: executes optimizer """ src, src_length = src tgt, tgt_length = tgt src_length = torch.LongTensor(src_length) tgt_length = torch.LongTensor(tgt_length) num_toks = {} num_toks['tgt'] = int(sum(tgt_length - 1)) num_toks['src'] = int(sum(src_length)) if self.cuda: src = src.cuda() src_length = src_length.cuda() tgt = tgt.cuda() if self.batch_first: output = self.model(src, src_length, tgt[:, :-1]) tgt_labels = tgt[:, 1:] T, B = output.size(1), output.size(0) else: output = self.model(src, src_length, tgt[:-1]) tgt_labels = tgt[1:] T, B = output.size(0), output.size(1) loss = self.criterion(output.view(T * B, -1), tgt_labels.contiguous().view(-1)) loss_per_batch = loss.item() loss /= (B * self.iter_size) if training: self.fp_optimizer.step(loss, self.optimizer, self.scheduler, update) loss_per_token = loss_per_batch / num_toks['tgt'] loss_per_sentence = loss_per_batch / B return loss_per_token, loss_per_sentence, num_toks def feed_data(self, data_loader, training=True): """ Runs training or validation on batches from data_loader. 
:param data_loader: data loader :param training: if True runs training else runs validation """ if training: assert self.optimizer is not None eval_fractions = np.linspace(0, 1, self.intra_epoch_eval + 2)[1:-1] iters_with_update = len(data_loader) // self.iter_size eval_iters = (eval_fractions * iters_with_update).astype(int) eval_iters = eval_iters * self.iter_size eval_iters = set(eval_iters) batch_time = AverageMeter(skip_first=False) data_time = AverageMeter(skip_first=False) losses_per_token = AverageMeter(skip_first=False) losses_per_sentence = AverageMeter(skip_first=False) tot_tok_time = AverageMeter(skip_first=False) src_tok_time = AverageMeter(skip_first=False) tgt_tok_time = AverageMeter(skip_first=False) batch_size = data_loader.batch_size end = time.time() for i, (src, tgt) in enumerate(data_loader): self.save_counter += 1 # measure data loading time data_time.update(time.time() - end) update = False if i % self.iter_size == self.iter_size - 1: update = True # do a train/evaluate iteration stats = self.iterate(src, tgt, update, training=training) loss_per_token, loss_per_sentence, num_toks = stats # measure accuracy and record loss losses_per_token.update(loss_per_token, num_toks['tgt']) losses_per_sentence.update(loss_per_sentence, batch_size) # measure elapsed time elapsed = time.time() - end batch_time.update(elapsed) src_tok_time.update(num_toks['src'] / elapsed) tgt_tok_time.update(num_toks['tgt'] / elapsed) tot_num_toks = num_toks['tgt'] + num_toks['src'] tot_tok_time.update(tot_num_toks / elapsed) self.loss = losses_per_token.avg if training and i in eval_iters: assert self.translator is not None test_bleu, _ = self.translator.run(calc_bleu=True, epoch=self.epoch, iteration=i) log = [] log += [f'TRAIN [{self.epoch}][{i}/{len(data_loader)}]'] log += [f'BLEU: {test_bleu:.2f}'] log = '\t'.join(log) logging.info(log) self.model.train() self.preallocate(data_loader.batch_size, data_loader.dataset.max_len, training=True) if i % self.print_freq == 0: phase = 'TRAIN' if training else 'VALIDATION' log = [] log += [f'{phase} [{self.epoch}][{i}/{len(data_loader)}]'] log += [f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})'] log += [f'Data {data_time.val:.2e} ({data_time.avg:.2e})'] log += [ f'Tok/s {tot_tok_time.val:.0f} ({tot_tok_time.avg:.0f})' ] if self.verbose: log += [ f'Src tok/s {src_tok_time.val:.0f} ({src_tok_time.avg:.0f})' ] log += [ f'Tgt tok/s {tgt_tok_time.val:.0f} ({tgt_tok_time.avg:.0f})' ] log += [ f'Loss/sentence {losses_per_sentence.val:.1f} ({losses_per_sentence.avg:.1f})' ] log += [ f'Loss/tok {losses_per_token.val:.4f} ({losses_per_token.avg:.4f})' ] if training: lr = self.optimizer.param_groups[0]['lr'] log += [f'LR {lr:.3e}'] log = '\t'.join(log) logging.info(log) save_chkpt = (self.save_counter % self.save_freq) == (self.save_freq - 1) if training and save_chkpt: self.save_counter = 0 self.save_info['iteration'] = i identifier = next(self.checkpoint_counter, -1) if identifier != -1: with sync_workers() as rank: if rank == 0: self.save(identifier=identifier) end = time.time() tot_tok_time.reduce('sum') losses_per_token.reduce('mean') return losses_per_token.avg, tot_tok_time.avg def preallocate(self, batch_size, max_length, training): """ Generates maximum sequence length batch and runs forward and backward pass without updating model parameters. 
:param batch_size: batch size for preallocation :param max_length: max sequence length for preallocation :param training: if True preallocates memory for backward pass """ if self.prealloc_mode == 'always' or (self.prealloc_mode == 'once' and not self.preallocated): logging.info('Executing preallocation') torch.cuda.empty_cache() src_length = [max_length] * batch_size tgt_length = [max_length] * batch_size if self.batch_first: shape = (batch_size, max_length) else: shape = (max_length, batch_size) src = torch.full(shape, 4, dtype=torch.int64) tgt = torch.full(shape, 4, dtype=torch.int64) src = src, src_length tgt = tgt, tgt_length self.iterate(src, tgt, update=False, training=training) self.model.zero_grad() self.preallocated = True def optimize(self, data_loader): """ Sets model in training mode, preallocates memory and runs training on data provided by data_loader. :param data_loader: data loader """ torch.set_grad_enabled(True) self.model.train() self.preallocate(data_loader.batch_size, data_loader.dataset.max_len, training=True) output = self.feed_data(data_loader, training=True) self.model.zero_grad() return output def evaluate(self, data_loader): """ Sets model in eval mode, disables gradients, preallocates memory and runs validation on data provided by data_loader. :param data_loader: data loader """ torch.set_grad_enabled(False) self.model.eval() self.preallocate(data_loader.batch_size, data_loader.dataset.max_len, training=False) output = self.feed_data(data_loader, training=False) self.model.zero_grad() return output def load(self, filename): """ Loads checkpoint from filename. :param filename: path to the checkpoint file """ if os.path.isfile(filename): checkpoint = torch.load(filename, map_location={'cuda:0': 'cpu'}) if self.distributed: self.model.module.load_state_dict(checkpoint['state_dict']) else: self.model.load_state_dict(checkpoint['state_dict']) self.fp_optimizer.initialize_model(self.model) self.optimizer.load_state_dict(checkpoint['optimizer']) assert self.scheduler is not None self.scheduler.load_state_dict(checkpoint['scheduler']) self.epoch = checkpoint['epoch'] self.loss = checkpoint['loss'] logging.info(f'Loaded checkpoint {filename} (epoch {self.epoch})') else: logging.error(f'Invalid checkpoint: {filename}') def save(self, identifier=None, is_best=False, save_all=False): """ Stores checkpoint to a file. :param identifier: identifier for periodic checkpoint :param is_best: if True stores checkpoint to 'model_best.pth' :param save_all: if True stores checkpoint after completed training epoch """ def write_checkpoint(state, filename): filename = os.path.join(self.save_path, filename) logging.info(f'Saving model to {filename}') torch.save(state, filename) if self.distributed: model_state = self.model.module.state_dict() else: model_state = self.model.state_dict() assert self.scheduler is not None state = { 'epoch': self.epoch, 'state_dict': model_state, 'optimizer': self.optimizer.state_dict(), 'scheduler': self.scheduler.state_dict(), 'loss': getattr(self, 'loss', None), } state = dict(list(state.items()) + list(self.save_info.items())) if identifier is not None: filename = self.checkpoint_filename % identifier write_checkpoint(state, filename) if is_best: filename = 'model_best.pth' write_checkpoint(state, filename) if save_all: filename = f'checkpoint_epoch_{self.epoch:03d}.pth' write_checkpoint(state, filename)
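# --- Hedged sketch: the rotating checkpoint names used by Seq2SeqTrainer.save ---
# `checkpoint_counter = cycle(range(keep_checkpoints))` hands out identifiers
# 0..keep_checkpoints-1 forever, so periodic saves overwrite the oldest file and at most
# `keep_checkpoints` periodic checkpoints exist on disk. Minimal stand-alone version;
# the Linear/Adam objects below are placeholders for the real model and optimizer.
import os
from itertools import cycle
import torch
import torch.nn as nn

save_path = "."
checkpoint_filename = "checkpoint%s.pth"
keep_checkpoints = 5
checkpoint_counter = cycle(range(keep_checkpoints))   # 0, 1, 2, 3, 4, 0, 1, ...

model = nn.Linear(8, 8)
optimizer = torch.optim.Adam(model.parameters())

def save_rotating_checkpoint():
    identifier = next(checkpoint_counter)
    state = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    filename = os.path.join(save_path, checkpoint_filename % identifier)
    torch.save(state, filename)
    return filename

print(save_rotating_checkpoint())   # ./checkpoint0.pth on the first call, then ./checkpoint1.pth, ...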
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument( "--test_set", default='story', type=str, #choices=['story', 'news', 'chat', 'train'], help="Choose the test set.") parser.add_argument("--no_logit_mask", action='store_true', help="Whether not to use logit mask") parser.add_argument("--eval_every_epoch", action='store_true', help="Whether to evaluate for every epoch") parser.add_argument("--use_weight", action='store_true', help="Whether to use class-balancing weight") parser.add_argument( "--state_dir", default="", type=str, help= "Where to load state dict instead of using Google pre-trained model") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") processor = DataProcessor(args.test_set) label_list = processor.get_labels(args.data_dir) num_labels = len(label_list) logger.info("num_labels:" + str(num_labels)) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: #raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if os.path.exists(os.path.join(args.output_dir, WEIGHTS_NAME)): output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) config = BertConfig(output_config_file) model = 
PolyphonyLSTM(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: raise ValueError( "Output directory ({}) already exists but no model checkpoint was found." .format(args.output_dir)) else: os.makedirs(args.output_dir, exist_ok=True) if args.state_dir and os.path.exists(args.state_dir): state_dict = torch.load(args.state_dir) print("Using my own BERT state dict.") elif args.state_dir and not os.path.exists(args.state_dir): print( "Warning: the state dict does not exist, using the Google pre-trained model instead." ) state_dict = None else: state_dict = None model = PolyphonyLSTM.from_pretrained(args.bert_model, cache_dir=cache_dir, state_dict=state_dict, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) if os.path.exists(os.path.join(args.output_dir, OPTIMIZER_NAME)): output_optimizer_file = os.path.join(args.output_dir, OPTIMIZER_NAME) optimizer.load_state_dict(torch.load(output_optimizer_file)) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features, masks, weight = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) if args.eval_every_epoch: eval_examples = processor.get_dev_examples(args.data_dir) eval_features, masks, weight = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) if args.no_logit_mask: print("Remove logit mask") masks = None if not args.use_weight: weight = None print(weight) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_label_poss = torch.tensor([f.label_pos for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids, all_label_poss) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = 
DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for ep in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, label_ids, label_poss = batch # print(masks.size()) loss = model(input_ids, input_mask, label_ids, logit_masks=masks, weight=weight) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.eval_every_epoch: # evaluate for every epoch # save model and load for a single GPU model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME + '_' + str(ep)) torch.save(model_to_save.state_dict(), output_model_file) output_optimizer_file = os.path.join( args.output_dir, OPTIMIZER_NAME + '_' + str(ep)) torch.save(optimizer.state_dict(), output_optimizer_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME + '_' + str(ep)) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model_eval = PolyphonyLSTM(config, num_labels=num_labels) model_eval.load_state_dict(torch.load(output_model_file)) model_eval.to(device) if args.no_logit_mask: print("Remove logit mask") masks = None else: masks = masks.to(device) chars = [f.char for f in eval_features] print(len(set(chars)), sorted(list(set(chars)))) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor( [f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=torch.long) all_label_poss = torch.tensor( [f.label_pos for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids, all_label_poss) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model_eval.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 res_list = [] for input_ids, input_mask, label_ids, label_poss in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) label_ids = label_ids.to(device) label_poss = label_poss.to(device) with torch.no_grad(): tmp_eval_loss = model_eval(input_ids, input_mask, label_ids, logit_masks=masks) logits = model_eval(input_ids, 
input_mask, label_ids, logit_masks=masks, cal_loss=False) # print(logits.size()) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() res_list += accuracy_list(logits, label_ids, label_poss) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps loss = tr_loss / nb_tr_steps if args.do_train else None acc = sum(res_list) / len(res_list) char_count = {k: [] for k in list(set(chars))} for i, c in enumerate(chars): char_count[c].append(res_list[i]) char_acc = { k: sum(char_count[k]) / len(char_count[k]) for k in char_count } result = { 'epoch': ep + 1, 'eval_loss': eval_loss, 'eval_accuracy': acc, 'global_step': global_step, 'loss': loss } logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) output_eval_file = os.path.join( args.output_dir, "epoch_" + str(ep + 1) + ".txt") with open(output_eval_file, 'w') as f: f.write( json.dumps(result, ensure_ascii=False) + '\n' + json.dumps(char_acc, ensure_ascii=False)) # multi processing # if n_gpu > 1: # model = torch.nn.DataParallel(model) if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_optimizer_file = os.path.join(args.output_dir, OPTIMIZER_NAME) torch.save(optimizer.state_dict(), output_optimizer_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = PolyphonyLSTM(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: # model = BertForPolyphonyMulti.from_pretrained(args.bert_model, num_labels = num_labels) pass model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) config = BertConfig(output_config_file) model = PolyphonyLSTM(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) model.to(device) eval_examples = processor.get_dev_examples(args.data_dir) eval_features, masks, weight = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) if args.no_logit_mask: print("Remove logit mask") masks = None else: masks = masks.to(device) chars = [f.char for f in eval_features] print(len(set(chars)), sorted(list(set(chars)))) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_label_poss = torch.tensor([f.label_pos for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids, all_label_poss) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) 
model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 res_list = [] # masks = masks.to(device) for input_ids, input_mask, label_ids, label_poss in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) label_ids = label_ids.to(device) label_poss = label_poss.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, input_mask, label_ids, logit_masks=masks) logits = model(input_ids, input_mask, label_ids, logit_masks=masks, cal_loss=False) # print(logits.size()) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) res_list += accuracy_list(logits, label_ids, label_poss) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None acc = sum(res_list) / len(res_list) char_count = {k: [] for k in list(set(chars))} for i, c in enumerate(chars): char_count[c].append(res_list[i]) char_acc = { k: sum(char_count[k]) / len(char_count[k]) for k in char_count } result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss, 'acc': acc } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) for key in sorted(char_acc.keys()): logger.info(" %s = %s", key, str(char_acc[key])) writer.write("%s = %s\n" % (key, str(char_acc[key]))) print("mean accuracy", sum(char_acc[c] for c in char_acc) / len(char_acc)) output_acc_file = os.path.join(args.output_dir, args.test_set + ".json") output_reslist_file = os.path.join(args.output_dir, args.test_set + "reslist.json") with open(output_acc_file, "w") as f: f.write(json.dumps(char_acc, ensure_ascii=False)) with open(output_reslist_file, "w") as f: f.write(json.dumps(res_list, ensure_ascii=False))
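# --- Hedged sketch: the per-character accuracy bookkeeping from the eval loops above ---
# `accuracy_list` yields one 0/1 entry per example; grouping those entries by the
# evaluated character gives `char_acc`, and averaging `char_acc` gives the
# "mean accuracy" printed at the end. The sample characters and results below are made up.
chars = ["行", "行", "乐", "乐", "乐"]      # one character label per evaluated example
res_list = [1, 0, 1, 1, 0]                  # 1 = prediction correct for that example

char_count = {c: [] for c in set(chars)}
for c, correct in zip(chars, res_list):
    char_count[c].append(correct)

char_acc = {c: sum(v) / len(v) for c, v in char_count.items()}
micro_acc = sum(res_list) / len(res_list)            # `acc` in the script
macro_acc = sum(char_acc.values()) / len(char_acc)   # the printed "mean accuracy"
print(char_acc, micro_acc, macro_acc)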
with torch.no_grad(): test_acc, test_cnt = test(testloader, args.save_test, epoch) test_acc = torch.Tensor([test_acc * test_cnt]).to(device) test_cnt = torch.Tensor([test_cnt]).to(device) if args.local_rank >= 0: distrib.reduce(test_acc, 0) distrib.reduce(test_cnt, 0) if args.local_rank == 0: test_acc /= test_cnt test_acc = test_acc.item() if args.local_rank <= 0: print() print('Accuracy after epoch {0}: {1}'.format(epoch, test_acc)) print() if args.test_only: break if args.local_rank <= 0 and not args.test_only: is_best = test_acc > best_acc best_acc = max(best_acc, test_acc) save_checkpoint( { 'epoch': epoch, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'best_acc': best_acc }, checkpoint=args.checkpoint, is_best=is_best)
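# --- Hedged sketch: reducing a weighted accuracy across ranks, as in the snippet above ---
# Each rank contributes acc*cnt and cnt; rank 0 sums both and divides, so ranks that saw
# more test examples carry proportionally more weight. Stand-alone version that simply
# returns the local value when no process group is initialized.
import torch
import torch.distributed as dist

def reduce_accuracy(local_acc, local_cnt, device="cpu"):
    acc = torch.tensor([local_acc * local_cnt], dtype=torch.float32, device=device)
    cnt = torch.tensor([float(local_cnt)], device=device)
    if dist.is_available() and dist.is_initialized():
        dist.reduce(acc, dst=0)   # sum of acc*cnt over ranks, delivered to rank 0
        dist.reduce(cnt, dst=0)   # total example count, delivered to rank 0
    return (acc / cnt).item()

print(reduce_accuracy(0.9, 128))   # single process: just returns 0.9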
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_training_data', type=Path, required=True) parser.add_argument('--pregenerated_dev_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument( '--bert_model', type=str, required=True, help='Bert pre-trained model selected in the list: bert-base-uncased, ' 'bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.' ) parser.add_argument('--do_lower_case', action='store_true') parser.add_argument( '--reduce_memory', action='store_true', help= 'Store training data as on-disc memmaps to massively reduce memory usage' ) parser.add_argument('--epochs', type=int, default=3, help='Number of epochs to train for') parser.add_argument('--local_rank', type=int, default=-1, help='local_rank for distributed training on gpus') parser.add_argument('--no_cuda', action='store_true', help='Whether not to use CUDA when available') parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= 'Number of updates steps to accumulate before performing a backward/update pass.' ) parser.add_argument('--train_batch_size', default=32, type=int, help='Total batch size for training.') parser.add_argument( '--fp16', action='store_true', help='Whether to use 16-bit float precision instead of 32-bit') parser.add_argument( '--loss_scale', type=float, default=0, help= 'Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n' '0 (default value): dynamic loss scaling.\n' 'Positive power of 2: static loss scaling value.\n') parser.add_argument( '--warmup_proportion', default=0.1, type=float, help= 'Proportion of training to perform linear learning rate warmup for. ' 'E.g., 0.1 = 10%% of training.') parser.add_argument('--learning_rate', default=3e-5, type=float, help='The initial learning rate for Adam.') parser.add_argument('--seed', type=int, default=42, help='random seed for initialization') args = parser.parse_args() assert args.pregenerated_training_data.is_dir(), \ '--pregenerated_training_data should point to the folder of files made by pregenerate_training_data.py!' samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_training_data / f'epoch_{i}.json' metrics_file = args.pregenerated_training_data / f'epoch_{i}_metrics.json' if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit('No training data was found!') print( f'Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).' ) print( 'This script will loop over the available data, but training diversity may be negatively impacted.' ) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu') n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device('cuda', args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.info( 'device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}'. 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( 'Invalid gradient_accumulation_steps parameter: {}, should be >= 1' .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning( f'Output directory ({args.output_dir}) already exists and is not empty!' ) args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( 'Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.' ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( 'Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.' 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) # Track loss train_loss_history = list() dev_loss_history = list() # Start training global_step = 0 logging.info('***** Running training *****') logging.info(f' Num examples = {total_train_examples}') logging.info(f' Batch size = {args.train_batch_size}') logging.info(f' Num steps = {num_train_optimization_steps} \n') for epoch in range(args.epochs): # Train model model.train() epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_training_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, train_or_dev='train', reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f'Epoch {epoch}') as train_pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 train_pbar.update(1) mean_train_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps if step % 10 == 0: train_loss_history.append((epoch, mean_train_loss)) train_pbar.set_postfix_str(f'Loss: {mean_train_loss:.5f}') if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Evaluate dev loss model.eval() dev_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_dev_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, train_or_dev='dev', reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(dev_dataset) else: train_sampler = DistributedSampler(dev_dataset) dev_dataloader = DataLoader(dev_dataset, sampler=train_sampler, batch_size=args.train_batch_size) dev_loss = 0 nb_dev_examples, nb_dev_steps = 0, 0 with tqdm(total=len(dev_dataloader), desc=f'Epoch {epoch}') as dev_pbar: for step, batch in enumerate(dev_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() dev_loss += loss.item() nb_dev_examples += input_ids.size(0) nb_dev_steps += 1 dev_pbar.update(1) mean_dev_loss = dev_loss * args.gradient_accumulation_steps / nb_dev_steps dev_pbar.set_postfix_str(f'Loss: {mean_dev_loss:.5f}') dev_loss_history.append( (epoch, mean_dev_loss)) # Only collect final mean dev loss # Save training progress with optimizer logging.info('** ** * Saving training progress * ** **') Path(args.output_dir / f'{epoch}/').mkdir(exist_ok=True) output_model_file = args.output_dir / f'{epoch}/model_and_opt.bin' torch.save( { 'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'loss': tr_loss, }, str(output_model_file)) # Save easily-loadable model module logging.info(f'** ** * Saving fine-tuned model {epoch} * ** ** \n') model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = args.output_dir / f'{epoch}/{WEIGHTS_NAME}' output_config_file = args.output_dir / f'{epoch}/{CONFIG_NAME}' torch.save(model_to_save.state_dict(), str(output_model_file)) model_to_save.config.to_json_file(str(output_config_file)) tokenizer.save_vocabulary(args.output_dir) # Save loss history after every epoch with open(args.output_dir / f'{epoch}/loss_history.json', 'a') as h: hist = {'dev': dev_loss_history, 'train': train_loss_history} h.write(f'{json.dumps(hist)}\n')
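# --- Hedged sketch: why the running means above multiply by gradient_accumulation_steps ---
# Each micro-batch loss is divided by gradient_accumulation_steps before backward(), so
# the accumulated `tr_loss` has to be scaled back up to report a per-batch loss.
# The numbers below are made up purely to show the arithmetic.
gradient_accumulation_steps = 4
micro_batch_losses = [2.0, 1.6, 1.2, 0.8, 0.9, 1.1, 1.0, 1.0]

tr_loss, nb_tr_steps = 0.0, 0
for raw_loss in micro_batch_losses:
    loss = raw_loss / gradient_accumulation_steps    # what .backward() sees
    tr_loss += loss                                  # loss.item() in the script
    nb_tr_steps += 1
    mean_train_loss = tr_loss * gradient_accumulation_steps / nb_tr_steps

# Equals the plain average of the raw micro-batch losses.
print(round(mean_train_loss, 4), round(sum(micro_batch_losses) / len(micro_batch_losses), 4))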
def main(): parser = argparse.ArgumentParser(fromfile_prefix_chars="@") parser.add_argument("--pregenerated_data", type=Path, required=True, help="The input train corpus.") parser.add_argument("--epochs", type=int, required=True) parser.add_argument("--bert_model", type=str, required=True) parser.add_argument("--bert_config_file", type=str, default="bert_config.json") parser.add_argument("--vocab_file", type=str, default="senti_vocab.txt") parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument("--model_name", type=str, default="senti_base_model") parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument("--world_size", type=int, default=4) parser.add_argument("--start_rank", type=int, default=0) parser.add_argument("--server", type=str, default="tcp://127.0.0.1:1234") parser.add_argument("--load_model", action="store_true") parser.add_argument("--load_model_name", type=str, default="large_model") parser.add_argument("--save_step", type=int, default=100000) parser.add_argument("--train_batch_size", default=4, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=1e-4, type=float, help="The initial learning rate for Adam.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" print("local_rank : ", args.local_rank) samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." 
) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl', init_method=args.server, rank=args.local_rank + args.start_rank, world_size=args.world_size) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logger.warning( f"Output directory ({args.output_dir}) already exists and is not empty!" ) args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = Tokenizer( os.path.join(args.bert_model, "senti_vocab.txt"), os.path.join(args.bert_model, "RoBERTa_Sentiment_kor")) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = math.ceil(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = math.ceil( num_train_optimization_steps / torch.distributed.get_world_size()) # Prepare model config = BertConfig.from_json_file( os.path.join(args.bert_model, args.bert_config_file)) logger.info('{}'.format(config)) ############################################### # Load Model if args.load_model: load_model_name = os.path.join(args.output_dir, args.load_model_name) model = BertForPreTraining.from_pretrained( args.bert_model, state_dict=torch.load(load_model_name)["state_dict"]) else: model = BertForPreTraining(config) ############################################### if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP model = DDP(model) except ImportError: from torch.nn.parallel import DistributedDataParallel as DDP model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) epoch0 = 0 global_step = 0 if args.load_model: ############################################### # Load Model logger.info(f"***** Load Model {args.load_model_name} *****") loaded_states = torch.load(os.path.join(args.output_dir, args.load_model_name), map_location=device) optimizer.load_state_dict(loaded_states["optimizer"]) regex = re.compile(r'\d+epoch') epoch0 = int( regex.findall(args.load_model_name)[-1].replace('epoch', '')) logger.info('extract {} -> epoch0 : {}'.format(args.load_model_name, epoch0)) ############################################### logger.info("***** Running training *****") logger.info(f" Num examples = {total_train_examples}") logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() # model.eval() for epoch in range(epoch0, args.epochs): epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc='training..') as pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, lm_label_ids = batch loss = model(input_ids, input_mask, lm_label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps if (step + 1) % 50 == 0: pbar.set_description( "Epoch = {}, global_step = {}, loss = {:.5f}".format( epoch, global_step + 1, mean_loss)) logger.info( "Epoch = {}, global_step = {}, loss = {:.5f}".format( epoch, global_step + 1, mean_loss)) if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % args.save_step == 0: if args.local_rank == -1 or args.local_rank == 0: logger.info( "** ** * Saving {} - step model ** ** * ".format( global_step)) output_model_file = os.path.join( args.output_dir, args.model_name + "_{}step".format(global_step)) model_to_save = model.module if hasattr( model, 'module') else model state = { "state_dict": model_to_save.state_dict(), "optimizer": optimizer.state_dict() } torch.save(state, output_model_file) if args.local_rank == -1 or args.local_rank == 0: logger.info( "** ** * Saving {} - epoch model ** ** * ".format(epoch)) output_model_file = os.path.join( args.output_dir, args.model_name + "_{}epoch".format(epoch + 1)) model_to_save = model.module if hasattr(model, 'module') else model state = { "state_dict": model_to_save.state_dict(), "optimizer": optimizer.state_dict() } torch.save(state, output_model_file)
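# --- Hedged sketch: resuming at the right epoch by parsing the checkpoint name ---
# The script above saves files like "<model_name>_{N}epoch" and, on --load_model,
# recovers the starting epoch with re.findall(r'\d+epoch', ...). Minimal stand-alone
# version of that parsing; the example file names are illustrative only, and the
# fall-back to 0 for step-only checkpoints is an added assumption.
import re

def epoch_from_checkpoint_name(name):
    matches = re.findall(r"\d+epoch", name)
    if not matches:
        return 0    # assumed fall-back: start from epoch 0 if no epoch tag is present
    return int(matches[-1].replace("epoch", ""))

print(epoch_from_checkpoint_name("senti_base_model_3epoch"))       # -> 3
print(epoch_from_checkpoint_name("senti_base_model_120000step"))   # -> 0 (step checkpoint)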
def main_worker(gpu, ngpus_per_node, args): # GPU args.gpu = gpu print("There are ", torch.cuda.device_count(), " available GPUs!") # print('Setting GPUs {}'.format(args.device)) print('Using GPU devices {}'.format(devices)) device = torch.device('cuda', args.gpu) torch.cuda.set_device(device) print('Current single GPU: {}'.format(torch.cuda.current_device())) # randomness np.random.seed(args.seed) prng = np.random.RandomState() torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # For multiprocessing distributed training, rank needs to be the global rank among all the processes args.rank = args.rank * ngpus_per_node + gpu print('Setting rank', args.rank) recon_attempt = 1 connected = False if args.rank != 0: # Stall to have rank 0 node go first time.sleep(3) while not connected: try: dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) connected = True print('Established connection. Rank:', args.rank) except Exception as e: # Sometimes the head node launches after the worker, which would cause an issue print('Failed to init process group. Retrying...', recon_attempt, e) recon_attempt += 1 time.sleep(10) # logging if args.rank == 0: save_folder = os.path.join(args.out_dir, args.experiment) os.makedirs(save_folder, exist_ok=True) t_writer = SummaryWriter(os.path.join(save_folder, 'train'), flush_secs=5) v_writer = SummaryWriter(os.path.join(save_folder, 'val'), flush_secs=5) importlib.reload(logging) logging.basicConfig(filename=os.path.join(save_folder, 'train.log'), level=logging.INFO, format='%(asctime)s--- %(message)s') logging.info( '\n*******************************************************************************\n' ) logging.info("the configuration:") logging.info(str(args).replace(',', '\n')) print('Loading models...') cache_dir = os.path.join(args.out_dir, 'model_cache') os.makedirs(cache_dir, exist_ok=True) # Load pre-trained teacher tokenizer (vocabulary) tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=cache_dir) # Hack to allow tokenizing longer sequences. 
tokenizer.max_len = int(1e12) model = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir=cache_dir) if args.load: if args.load == 'none': print('Randomly initializing model weights...') model.apply(model.init_weights) else: print('Loading model weights...') model.load_state_dict( torch.load(os.path.join(args.load, 'model_latest.pt'), map_location='cpu')) gc.collect() print('params:', num_params(model)) # gpt2: 124439808 print('Done.') print('Setup data...') # Batch and sequence length schedule assert len(args.batch_sizes) == len(args.seq_lens) batch_schedule = list( zip(map(int, args.batch_sizes), map(int, args.seq_lens))) assert len( batch_schedule) == 2, 'Currently not supporting multiple schedule' cur_b_schedule = len(batch_schedule) - 1 if args.switch_time == 0 else 0 # add special tokens special_tokens_dict = { 'pad_token': '<|startoftext|>', 'cls_token': '<|startofcond|>', 'sep_token': '<|sepofcond|>', 'mask_token': '<|endofcond|>' } num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) print('We have added', num_added_toks, 'special tokens') # Notice: resize_token_embeddings expect to receive the full size of the new vocab model.resize_token_embeddings(len(tokenizer)) assert tokenizer.pad_token == '<|startoftext|>' print('Batch schedule', batch_schedule) train_loader, val_loader = prepare_dataset( args.data_dir, args.dataset, tokenizer, batch_schedule[cur_b_schedule][0], batch_schedule[cur_b_schedule][1], batch_schedule[-1][0], batch_schedule[-1][1], num_workers=args.workers, model_type=args.model_type) print('Done.') if args.fp16: model = model.half() model = model.to(device) print('Wrapping models and optimizers...') # Apply linear scaling rule to increase batch size for short sequence training. lr_schedule = switch_schedule( linear_schedule(args), batch_schedule[cur_b_schedule][0] / batch_schedule[-1][0], int(args.iterations * args.switch_time)) params = [p for p in model.parameters() if p.requires_grad] optimizer = FusedAdam(params, lr=args.lr) # warning, suggest to use model, optimizer = amp.initialize(model, optimizer, opt_level='O1') optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True, verbose=False) scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer.optimizer, lr_schedule) loss_model = SimpleDistributedDataParallel(model, args.world_size) loss_fn = nn.CrossEntropyLoss(reduction='none') print('Done.') print('Begin training iterations') logging.info("Begin training iterations") max_val_batches = 1000 # max num. of val batches save_interval = 50000 # num. of inter to save a model logging.info("Total iteration: %d" % args.iterations) e = 0 # number of epoch num_iters = 0 optimizer.zero_grad() def val_step(val_loader): with torch.no_grad(): logging.info("Validation loop. Batches: %d" % len(val_loader)) logging.info("Validation loop. 
max_val_batches: %d" % max_val_batches) stats = [] # Validation # input_tokens, target_tokens, mask = next(iter(val_loader)) with tqdm(total=min(len(val_loader), max_val_batches)) as pbar: for i, (input_tokens, target_tokens, mask) in enumerate(val_loader): loss, ce_loss = compute_loss(device, model, input_tokens, target_tokens, mask, loss_fn) stats.append([loss.item(), math.exp(ce_loss.item())]) if i > max_val_batches: break pbar.update(1) stats = np.mean(stats, axis=0) v_writer.add_scalar('loss', stats[0], num_iters) v_writer.add_scalar('ppl', stats[1], num_iters) logging.info('val loss: %.4f' % stats[0]) logging.info('val ppl: %.4f' % stats[1]) while num_iters < args.iterations: # Run epoch st = time.time() # Training print('Training loop. Batches:', len(train_loader)) logging.info( '\n----------------------------------------------------------------------' ) logging.info("Training loop. Batches: %d" % len(train_loader)) logging.info("Training loop. save_interval: %d" % save_interval) # train_iter = iter(train_loader); input_tokens, target_tokens, mask = next(train_iter) with tqdm(total=len(train_loader)) as pbar: for i, (input_tokens, target_tokens, mask) in enumerate(train_loader): # Normal grad step optimizer.zero_grad() loss, ce_loss = train_step(device, loss_model, optimizer, input_tokens, target_tokens, mask, loss_fn) optimizer.step() if args.rank == 0: lr = scheduler.get_last_lr()[0] # Log to Tensorboard t_writer.add_scalar('loss', loss, num_iters) t_writer.add_scalar('ppl', math.exp(ce_loss), num_iters) t_writer.add_scalar('lr', lr, num_iters) t_writer.add_scalar('iter_time', time.time() - st, num_iters) st = time.time() end = num_iters >= args.iterations if args.warmup != -1: scheduler.step() if end: break num_iters += 1 pbar.update(1) if args.switch_time > 0 and num_iters == int( args.iterations * args.switch_time): print('Switch to long sequence training') logging.info("Switch to long sequence training") cur_b_schedule += 1 train_loader, val_loader = prepare_dataset( args.dataset_dir, args.dataset_name, tokenizer, batch_schedule[cur_b_schedule][0], batch_schedule[cur_b_schedule][1], batch_schedule[-1][0], batch_schedule[-1][1]) e += 1 logging.info("Training loop. The ith epoch completed: %d" % e) if args.rank == 0: print('Saving model...') logging.info( '\n------------------------------------------------------') logging.info("Iteration completed: %d, remained %d" % (num_iters, args.iterations - num_iters)) logging.info("Saving model...") #torch.save(model.state_dict(), os.path.join(save_folder, 'model_{:02d}.pt'.format(num_iters // save_interval))) torch.save(model.state_dict(), os.path.join(save_folder, 'model_latest.pt')) torch.save(optimizer.state_dict(), os.path.join(save_folder, 'opt_latest.pt')) torch.save(scheduler.state_dict(), os.path.join(save_folder, 'scheduler_latest.pt')) val_step(val_loader) print('Training complete.') logging.info("Training complete.")
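# The helpers linear_schedule and switch_schedule used above are not defined in this
# excerpt. Below is a minimal sketch of what they plausibly do, based only on how they
# are called (a step -> LR-multiplier function for LambdaLR, scaled by the batch-size
# ratio until the switch iteration). Hypothetical re-implementation, not the original code.
def linear_schedule(args):
    def schedule(step):
        # linear warmup followed by linear decay over args.iterations
        if args.warmup != -1 and step < args.warmup:
            return step / max(1, args.warmup)
        return max(0.0, 1.0 - step / max(1, args.iterations))
    return schedule

def switch_schedule(schedule, mult, switch_step):
    # apply the linear-scaling-rule factor `mult` during the short-sequence phase
    def scaled(step):
        return (mult if step < switch_step else 1.0) * schedule(step)
    return scaled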
class ExperimentBase(): # TODO train and val as pipelines def __init__(self, cfg=None): cfg = cfg or self.cfg # try reading from static variable self.datasets = {} self.pipelines = {} self.init_config(cfg) self.init_transforms() def init_config(self, cfg): self.cfg = cfg self.workdir = Path(cfg['dir_checkpoint']) self.workdir.mkdir(exist_ok=True, parents=True) (self.workdir / 'config.json').write_text(cfg_json_encode(self.cfg)) def print_cfg(self): print_cfg(self.cfg) def init_transforms(self): """ Init and store transforms that take time to construct The transforms will be used in self.construct_default_pipeline """ # Lifecycle of a frame # in dataset: # dset.tr_post_load_pre_cache # dset.tr_output # in experiment: pass #def sampler_for_dset(self, role, dset): #tr_role = self.tr_input_per_role.get(role, None) #tr_in = self.tr_input if tr_role is None else TrsChain(tr_role, self.tr_input) #collate_fn = partial(self.dataloader_collate, tr_in, self.tr_input_post_batch) #args = self.loader_args_for_role(role) #return DataLoader(dset, collate_fn=collate_fn, **args) def set_dataset(self, role, dset): """ role "train" or "val" """ self.datasets[role] = dset def load_checkpoint(self, chk_name='chk_best.pth'): dir_chk = Path(self.cfg['dir_checkpoint']) path_chk = dir_chk / chk_name if path_chk.is_file(): log.info(f'Loading checkpoint found at {path_chk}') return torch.load(path_chk, map_location='cpu') else: log.info(f'No checkpoint at at {path_chk}') return None def init_net(self, role): """ Role: val or train - determines which checkpoint is loaded""" if role == 'train': chk = self.load_checkpoint(chk_name='chk_last.pth') chk_opt = self.load_checkpoint(chk_name='optimizer.pth') self.build_net(role, chk=chk) self.build_optimizer(role, chk_optimizer=chk_opt) self.net_mod.train() elif role == 'eval': chk = self.load_checkpoint(chk_name='chk_best.pth') self.build_net(role, chk=chk) self.net_mod.eval() else: raise NotImplementedError(f'role={role}') if chk is not None: self.state = GrumpyDict(chk['state']) else: self.state = train_state_init() def build_net(self, role, chk=None, chk_optimizer=None): """ Build net and optimizer (if we train) """ log.info('Building net') @staticmethod def load_checkpoint_to_net(net_mod, chk_object): (missing_keys, superfluous_keys) = net_mod.load_state_dict(chk_object['weights'], strict=False) if missing_keys: log.warning( f'Missing keys when loading a checkpoint: {missing_keys}') if superfluous_keys: log.warning( f'Missing keys when loading a checkpoint: {superfluous_keys}') def build_optimizer(self, role, chk_optimizer=None): log.info('Building optimizer') cfg_opt = self.cfg['train']['optimizer'] network = self.net_mod self.optimizer = AdamOptimizer( [p for p in network.parameters() if p.requires_grad], lr=cfg_opt['learn_rate'], weight_decay=cfg_opt.get('weight_decay', 0), ) self.learn_rate_scheduler = ReduceLROnPlateau( self.optimizer, patience=cfg_opt['lr_patience'], min_lr=cfg_opt['lr_min'], ) if chk_optimizer is not None: self.optimizer.load_state_dict(chk_optimizer['optimizer']) def init_loss(self): log.info('Building loss_mod') def init_log(self, fids_to_display=[]): """ :param fids_to_display: ids of frames to show in tensorboard """ # log for the current training run self.tboard = SummaryWriter(self.workdir / f"tb_{self.state['run_name']}") # save ground truth here to compare in tensorboard self.tboard_gt = SummaryWriter(self.workdir / 'tb_gt') self.tboard_img = SummaryWriter(self.workdir / 'tb_img') self.train_out_dir = self.workdir / 
f"imgs_{self.state['run_name']}" self.train_out_dir.mkdir(exist_ok=True, parents=True) # names of the frames to display def short_frame_name(fn): # remove directory path if '/' in fn: fn = os.path.basename(fn) return fn self.fids_to_display = set(fids_to_display) self.short_frame_names = { fid: short_frame_name(fid) for fid in self.fids_to_display } def log_selected_images(self, fid, frame, **_): if fid in self.fids_to_log: log.warning('log_selected_images: not implemented') def init_default_datasets(self): pass def init_pipelines(self): for role in ['train', 'val', 'test']: self.pipelines[role] = self.construct_default_pipeline(role) def get_epoch_limit(self): return self.cfg['train'].get('epoch_limit', None) def cuda_modules(self, attr_names): if torch.cuda.is_available(): attr_names = [attr_names] if isinstance(attr_names, str) else attr_names for an in attr_names: setattr(self, an, getattr(self, an).cuda()) def training_start_batch(self, **_): self.optimizer.zero_grad() def training_backpropagate(self, loss, **_): #if torch.any(torch.isnan(loss)): # print('Loss is NAN, cancelling backpropagation in batch') #raise Exception('Stopping training so we can investigate where the nan is coming from') #else: loss.backward() self.optimizer.step() def training_epoch_start(self, epoch_idx): self.net_mod.train() # set train mode for dropout and batch-norm def training_epoch(self, epoch_idx): self.training_epoch_start(epoch_idx) out_frames = self.pipelines['train'].execute( dset=self.datasets['train'], b_grad=True, b_pbar=False, b_accumulate=True, log_progress_interval=self.cfg['train'].get( 'progress_interval', None), short_epoch=self.cfg['train'].get('short_epoch_train', None), ) gc.collect() results_avg = Frame({ # the loss may be in fp16, let's average it at high precision to avoid NaN fn: np.mean(np.array([pf[fn] for pf in out_frames], dtype=np.float64)) for fn in out_frames[0].keys() if fn.lower().startswith('loss') }) self.training_epoch_finish(epoch_idx, results_avg) return results_avg['loss'] def training_epoch_finish(self, epoch_idx, results_avg): for name, loss_avg in results_avg.items(): self.tboard.add_scalar('train_' + name, loss_avg, epoch_idx) def val_epoch_start(self, epoch_idx): self.net_mod.eval() def val_epoch_finish(self, epoch_idx, results_avg): self.learn_rate_scheduler.step(results_avg['loss']) for name, loss_avg in results_avg.items(): self.tboard.add_scalar('val_' + name, loss_avg, epoch_idx) def val_epoch(self, epoch_idx): self.val_epoch_start(epoch_idx) out_frames = self.pipelines['val'].execute( dset=self.datasets['val'], b_grad=False, b_pbar=False, b_accumulate=True, short_epoch=self.cfg['train'].get('short_epoch_val', None), ) gc.collect() results_avg = Frame({ fn: np.mean([pf[fn] for pf in out_frames]) for fn in out_frames[0].keys() if fn.lower().startswith('loss') }) self.val_epoch_finish(epoch_idx, results_avg) return results_avg['loss'] def run_epoch(self, epoch_idx): gc.collect() epoch_limit = self.get_epoch_limit() log.info('E {ep:03d}{eplimit}\n train start'.format( ep=epoch_idx, eplimit=f' / {epoch_limit}' if epoch_limit is not None else '', )) t_train_start = time.time() loss_train = self.training_epoch(epoch_idx) gc.collect() t_val_start = time.time() log.info(' train finished t={tt}s loss_t={ls}, val starting'.format( tt=t_val_start - t_train_start, ls=loss_train, )) gc.collect() loss_val = self.val_epoch(epoch_idx) gc.collect() log.info(' val finished t={tt}s loss_e={ls}'.format( tt=time.time() - t_val_start, ls=loss_val, )) is_best = loss_val < 
self.state['best_loss_val'] if is_best: self.state['best_loss_val'] = loss_val is_chk_scheduled = epoch_idx % self.cfg['train'][ 'checkpoint_interval'] == 0 if is_best or is_chk_scheduled: self.save_checkpoint(epoch_idx, is_best, is_chk_scheduled) def save_checkpoint(self, epoch_idx, is_best, is_scheduled): # TODO separate methods for saving various parts of the experiment chk_dict = dict() chk_dict['weights'] = self.net_mod.state_dict() chk_dict['state'] = dict(self.state) path_best = self.workdir / 'chk_best.pth' path_last = self.workdir / 'chk_last.pth' if is_scheduled: pytorch_save_atomic(chk_dict, path_last) pytorch_save_atomic( dict(epoch_idx=epoch_idx, optimizer=self.optimizer.state_dict()), self.workdir / 'optimizer.pth', ) if is_best: log.info(' New best checkpoint') if is_scheduled: # we already saved to chk_last.pth shutil.copy(path_last, path_best) else: pytorch_save_atomic(chk_dict, path_best) def training_run(self, b_initial_eval=True): name = self.cfg['name'] log.info(f'Experiment {name} - train') path_stop = self.workdir / 'stop' if b_initial_eval: log.info('INIT\n initial val') loss_val = self.val_epoch(self.state['epoch_idx']) log.info(' init loss_e={le}'.format(le=loss_val)) self.state['best_loss_val'] = loss_val else: self.state['best_loss_val'] = 1e4 b_continue = True while b_continue: self.state['epoch_idx'] += 1 self.run_epoch(self.state['epoch_idx']) if path_stop.is_file(): log.info('Stop file detected') path_stop.unlink() # remove file b_continue = False epoch_limit = self.get_epoch_limit() if (epoch_limit is not None) and (self.state['epoch_idx'] >= epoch_limit): log.info(f'Reached epoch limit {epoch_limit}') b_continue = False @classmethod def training_procedure(cls): print(f'-- Training procesure for {cls.__name__} --') exp = cls() log_config_file(exp.workdir / 'training.log') log.info(f'Starting training job for {cls.__name__}') exp.print_cfg() try: exp.init_default_datasets() exp.init_net("train") log.info(f'Name of the run: {exp.state["run_name"]}') exp.init_transforms() exp.init_loss() exp.init_log() exp.init_pipelines() exp.training_run() # if training crashes, put the exception in the log except Exception as e: log.exception(f'Exception in taining procedure: {e}') def predict_sequence(self, dset, consumer=None, pbar=True): """ If consumer is specified, it will be used for online processing: frames will be given to it instead of being accumulated """ self.net_mod.eval() out_frames = self.pipelines['test'].execute( dset=self.datasets['test'], b_grad=False, b_pbar=pbar, b_accumulate=True, ) return out_frames def loader_args_for_role(self, role): if role == 'train': return dict( shuffle=True, batch_size=self.cfg['net']['batch_train'], num_workers=self.cfg['train'].get('num_workers', 0), drop_last=True, ) elif role == 'val' or role == 'test': return dict( shuffle=False, batch_size=self.cfg['net']['batch_eval'], num_workers=self.cfg['train'].get('num_workers', 0), drop_last=False, ) else: raise NotImplementedError("role: " + role) def construct_default_pipeline(self, role): if role == 'train': tr_batch = TrsChain([ TrCUDA(), self.training_start_batch, self.net_mod, self.loss_mod, self.training_backpropagate, ]) tr_output = TrsChain([ TrKeepFieldsByPrefix('loss'), # save loss for averaging later TrNP(), # clear away the gradients if any are left ]) elif role == 'val': tr_batch = TrsChain([ TrCUDA(), self.net_mod, self.loss_mod, ]) tr_output = TrsChain([ self.log_selected_images, TrKeepFieldsByPrefix('loss'), # save loss for averaging later TrNP(), # clear away 
the gradients if any are left ]) elif role == 'test': tr_batch = TrsChain([ TrCUDA(), self.net_mod, ]) tr_output = TrsChain([ TrNP(), tr_untorch_images, ]) return Pipeline( tr_batch=tr_batch, tr_output=tr_output, loader_args=self.loader_args_for_role(role), ) def run_evaluation(self, eval_obj, dset=None, b_one_batch=False): pipe_test = self.construct_default_pipeline('test') dset = dset or eval_obj.get_dset() eval_obj.construct_transforms(dset) pipe_test.tr_batch.append(eval_obj.tr_batch) pipe_test.tr_output.append(eval_obj.tr_output) log.info(f'Test pipeline: {pipe_test}') pipe_test.execute(dset, b_accumulate=False, b_one_batch=b_one_batch)
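# pytorch_save_atomic is called by save_checkpoint above but is not defined in this
# excerpt. A minimal sketch of the usual pattern it suggests: write to a temporary
# file first, then atomically rename it over the target, so a crash mid-save cannot
# corrupt an existing checkpoint. Hypothetical helper, names assumed.
import os
import torch

def pytorch_save_atomic(obj, path):
    tmp_path = str(path) + '.tmp'
    torch.save(obj, tmp_path)        # serialize to a sibling temp file
    os.replace(tmp_path, str(path))  # atomic rename over the target path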
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--src_file", default=None, type=str, help="The input data file name.") parser.add_argument("--tgt_file", default=None, type=str, help="The output data file name.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--config_path", default=None, type=str, help="Bert config file path.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--log_dir", default='', type=str, required=True, help="The output directory where the log will be written.") parser.add_argument("--model_recover_path", default=None, type=str, required=True, help="The file of fine-tuned pretraining model.") parser.add_argument("--optim_recover_path", default=None, type=str, help="The file of pretraining optimizer.") # Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--local_debug", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--label_smoothing", default=0, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.01, type=float, help="The weight decay rate for Adam.") parser.add_argument("--finetune_decay", action='store_true', help="Weight decay to the original weights.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument("--hidden_dropout_prob", default=0.1, type=float, help="Dropout rate for hidden states.") parser.add_argument("--attention_probs_dropout_prob", default=0.1, type=float, help="Dropout rate for attention probabilities.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp32_embedding', action='store_true', help= "Whether to use 32-bit float precision instead of 16-bit for embeddings" ) parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument( '--from_scratch', action='store_true', help= "Initialize parameters with random values (i.e., training from scratch)." ) parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--new_pos_ids', action='store_true', help="Use new position ids for LMs.") parser.add_argument('--tokenized_input', action='store_true', help="Whether the input is tokenized.") parser.add_argument('--max_len_a', type=int, default=0, help="Truncate_config: maximum length of segment A.") parser.add_argument('--max_len_b', type=int, default=0, help="Truncate_config: maximum length of segment B.") parser.add_argument( '--trunc_seg', default='', help="Truncate_config: first truncate segment A/B (option: a, b).") parser.add_argument( '--always_truncate_tail', action='store_true', help="Truncate_config: Whether we should always truncate tail.") parser.add_argument( "--mask_prob", default=0.15, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument( "--mask_prob_eos", default=0, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument('--max_pred', type=int, default=20, help="Max tokens of prediction.") parser.add_argument("--num_workers", default=0, type=int, help="Number of workers for the data loader.") parser.add_argument('--mask_source_words', action='store_true', help="Whether to mask source words for training") parser.add_argument('--skipgram_prb', type=float, default=0.0, help='prob of ngram mask') parser.add_argument('--skipgram_size', type=int, default=1, help='the max size of ngram mask') parser.add_argument('--mask_whole_word', action='store_true', help="Whether masking a whole word.") parser.add_argument('--do_l2r_training', action='store_true', help="Whether to do left to right training") parser.add_argument( '--has_sentence_oracle', action='store_true', help="Whether to have sentence level oracle for training. 
" "Only useful for summary generation") parser.add_argument('--max_position_embeddings', type=int, default=None, help="max position embeddings") parser.add_argument('--relax_projection', action='store_true', help="Use different projection layers for tasks.") parser.add_argument('--ffn_type', default=0, type=int, help="0: default mlp; 1: W((Wx+b) elem_prod x);") parser.add_argument('--num_qkv', default=0, type=int, help="Number of different <Q,K,V>.") parser.add_argument('--seg_emb', action='store_true', help="Using segment embedding for self-attention.") parser.add_argument( '--s2s_special_token', action='store_true', help="New special tokens ([S2S_SEP]/[S2S_CLS]) of S2S.") parser.add_argument('--s2s_add_segment', action='store_true', help="Additional segmental for the encoder of S2S.") parser.add_argument( '--s2s_share_segment', action='store_true', help= "Sharing segment embeddings for the encoder of S2S (used with --s2s_add_segment)." ) parser.add_argument('--pos_shift', action='store_true', help="Using position shift for fine-tuning.") args = parser.parse_args() assert Path( args.model_recover_path).exists(), "--model_recover_path doesn't exist" args.output_dir = args.output_dir.replace('[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) args.log_dir = args.log_dir.replace('[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) os.makedirs(args.output_dir, exist_ok=True) os.makedirs(args.log_dir, exist_ok=True) json.dump(args.__dict__, open(os.path.join(args.output_dir, 'opt.json'), 'w'), sort_keys=True, indent=2) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs dist.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if args.local_rank not in (-1, 0): # Make sure only the first process in distributed training will download model & vocab dist.barrier() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) if args.max_position_embeddings: tokenizer.max_len = args.max_position_embeddings data_tokenizer = WhitespaceTokenizer( ) if args.tokenized_input else tokenizer if args.local_rank == 0: dist.barrier() if args.do_train: print("Loading Train Dataset", args.data_dir) bi_uni_pipeline = [ seq2seq_loader.Preprocess4Seq2seq( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_source_words=args.mask_source_words, skipgram_prb=args.skipgram_prb, skipgram_size=args.skipgram_size, mask_whole_word=args.mask_whole_word, mode="s2s", has_oracle=args.has_sentence_oracle, num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift) ] file_oracle = None if args.has_sentence_oracle: file_oracle = os.path.join(args.data_dir, 'train.oracle') fn_src = os.path.join(args.data_dir, args.src_file if args.src_file else 'train.src') fn_tgt = os.path.join(args.data_dir, args.tgt_file if args.tgt_file else 'train.tgt') train_dataset = seq2seq_loader.Seq2SeqDataset( fn_src, fn_tgt, args.train_batch_size, data_tokenizer, args.max_seq_length, file_oracle=file_oracle, bi_uni_pipeline=bi_uni_pipeline, corpus_preprocessors=corpus_preprocessors) train_dataset.initial() print(len(train_dataset.ex_list)) print(train_dataset.batch_size) # assert 1==0 if args.local_rank == -1: train_sampler = RandomSampler(train_dataset, replacement=False) _batch_size = args.train_batch_size else: train_sampler = DistributedSampler(train_dataset) _batch_size = args.train_batch_size // dist.get_world_size() train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=_batch_size, sampler=train_sampler, num_workers=args.num_workers, collate_fn=seq2seq_loader.batch_list_to_batch_tensors, pin_memory=False) # c = 0 # for i_epoch in trange(0, int(args.num_train_epochs)+1, desc="Epoch", disable=args.local_rank not in (-1, 0)): # if args.local_rank != -1: # train_sampler.set_epoch(i_epoch) # iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)', # disable=args.local_rank not in (-1, 0)) # for step, batch in enumerate(iter_bar): # batch = [ # t.to(device) if t is not None else None for t in batch] # if args.has_sentence_oracle: # input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, sop_label, oracle_pos, oracle_weights, oracle_labels = batch # else: # input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, 
is_next, task_idx, sop_label = batch # oracle_pos, oracle_weights, oracle_labels = None, None, None # c += input_ids.shape[0] # # # print(input_ids) # # # # print(input_ids.shape) # # print(segment_ids) # # print(segment_ids.shape) # # print(is_next) # # print(task_idx) # # print(sop_label) # # print(task_idx.shape) # # for i in range(input_mask.shape[0]): # # print(input_mask[i]) # print(c) # print(train_dataset.c) # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps) # t_total = int(math.ceil(len(train_dataset.ex_list) / args.train_batch_size) t_total = int( len(train_dataloader) * args.num_train_epochs / args.gradient_accumulation_steps) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model recover_step = _get_max_epoch_model(args.output_dir) cls_num_labels = 2 type_vocab_size = 6 + \ (1 if args.s2s_add_segment else 0) if args.new_segment_ids else 2 num_sentlvl_labels = 2 if args.has_sentence_oracle else 0 relax_projection = 4 if args.relax_projection else 0 if args.local_rank not in (-1, 0): # Make sure only the first process in distributed training will download model & vocab dist.barrier() if (recover_step is None) and (args.model_recover_path is None): # if _state_dict == {}, the parameters are randomly initialized # if _state_dict == None, the parameters are initialized with bert-init _state_dict = {} if args.from_scratch else None model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=_state_dict, num_labels=cls_num_labels, num_rel=0, type_vocab_size=type_vocab_size, config_path=args.config_path, task_idx=3, num_sentlvl_labels=num_sentlvl_labels, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, relax_projection=relax_projection, new_pos_ids=args.new_pos_ids, ffn_type=args.ffn_type, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob, num_qkv=args.num_qkv, seg_emb=args.seg_emb, local_debug=args.local_debug) global_step = 0 else: if recover_step: logger.info("***** Recover model: %d *****", recover_step) model_recover = torch.load(os.path.join( args.output_dir, "model.{0}.bin".format(recover_step)), map_location='cpu') # recover_step == number of epochs global_step = math.floor(recover_step * t_total / args.num_train_epochs) elif args.model_recover_path: logger.info("***** Recover model: %s *****", args.model_recover_path) model_recover = torch.load(args.model_recover_path, map_location='cpu') global_step = 0 model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, num_rel=0, type_vocab_size=type_vocab_size, config_path=args.config_path, task_idx=3, num_sentlvl_labels=num_sentlvl_labels, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, relax_projection=relax_projection, new_pos_ids=args.new_pos_ids, ffn_type=args.ffn_type, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob, num_qkv=args.num_qkv, seg_emb=args.seg_emb, local_debug=args.local_debug) if args.local_rank == 0: dist.barrier() if args.fp16: model.half() if args.fp32_embedding: model.bert.embeddings.word_embeddings.float() model.bert.embeddings.position_embeddings.float() model.bert.embeddings.token_type_embeddings.float() 
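# Note on the block above: with --fp16 the whole model is cast to half precision,
# while --fp32_embedding casts the embedding tables back to float32. Keeping the
# embeddings in full precision is a common mixed-precision trick to limit rounding
# error in the embedding lookups; the rest of the network stays in fp16.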
model.to(device) if args.local_rank != -1: try: from torch.nn.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("DistributedDataParallel") model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: # model = torch.nn.DataParallel(model) model = DataParallelImbalance(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: # from apex.optimizers import FP16_Optimizer from pytorch_pretrained_bert.optimization_fp16 import FP16_Optimizer_State from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer_State(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer_State(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) if recover_step: logger.info("***** Recover optimizer: %d *****", recover_step) optim_recover = torch.load(os.path.join( args.output_dir, "optim.{0}.bin".format(recover_step)), map_location='cpu') if hasattr(optim_recover, 'state_dict'): optim_recover = optim_recover.state_dict() optimizer.load_state_dict(optim_recover) if args.loss_scale == 0: logger.info("***** Recover optimizer: dynamic_loss_scale *****") optimizer.dynamic_loss_scale = True logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.do_train: logger.info("***** Running training *****") logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", t_total) model.train() if recover_step: start_epoch = recover_step + 1 else: start_epoch = 1 for i_epoch in trange(start_epoch, int(args.num_train_epochs) + 1, desc="Epoch", disable=args.local_rank not in (-1, 0)): if args.local_rank != -1: train_sampler.set_epoch(i_epoch) iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)', disable=args.local_rank not in (-1, 0)) for step, batch in enumerate(iter_bar): batch = [ t.to(device) if t is not None else None for t in batch ] if args.has_sentence_oracle: input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, sop_label, oracle_pos, oracle_weights, oracle_labels = batch else: input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, sop_label = batch print(sop_label) print(task_idx) oracle_pos, oracle_weights, oracle_labels = None, None, None # loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, # masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, # masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, # masked_labels_2=oracle_labels, mask_qkv=mask_qkv) loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, sop_label, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, 
masked_labels_2=oracle_labels, mask_qkv=mask_qkv) masked_lm_loss, next_sentence_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. # loss = loss.mean() masked_lm_loss = masked_lm_loss.mean() next_sentence_loss = next_sentence_loss.mean() print('mask_lm_loss {}'.format(masked_lm_loss)) print('next_sentence_loss {}'.format(next_sentence_loss)) print('----------------------------------------------') loss = masked_lm_loss + next_sentence_loss # logging for each step (i.e., before normalization by args.gradient_accumulation_steps) iter_bar.set_description('Iter (loss=%5.3f)' % loss.item()) # ensure that accumlated gradients are normalized if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: lr_this_step = args.learning_rate * \ warmup_linear(global_step/t_total, args.warmup_proportion) if args.fp16: # modify learning rate with special warm up BERT uses for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model if (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info( "** ** * Saving fine-tuned model and optimizer ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "model.{0}.bin".format(i_epoch)) torch.save(model_to_save.state_dict(), output_model_file) output_optim_file = os.path.join( args.output_dir, "optim.{0}.bin".format(i_epoch)) torch.save(optimizer.state_dict(), output_optim_file) logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache()
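# warmup_linear is referenced in the update step above but not defined in this
# function; the NER script further below defines one version of it. A minimal,
# self-contained sketch consistent with that usage: linear warmup of the learning
# rate up to the peak, then linear decay to zero as training progresses.
def warmup_linear(progress, warmup):
    # progress = global_step / t_total, warmup = warmup proportion in (0, 1)
    if progress < warmup:
        return progress / warmup
    return max((progress - 1.0) / (warmup - 1.0), 0.0)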
def train(args): max_seq_length = args.max_seq_length batch_size = args.batch_size tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # each time only runs one epoch n_epochs = 1 # current epoch in sequence of calls to this script epoch = args.epoch learning_rate = args.learning_rate warmup_proportion = 0.1 checkpoint_directory = os.path.join(args.output, 'qamodel') if not os.path.exists(checkpoint_directory): os.makedirs(checkpoint_directory) traindata = os.path.join(args.output, 'walkthrough_train_commands_real.csv') validdata = os.path.join(args.output, 'walkthrough_valid_commands_real.csv') qatrain, qavalid = generate_qa_datasets(traindata, validdata) if args.nrows: qatrain = qatrain[:args.nrows] qavalid = qavalid[:args.nrows] model = QAModel.from_pretrained('bert-base-uncased', num_labels=2) if epoch > 0: checkpoint_name = os.path.join(checkpoint_directory, 'checkpoint_{}.tar'.format(epoch-1)) checkpoint = torch.load(checkpoint_name, map_location='cpu') model.load_state_dict(checkpoint['state']) if args.fp16: model.half() num_train_steps = int(qatrain.shape[0] / batch_size * n_epochs) # dummy label map label_map = {0:0, 1:1} train_features = convert_examples_to_features(qatrain.text.values, qatrain.target.values, max_seq_length, tokenizer, label_map, texts_b=qatrain.command.values) all_input_ids = torch.tensor([f[0] for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f[1] for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f[2] for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f[3] for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) model = model.to(DEVICE) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] if args.fp16: optimizer = FusedAdam(optimizer_grouped_parameters, lr=learning_rate, bias_correction=False, max_grad_norm=1.0) optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate, warmup=0.1, t_total=num_train_steps) model.train() global_step = 0 predictions = np.array([]) labels = np.array([]) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(train_dataloader): batch = tuple(t.to(DEVICE) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch logits, loss = model(input_ids, segment_ids, input_mask, label_ids) if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = learning_rate * warmup_linear(global_step/num_train_steps, warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() model.zero_grad() global_step += 1 logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() predictions = np.concatenate([predictions, np.argmax(logits, axis=1)]) labels = np.concatenate([labels, 
label_ids]) if global_step % 1000 == 0: acc = metrics.accuracy_score(labels, predictions) f1 = metrics.f1_score(labels, predictions) print('[{}] Loss {:f} Acc {:f} F1 {:f}'.format(step, tr_loss/nb_tr_steps, acc, f1)) # Save checkpoint each epoch checkpoint_path = os.path.join(checkpoint_directory, 'checkpoint_{}.tar'.format(epoch)) torch.save({ 'iteration': epoch, 'state': model.state_dict(), 'opt': optimizer.state_dict(), }, checkpoint_path) # the model used by the agent points to this symlink symlink_path = os.path.join(checkpoint_directory, 'checkpoint_final.tar') if os.path.exists(symlink_path): os.unlink(symlink_path) os.symlink(checkpoint_path, symlink_path) validate(args, qavalid, epoch, model) if args.clean and epoch > 0: os.remove(os.path.join(checkpoint_directory, 'checkpoint_{}.tar'.format(epoch-1)))
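# The loop above saves 'checkpoint_{epoch}.tar' and repoints the 'checkpoint_final.tar'
# symlink at it. A minimal sketch of the matching load path, mirroring the keys written
# by torch.save above ('iteration', 'state', 'opt'). Hypothetical helper, not part of
# the original script.
import os
import torch

def load_final_checkpoint(checkpoint_directory, model, optimizer=None):
    checkpoint = torch.load(os.path.join(checkpoint_directory, 'checkpoint_final.tar'),
                            map_location='cpu')
    model.load_state_dict(checkpoint['state'])
    if optimizer is not None and 'opt' in checkpoint:
        optimizer.load_state_dict(checkpoint['opt'])
    return checkpoint.get('iteration', 0)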
def run_ner_w_args(args): if args.server_ip and args.server_port: # Distant debugging - see # https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = {"ner": NerProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of # sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() num_labels = len(label_list) + 1 tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForNer.from_pretrained(args.bert_model, cache_dir=cache_dir, config_dir=args.config_dir, num_labels=num_labels, config=args.config) model_to_save = model.module if hasattr(model, 'module') else model # print(model_to_save.config, cache_dir) # print(args.config_dir, args.config) # exit() if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) # def resolve_opt(pre_model_path, optimizer): # opt_path = os.path.join(args.bert_model, "opt.pth") # if os.path.exists(opt_path): # optimizer.load_state_dict( torch.load( opt_path ) ) # return optimizer # optimizer = resolve_opt(args.bert_model, optimizer) global_step = 0 nb_tr_steps = 0 tr_loss = 0 label_map = {i: label for i, label in enumerate(label_list, 1)} if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) # train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() def warmup_linear(progress, warmup): if progress < warmup: return progress / warmup return max((progress - 1.) / (warmup - 1.), 0.) for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch loss = model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask) # input_ids, input_mask, segment_ids, label_ids = batch # loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
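# Note on the next step: when gradients are accumulated over several micro-batches,
# the loss is divided by gradient_accumulation_steps so that the summed gradients
# match those of a single large batch, and optimizer.step() only runs every
# gradient_accumulation_steps iterations (see the (step + 1) % ... check below).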
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles # this automatically lr_this_step = args.learning_rate * \ warmup_linear(global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) # Save optimizer output_optimizer_file = os.path.join(args.output_dir, "opt.pth") torch.save(optimizer.state_dict(), output_optimizer_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) tokenizer.save_vocabulary(args.output_dir) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) label_map = {i: label for i, label in enumerate(label_list, 1)} model_config = { "bert_model": args.bert_model, "do_lower": args.do_lower_case, "max_seq_length": args.max_seq_length, "num_labels": len(label_list) + 1, "label_map": label_map } json.dump( model_config, open(os.path.join(args.output_dir, "model_config.json"), "w")) # Load a trained model and config that you have fine-tuned else: # output_config_file = os.path.join(args.output_dir, CONFIG_NAME) # output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) # config = BertConfig(output_config_file) # model = BertForTokenClassification(config, num_labels=num_labels) # model.load_state_dict(torch.load(output_model_file)) model = BertForNer.from_pretrained(args.bert_model, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) # eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true = 
[] y_pred = [] label_map = {i: label for i, label in enumerate(label_list, 1)} # for input_ids, input_mask, segment_ids, label_ids in # tqdm(eval_dataloader, desc="Evaluating"): for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) valid_ids = valid_ids.to(device) label_ids = label_ids.to(device) l_mask = l_mask.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, valid_ids=valid_ids, attention_mask_label=l_mask) logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() input_mask = input_mask.to('cpu').numpy() for i, label in enumerate(label_ids): temp_1 = [] temp_2 = [] for j, m in enumerate(label): if j == 0: continue elif label_ids[i][j] == 11: y_true.append(temp_1) y_pred.append(temp_2) break else: temp_1.append(label_map[label_ids[i][j]]) temp_2.append(label_map[logits[i][j]]) loss = tr_loss / global_step if args.do_train else None result = dict() result['loss'] = loss report = classification_report(y_true, y_pred, digits=4) logger.info("\n%s", report) print(report) result['f1'] = f1_score(y_true, y_pred) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") logger.info("\n%s", report) # writer.write(report) for key in sorted(result.keys()): writer.write("%s = %s\n" % (key, str(result[key]))) return result
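# classification_report and f1_score above consume lists of per-sentence tag sequences
# (entity-level metrics rather than token accuracy). A minimal usage sketch, assuming
# the imports come from the seqeval package (the import block is not shown in this
# excerpt):
from seqeval.metrics import classification_report, f1_score

y_true = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
y_pred = [['B-PER', 'I-PER', 'O'], ['O', 'O']]
print(classification_report(y_true, y_pred, digits=4))  # per-entity precision/recall/F1
print(f1_score(y_true, y_pred))                          # overall entity-level F1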
def run(args, local_rank): """ Distributed Synchronous """ torch.manual_seed(1234) vocab = Vocab(args.vocab, min_occur_cnt=args.min_occur_cnt, specials=[]) if (args.world_size == 1 or dist.get_rank() == 0): print (vocab.size) model = BIGLM(local_rank, vocab, args.embed_dim, args.ff_embed_dim, args.num_heads, args.dropout, args.layers, args.approx) if args.start_from is not None: ckpt = torch.load(args.start_from, map_location='cpu') model.load_state_dict(ckpt['model']) model = model.cuda(local_rank) weight_decay_params = [] no_weight_decay_params = [] for name, param in model.named_parameters(): if name.endswith('bias') or 'layer_norm' in name: no_weight_decay_params.append(param) else: weight_decay_params.append(param) grouped_params = [{'params':weight_decay_params, 'weight_decay':0.01}, {'params':no_weight_decay_params, 'weight_decay':0.}] if args.world_size > 1: torch.manual_seed(1234 + dist.get_rank()) random.seed(5678 + dist.get_rank()) if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") optimizer = FusedAdam(grouped_params, lr=args.lr, betas=(0.9, 0.999), eps =1e-6, bias_correction=False, max_grad_norm=1.0) optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = AdamWeightDecayOptimizer(grouped_params, lr=args.lr, betas=(0.9, 0.999), eps=1e-6) if args.start_from is not None: optimizer.load_state_dict(ckpt['optimizer']) train_data = DataLoader(vocab, args.train_data, args.batch_size, args.max_len) batch_acm = 0 acc_acm, ntokens_acm, npairs_acm, loss_acm = 0., 0., 0., 0. while True: model.train() for truth, inp, msk in train_data: batch_acm += 1 if batch_acm <= args.warmup_steps: update_lr(optimizer, args.lr*batch_acm/args.warmup_steps) truth = truth.cuda(local_rank) inp = inp.cuda(local_rank) msk = msk.cuda(local_rank) optimizer.zero_grad() res, loss, acc, ntokens, npairs = model(truth, inp, msk) loss_acm += loss.item() acc_acm += acc ntokens_acm += ntokens npairs_acm += npairs if args.fp16: optimizer.backward(loss) else: loss.backward() if args.world_size > 1: average_gradients(model) torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() if (args.world_size==1 or dist.get_rank() ==0) and batch_acm%args.print_every == -1%args.print_every: print ('batch_acm %d, loss %.3f, acc %.3f, x_acm %d'%(batch_acm, loss_acm/args.print_every, acc_acm/ntokens_acm, npairs_acm)) acc_acm, ntokens_acm, loss_acm = 0., 0., 0. if (args.world_size==1 or dist.get_rank() ==0) and batch_acm%args.save_every == -1%args.save_every: if not os.path.exists(args.save_dir): os.mkdir(args.save_dir) torch.save({'args':args, 'model':model.state_dict(), 'optimizer':optimizer.state_dict()}, '%s/epoch%d_batch_%d'%(args.save_dir, train_data.epoch_id, batch_acm))
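# average_gradients and update_lr are called in the loop above but not defined in this
# excerpt. A minimal sketch of the usual implementations they suggest: all-reduce and
# average gradients across workers, and overwrite the LR of every parameter group
# during warmup. Hypothetical re-implementations, not necessarily the original helpers.
import torch.distributed as dist

def average_gradients(model):
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size

def update_lr(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr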
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) #Train File parser.add_argument("--src_file", default=None, type=str, help="The input data src file name.") parser.add_argument("--tgt_file", default=None, type=str, help="The input data tgt file name.") parser.add_argument("--check_file", default=None, type=str, help="The input check knowledge data file name") #KS File parser.add_argument("--ks_src_file", default=None, type=str, help="The input ks data src file name.") parser.add_argument("--ks_tgt_file", default=None, type=str, help="The input ks data tgt file name.") parser.add_argument("--predict_input_file", default=None, type=str, help="predict_input_file") parser.add_argument("--predict_output_file", default=None, type=str, help="predict_output_file") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--config_path", default=None, type=str, help="Bert config file path.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--log_dir", default='', type=str, required=True, help="The output directory where the log will be written.") parser.add_argument("--model_recover_path", default=None, type=str, required=True, help="The file of fine-tuned pretraining model.") parser.add_argument("--optim_recover_path", default=None, type=str, help="The file of pretraining optimizer.") parser.add_argument("--predict_bleu", default=0.2, type=float, help="The Predicted Bleu for KS Predict ") # Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", action='store_true', help="Whether to run ks predict.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--train_avg_bpe_length", default=25, type=int, help="average bpe length for train.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--label_smoothing", default=0, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.01, type=float, help="The weight decay rate for Adam.") parser.add_argument("--finetune_decay", action='store_true', help="Weight decay to the original weights.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion_step", default=300, type=int, help= "Proportion of training to perform linear learning rate warmup for. 
") parser.add_argument("--hidden_dropout_prob", default=0.1, type=float, help="Dropout rate for hidden states.") parser.add_argument("--attention_probs_dropout_prob", default=0.1, type=float, help="Dropout rate for attention probabilities.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=67, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp32_embedding', action='store_true', help= "Whether to use 32-bit float precision instead of 16-bit for embeddings" ) parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument( '--from_scratch', action='store_true', help= "Initialize parameters with random values (i.e., training from scratch)." ) parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--new_pos_ids', action='store_true', help="Use new position ids for LMs.") parser.add_argument('--tokenized_input', action='store_true', help="Whether the input is tokenized.") parser.add_argument('--max_len_a', type=int, default=0, help="Truncate_config: maximum length of segment A.") parser.add_argument('--max_len_b', type=int, default=0, help="Truncate_config: maximum length of segment B.") parser.add_argument( '--trunc_seg', default='', help="Truncate_config: first truncate segment A/B (option: a, b).") parser.add_argument( '--always_truncate_tail', action='store_true', help="Truncate_config: Whether we should always truncate tail.") parser.add_argument( "--mask_prob", default=0.15, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument( "--mask_prob_eos", default=0, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument('--max_pred', type=int, default=20, help="Max tokens of prediction.") parser.add_argument("--num_workers", default=0, type=int, help="Number of workers for the data loader.") parser.add_argument('--mask_source_words', action='store_true', help="Whether to mask source words for training") parser.add_argument('--skipgram_prb', type=float, default=0.0, help='prob of ngram mask') parser.add_argument('--skipgram_size', type=int, default=1, help='the max size of ngram mask') parser.add_argument('--mask_whole_word', action='store_true', help="Whether masking a whole word.") parser.add_argument('--do_l2r_training', action='store_true', help="Whether to do left to right training") parser.add_argument( '--has_sentence_oracle', action='store_true', help="Whether to have sentence level oracle for training. 
" "Only useful for summary generation") parser.add_argument('--max_position_embeddings', type=int, default=None, help="max position embeddings") parser.add_argument('--relax_projection', action='store_true', help="Use different projection layers for tasks.") parser.add_argument('--ffn_type', default=0, type=int, help="0: default mlp; 1: W((Wx+b) elem_prod x);") parser.add_argument('--num_qkv', default=0, type=int, help="Number of different <Q,K,V>.") parser.add_argument('--seg_emb', action='store_true', help="Using segment embedding for self-attention.") parser.add_argument( '--s2s_special_token', action='store_true', help="New special tokens ([S2S_SEP]/[S2S_CLS]) of S2S.") parser.add_argument('--s2s_add_segment', action='store_true', help="Additional segmental for the encoder of S2S.") parser.add_argument( '--s2s_share_segment', action='store_true', help= "Sharing segment embeddings for the encoder of S2S (used with --s2s_add_segment)." ) parser.add_argument('--pos_shift', action='store_true', help="Using position shift for fine-tuning.") args = parser.parse_args() assert Path( args.model_recover_path).exists(), "--model_recover_path doesn't exist" args.output_dir = args.output_dir.replace('[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) args.log_dir = args.log_dir.replace('[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) os.makedirs(args.output_dir, exist_ok=True) os.makedirs(args.log_dir, exist_ok=True) handler = logging.FileHandler(os.path.join(args.log_dir, "train.log"), encoding='UTF-8') handler.setLevel(logging.INFO) formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') handler.setFormatter(formatter) console = logging.StreamHandler() console.setLevel(logging.DEBUG) logger.addHandler(handler) logger.addHandler(console) json.dump(args.__dict__, open(os.path.join(args.output_dir, 'opt.json'), 'w'), sort_keys=True, indent=2) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs dist.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) #Random Seed #torch.backends.cudnn.enabled = False #torch.backends.cudnn.benchmark = False #torch.backends.cudnn.deterministic = True # if n_gpu > 0: # torch.cuda.manual_seed_all(args.seed) if args.local_rank not in (-1, 0): # Make sure only the first process in distributed training will download model & vocab dist.barrier() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) if args.max_position_embeddings: tokenizer.max_len = args.max_position_embeddings data_tokenizer = WhitespaceTokenizer( ) if args.tokenized_input else tokenizer if args.local_rank == 0: dist.barrier() #Data process pipelines bi_uni_pipeline = [ seq2seq_loader.Preprocess4Seq2seq( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_source_words=args.mask_source_words, skipgram_prb=args.skipgram_prb, skipgram_size=args.skipgram_size, mask_whole_word=args.mask_whole_word, mode="s2s", has_oracle=args.has_sentence_oracle, num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift) ] C_bi_uni_pipeline = [ seq2seq_loader.C_Preprocess4Seq2seq( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_source_words=args.mask_source_words, skipgram_prb=args.skipgram_prb, skipgram_size=args.skipgram_size, mask_whole_word=args.mask_whole_word, mode="s2s", has_oracle=args.has_sentence_oracle, num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift) ] ks_predict_bi_uni_pipeline = [ seq2seq_loader.Preprocess4Seq2seq_predict( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_source_words=args.mask_source_words, skipgram_prb=args.skipgram_prb, skipgram_size=args.skipgram_size, mask_whole_word=args.mask_whole_word, mode="s2s", has_oracle=args.has_sentence_oracle, num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift) ] if args.do_train: print("Loading QKR Train Dataset", args.data_dir) file_oracle = None if args.has_sentence_oracle: file_oracle = os.path.join(args.data_dir, 'train.oracle') fn_src = os.path.join(args.data_dir, args.src_file if args.src_file else 'train.src') fn_tgt = os.path.join(args.data_dir, args.tgt_file if args.tgt_file else 'train.tgt') fn_check = 
os.path.join(args.data_dir, args.check_file) train_dataset = seq2seq_loader.C_Seq2SeqDataset( fn_src, fn_tgt, fn_check, args.train_batch_size, data_tokenizer, args.max_seq_length, file_oracle=file_oracle, bi_uni_pipeline=C_bi_uni_pipeline) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset, replacement=False) _batch_size = args.train_batch_size else: train_sampler = DistributedSampler(train_dataset) _batch_size = args.train_batch_size // dist.get_world_size() train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=_batch_size, sampler=train_sampler, num_workers=args.num_workers, collate_fn=seq2seq_loader.batch_list_to_batch_tensors, pin_memory=False) print("Loading KS Train Dataset", args.data_dir) ks_fn_src = os.path.join(args.data_dir, args.ks_src_file) ks_fn_tgt = os.path.join(args.data_dir, args.ks_tgt_file) ks_train_dataset = seq2seq_loader.Seq2SeqDataset( ks_fn_src, ks_fn_tgt, args.train_batch_size, data_tokenizer, args.max_seq_length, file_oracle=file_oracle, bi_uni_pipeline=bi_uni_pipeline) if args.local_rank == -1: ks_train_sampler = RandomSampler(ks_train_dataset, replacement=False) _batch_size = args.train_batch_size else: ks_train_sampler = DistributedSampler(ks_train_dataset) _batch_size = args.train_batch_size // dist.get_world_size() ks_train_dataloader = torch.utils.data.DataLoader( ks_train_dataset, batch_size=_batch_size, sampler=ks_train_sampler, num_workers=args.num_workers, collate_fn=seq2seq_loader.batch_list_to_batch_tensors, pin_memory=False) # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps) t_total = int( len(train_dataloader) * args.num_train_epochs / args.gradient_accumulation_steps) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model cls_num_labels = 2 type_vocab_size = 6 + ( 1 if args.s2s_add_segment else 0) if args.new_segment_ids else 2 num_sentlvl_labels = 2 if args.has_sentence_oracle else 0 relax_projection = 4 if args.relax_projection else 0 if args.local_rank not in (-1, 0): # Make sure only the first process in distributed training will download model & vocab dist.barrier() #Recover model if args.model_recover_path: logger.info(" ** ** * Recover model: %s ** ** * ", args.model_recover_path) model_recover = torch.load(args.model_recover_path, map_location='cpu') global_step = 0 mask_word_id, eos_word_ids, sos_word_id = tokenizer.convert_tokens_to_ids( ["[MASK]", "[SEP]", "[S2S_SOS]"]) model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, num_rel=0, type_vocab_size=type_vocab_size, config_path=args.config_path, task_idx=3, num_sentlvl_labels=num_sentlvl_labels, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, relax_projection=relax_projection, new_pos_ids=args.new_pos_ids, ffn_type=args.ffn_type, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob, num_qkv=args.num_qkv, seg_emb=args.seg_emb, mask_word_id=mask_word_id, search_beam_size=5, length_penalty=0, eos_id=eos_word_ids, sos_id=sos_word_id, forbid_duplicate_ngrams=True, forbid_ignore_set=None, mode="s2s") if args.local_rank == 0: dist.barrier() if args.fp16: model.half() if args.fp32_embedding: model.bert.embeddings.word_embeddings.float() model.bert.embeddings.position_embeddings.float() 
model.bert.embeddings.token_type_embeddings.float() model.to(device) model.tmp_bert_emb.word_embeddings.weight = torch.nn.Parameter( model.bert.embeddings.word_embeddings.weight.clone()) model.tmp_bert_emb.token_type_embeddings.weight = torch.nn.Parameter( model.bert.embeddings.token_type_embeddings.weight.clone()) model.tmp_bert_emb.position_embeddings.weight = torch.nn.Parameter( model.bert.embeddings.position_embeddings.weight.clone()) model.mul_bert_emb.word_embeddings.weight = torch.nn.Parameter( model.bert.embeddings.word_embeddings.weight.clone()) model.mul_bert_emb.token_type_embeddings.weight = torch.nn.Parameter( model.bert.embeddings.token_type_embeddings.weight.clone()) model.mul_bert_emb.position_embeddings.weight = torch.nn.Parameter( model.bert.embeddings.position_embeddings.weight.clone()) if args.local_rank != -1: try: from torch.nn.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("DistributedDataParallel") model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: model = DataParallelImbalance(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from pytorch_bert.optimization_fp16 import FP16_Optimizer_State from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer_State(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer_State(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) if args.optim_recover_path is not None: logger.info(" ** ** * Recover optimizer from : {} ** ** * ".format( args.optim_recover_path)) optim_recover = torch.load(args.optim_recover_path, map_location='cpu') if hasattr(optim_recover, 'state_dict'): optim_recover = optim_recover.state_dict() optimizer.load_state_dict(optim_recover) if args.loss_scale == 0: logger.info( " ** ** * Recover optimizer: dynamic_loss_scale ** ** * ") optimizer.dynamic_loss_scale = True #logger.info(" ** ** * CUDA.empty_cache() ** ** * ") torch.cuda.empty_cache() # ################# TRAIN ############################ # if args.do_train: max_F1 = 0 best_step = 0 logger.info(" ** ** * Running training ** ** * ") logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", t_total) model.train() start_epoch = 1 for i_epoch in trange(start_epoch, start_epoch + 1, desc="Epoch", disable=args.local_rank not in (-1, 0)): if args.local_rank != -1: train_sampler.set_epoch(i_epoch) step = 0 for batch, ks_batch in zip(train_dataloader, ks_train_dataloader): # ################# E step + M step + Mutual Information Loss ############################ # batch = [ t.to(device) if t is not None else None for t in batch ] input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, tgt_pos, labels, ks_labels, 
check_ids = batch oracle_pos, oracle_weights, oracle_labels = None, None, None loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv, tgt_pos=tgt_pos, labels=labels.half(), ks_labels=ks_labels, check_ids=check_ids) masked_lm_loss, next_sentence_loss, KL_loss, Mutual_loss, Golden_loss, predict_kl_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. masked_lm_loss = masked_lm_loss.mean() next_sentence_loss = next_sentence_loss.mean() Mutual_loss = Mutual_loss.mean() Golden_loss = Golden_loss.mean() KL_loss = KL_loss.mean() predict_kl_loss = predict_kl_loss.mean() loss = masked_lm_loss + next_sentence_loss + KL_loss + predict_kl_loss + Mutual_loss + Golden_loss logger.info("In{}step, masked_lm_loss:{}".format( step, masked_lm_loss)) logger.info("In{}step, KL_loss:{}".format(step, KL_loss)) logger.info("In{}step, Mutual_loss:{}".format( step, Mutual_loss)) logger.info("In{}step, Golden_loss:{}".format( step, Golden_loss)) logger.info("In{}step, predict_kl_loss:{}".format( step, predict_kl_loss)) logger.info("******************************************* ") # ensure that accumlated gradients are normalized if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion_step / t_total) if args.fp16: # modify learning rate with special warm up BERT uses for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # ################# Knowledge Selection Loss ############################ # if random.randint(0, 4) == 0: ks_batch = [ t.to(device) if t is not None else None for t in ks_batch ] input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, _, labels, ks_labels = ks_batch oracle_pos, oracle_weights, oracle_labels = None, None, None loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv, labels=labels, ks_labels=ks_labels, train_ks=True) ks_loss, _ = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. 
ks_loss = ks_loss.mean() loss = ks_loss logger.info("In{}step, ks_loss:{}".format(step, ks_loss)) logger.info("******************************************* ") # ensure that accumlated gradients are normalized if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion_step / t_total) if args.fp16: # modify learning rate with special warm up BERT uses for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() step += 1 ###################### Eval Every 5000 Step ############################ # if (global_step + 1) % 5000 == 0: next_i = 0 model.eval() # Know Rank Stage logger.info(" ** ** * DEV Know Selection Begin ** ** * ") with open(os.path.join(args.data_dir, args.predict_input_file), "r", encoding="utf-8") as file: src_file = file.readlines() with open(os.path.join(args.data_dir, "train_tgt_pad.empty"), "r", encoding="utf-8") as file: tgt_file = file.readlines() with open(os.path.join(args.data_dir, args.predict_output_file), "w", encoding="utf-8") as out: while next_i < len(src_file): batch_src = src_file[next_i:next_i + args.eval_batch_size] batch_tgt = tgt_file[next_i:next_i + args.eval_batch_size] next_i += args.eval_batch_size ex_list = [] for src, tgt in zip(batch_src, batch_tgt): src_tk = data_tokenizer.tokenize(src.strip()) tgt_tk = data_tokenizer.tokenize(tgt.strip()) ex_list.append((src_tk, tgt_tk)) batch = [] for idx in range(len(ex_list)): instance = ex_list[idx] for proc in ks_predict_bi_uni_pipeline: instance = proc(instance) batch.append(instance) batch_tensor = seq2seq_loader.batch_list_to_batch_tensors( batch) batch = [ t.to(device) if t is not None else None for t in batch_tensor ] input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx = batch predict_bleu = args.predict_bleu * torch.ones( [input_ids.shape[0]], device=input_ids.device) oracle_pos, oracle_weights, oracle_labels = None, None, None with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv, labels=predict_bleu, train_ks=True) logits = torch.nn.functional.softmax(logits, dim=1) labels = logits[:, 1].cpu().numpy() for i in range(len(labels)): line = batch_src[i].strip() line += "\t" line += str(labels[i]) out.write(line) out.write("\n") data_path = os.path.join(args.data_dir, "qkr_dev.ks_score.tk") src_path = os.path.join(args.data_dir, "qkr_dev.src.tk") src_out_path = os.path.join(args.data_dir, "rank_qkr_dev.src.tk") tgt_path = os.path.join(args.data_dir, "qkr_dev.tgt") knowledge_selection(data_path, src_path, src_out_path) logger.info(" ** ** * DEV Know Selection End ** ** * ") # Decode Stage logger.info(" ** ** * Dev Decode Begin ** ** * ") with open(src_out_path, encoding="utf-8") as file: dev_src_lines = file.readlines() with open(tgt_path, encoding="utf-8") as file: golden_response_lines = file.readlines() decode_result = decode_batch(model, dev_src_lines) logger.info(" ** ** * Dev Decode End ** ** * ") # Compute dev F1 assert len(decode_result) == len(golden_response_lines) C_F1 = f_one(decode_result, 
golden_response_lines)[0] logger.info( "** ** * Current F1 is {} ** ** * ".format(C_F1)) if C_F1 < max_F1: logger.info( "** ** * Current F1 is lower than Previous F1. So Stop Training ** ** * " ) logger.info( "** ** * The best model is {} ** ** * ".format( best_step)) break else: max_F1 = C_F1 best_step = step logger.info( "** ** * Current F1 is larger than Previous F1. So Continue Training ** ** * " ) # Save trained model if (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info( "** ** * Saving fine-tuned model and optimizer ** ** * " ) model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "model.{}_{}.bin".format(i_epoch, global_step)) torch.save(model_to_save.state_dict(), output_model_file) output_optim_file = os.path.join( args.output_dir, "optim.bin") torch.save(optimizer.state_dict(), output_optim_file) #logger.info(" ** ** * CUDA.empty_cache() ** ** * ") torch.cuda.empty_cache() # ################# Predict ############################ # if args.do_predict: bi_uni_pipeline = [ seq2seq_loader.Preprocess4Seq2seq_predict( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_source_words=args.mask_source_words, skipgram_prb=args.skipgram_prb, skipgram_size=args.skipgram_size, mask_whole_word=args.mask_whole_word, mode="s2s", has_oracle=args.has_sentence_oracle, num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift) ] next_i = 0 model.eval() with open(os.path.join(args.data_dir, args.predict_input_file), "r", encoding="utf-8") as file: src_file = file.readlines() with open("train_tgt_pad.empty", "r", encoding="utf-8") as file: tgt_file = file.readlines() with open(os.path.join(args.data_dir, args.predict_output_file), "w", encoding="utf-8") as out: logger.info("** ** * Continue knowledge ranking ** ** * ") for next_i in tqdm( range(len(src_file) // args.eval_batch_size + 1)): #while next_i < len(src_file): batch_src = src_file[next_i * args.eval_batch_size:(next_i + 1) * args.eval_batch_size] batch_tgt = tgt_file[next_i * args.eval_batch_size:(next_i + 1) * args.eval_batch_size] #next_i += args.eval_batch_size ex_list = [] for src, tgt in zip(batch_src, batch_tgt): src_tk = data_tokenizer.tokenize(src.strip()) tgt_tk = data_tokenizer.tokenize(tgt.strip()) ex_list.append((src_tk, tgt_tk)) batch = [] for idx in range(len(ex_list)): instance = ex_list[idx] for proc in bi_uni_pipeline: instance = proc(instance) batch.append(instance) batch_tensor = seq2seq_loader.batch_list_to_batch_tensors( batch) batch = [ t.to(device) if t is not None else None for t in batch_tensor ] input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx = batch predict_bleu = args.predict_bleu * torch.ones( [input_ids.shape[0]], device=input_ids.device) oracle_pos, oracle_weights, oracle_labels = None, None, None with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv, 
labels=predict_bleu, train_ks=True) logits = torch.nn.functional.softmax(logits, dim=1) labels = logits[:, 1].cpu().numpy() for i in range(len(labels)): line = batch_src[i].strip() line += "\t" line += str(labels[i]) out.write(line) out.write("\n")
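# The training loop above scales args.learning_rate by
# warmup_linear(global_step / t_total, args.warmup_proportion_step / t_total),
# but the schedule itself is not shown. A minimal sketch, assuming the
# conventional BERT-style schedule (linear warmup followed by linear decay);
# the name and signature mirror the calls above:
def warmup_linear(x, warmup=0.002):
    # x is the fraction of training completed; warmup is the fraction of
    # training spent ramping the learning rate up from zero.
    if x < warmup:
        return x / warmup
    return 1.0 - x


# Usage, as in the loop above:
#   lr_this_step = args.learning_rate * warmup_linear(
#       global_step / t_total, args.warmup_proportion_step / t_total)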
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained BERT model. " "This specifies the model architecture.") parser.add_argument( "--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument( "--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") ## Required parameters parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints and predictions will be written." ) ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--train_ans_file", default=None, type=str, help="SQuAD answer for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=32, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--restore', default=False) args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") # torch.backends.cudnn.benchmark = True n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if os.path.exists(args.output_dir) == False: # raise ValueError("Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) import pickle as cPickle train_examples = None num_train_steps = None if args.do_train: raw_test_data = open(args.predict_file, mode='r') raw_train_data = open(args.train_file, mode='r') if os.path.exists("train_file_baseline.pkl") and False: train_examples = cPickle.load( open("train_file_baseline.pkl", mode='rb')) else: ans_dict = {} with open(args.train_ans_file) as f: for line in f: line = line.split(',') ans_dict[line[0]] = int(line[1]) train_examples = read_chid_examples(raw_train_data, is_training=True, ans_dict=ans_dict) cPickle.dump(train_examples, open("newtrain_file_baseline.pkl", mode='wb')) #tt = len(train_examples) // 2 #train_examples = train_examples[:tt] logger.info("train examples {}".format(len(train_examples))) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model bert_config = BertConfig.from_json_file(args.bert_config_file) tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) model = BertForCloze(bert_config, num_choices=10) if args.init_checkpoint is not None: logger.info('load bert weight') state_dict = torch.load(args.init_checkpoint, map_location='cpu') missing_keys = [] unexpected_keys = [] error_msgs = [] # copy state_dict so _load_from_state_dict can modify it metadata = getattr(state_dict, '_metadata', None) state_dict = state_dict.copy() # new_state_dict=state_dict.copy() # for kye ,value in state_dict.items(): # new_state_dict[kye.replace("bert","c_bert")]=value # state_dict=new_state_dict if metadata is not None: state_dict._metadata = metadata def load(module, prefix=''): local_metadata = {} if metadata is None else metadata.get( prefix[:-1], {}) module._load_from_state_dict(state_dict, prefix, local_metadata, True, missing_keys, 
unexpected_keys, error_msgs) for name, child in module._modules.items(): # logger.info("name {} chile {}".format(name,child)) if child is not None: load(child, prefix + name + '.') load(model, prefix='' if hasattr(model, 'bert') else 'bert.') logger.info("missing keys:{}".format(missing_keys)) logger.info('unexpected keys:{}'.format(unexpected_keys)) logger.info('error msgs:{}'.format(error_msgs)) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex import amp from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate) # optimizer = BertAdam(optimizer_grouped_parameters, # lr=args.learning_rate, # warmup=args.warmup_proportion, # t_total=t_total) # optimizer = RAdam(optimizer_grouped_parameters, # lr=args.learning_rate) model, optimizer = amp.initialize(model, optimizer, opt_level="O1") else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.restore: checkpoint = torch.load('amp_checkpoint.pt') model, optimizer = amp.initialize(model, optimizer, opt_level='O1') model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) amp.load_state_dict(checkpoint['amp']) global_step = 0 if args.do_train: cached_train_features_file = args.train_file + '_{0}_v{1}'.format( str(args.max_seq_length), str(4)) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_labels = torch.tensor([f.label for f in train_features], dtype=torch.long) all_option_ids = torch.tensor([f.option_ids for f in train_features], dtype=torch.long) all_positions = torch.tensor([f.position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_labels, all_option_ids, all_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, drop_last=True, pin_memory=True) loss_ini = 50 for _ in trange(int(args.num_train_epochs), desc="Epoch"): vizname = 'epoch' + str(_) viz = Visdom(env=str(vizname)) vis = Visdom(env='loss') via = Visdom(env='ac') model.train() model.zero_grad() epoch_itorator = tqdm(train_dataloader, disable=None) for step, batch in enumerate(epoch_itorator): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, labels, option_ids, positions = batch loss = model(input_ids, option_ids, segment_ids, input_mask, positions, labels) # print('att', loss.size()) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: # model, optimizer = amp.initialize(model, optimizer, opt_level= "O1") with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step # if args.fp16: # torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.) 
optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % 1000 == 0: logger.info("loss@{}:{}".format(step, loss.cpu().item())) steptotal = step + _ * int( len(train_examples) / args.train_batch_size) if (steptotal + 1) % 50 == 0: vis.line([loss.cpu().item()], [steptotal], win='train_loss', update='append') if (step + 1) % 50 == 0: viz.line([loss.cpu().item()], [step], win='train_loss', update='append') loss_total = str(loss.cpu().item()) print(loss_total) loss_ini = loss_total logger.info("loss:%f", loss.cpu().item()) logger.info("loss+:{}".format(loss.cpu().item())) raw_test_data_pre = open(args.predict_file, mode='r') eval_examples = read_chid_examples(raw_test_data_pre, is_training=False) # eval_examples=eval_examples[:100] eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_option_ids = torch.tensor( [f.option_ids for f in eval_features], dtype=torch.long) all_positions = torch.tensor([f.position for f in eval_features], dtype=torch.long) all_tags = torch.tensor([f.tag for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_option_ids, all_positions, all_tags) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() reader1 = pd.read_csv('dev_answer1.csv', usecols=[1], header=None) total_dev_loss = 0 all_results = {} logger.info("Start evaluating") for input_ids, input_mask, segment_ids, option_ids, positions, tags in \ tqdm(eval_dataloader, desc="Evaluating",disable=None): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) option_ids = option_ids.to(device) positions = positions.to(device) with torch.no_grad(): batch_logits, align = model(input_ids, option_ids, segment_ids, input_mask, positions) for i, tag in enumerate(tags): logits = batch_logits[i].detach().cpu().numpy() logit = [logits] logit = torch.tensor(logit) inum = int(tag) - 577157 dlabel = [reader1[1][inum]] dlabel = torch.tensor(dlabel) # loss_dev =FocalLoss(gamma=0.25) loss_dev = CrossEntropyLoss() dev_loss = loss_dev(logit, dlabel) total_dev_loss += dev_loss # for index1, dlabel in zip(reader1[0], reader1[1]): # if index1[6:11] == str(tag): # loss_dev =CrossEntropyLoss() # dev_loss = loss_dev(logits, dlabel) # total_dev_loss += dev_loss # continue ans = np.argmax(logits) all_results["#idiom%06d#" % tag] = ans predict_name = "ln11saprediction" + str(_) + ".csv" output_prediction_file = os.path.join(args.output_dir, predict_name) with open(output_prediction_file, "w") as f: for each in all_results: f.write(each + ',' + str(all_results[each]) + "\n") raw_test_data.close() pre_ac = 0 outputpre = 'output_model/' + predict_name reader2 = pd.read_csv(outputpre, usecols=[0, 1], header=None) for index2, ans2 in zip(reader2[0], 
reader2[1]): num = index2[6:12] num = int(num) - 577157 ans1 = reader1[1][num] if ans1 == ans2: pre_ac += 1 print(pre_ac) per = (pre_ac) / 23011 pernum = per * 100 logger.info("accuracy:%f", pernum) devlossmean = total_dev_loss / (23011 / 128) logger.info("devloss:%f", devlossmean) via.line([pernum], [_], win='accuracy', update='append') via.line([devlossmean], [_], win='loss', update='append') checkpoint = { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'amp': amp.state_dict() } torch.save(checkpoint, 'checkpoint/amp_checkpoint.pt') outmodel = 'ln11samodel' + str(pernum) + '.bin' output_model_file = os.path.join(args.output_dir, outmodel) if args.do_train: model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self torch.save(model_to_save.state_dict(), output_model_file) raw_test_data.close() raw_train_data.close() # Save a trained model # output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") # if args.do_train: # model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): list1 = os.listdir('./output_model/') list1 = sorted( list1, key=lambda x: os.path.getmtime(os.path.join('./output_model/', x))) output_model_file = os.path.join(args.output_dir, list1[-1]) # output_model_file = os.path.join(args.output_dir, 'n11samodel77.33258007040111.bin') model_state_dict = torch.load(output_model_file) model = BertForCloze(bert_config, num_choices=10) model.load_state_dict(model_state_dict) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # raw_test_data_pre = open('./data/dev.txt', mode='r') raw_test_data_pre = open('./data/out.txt', mode='r') # raw_test_data_pre = open('new_test_data.txt', mode='r') eval_examples = read_chid_examples(raw_test_data_pre, is_training=False) # eval_examples=eval_examples[:100] eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_option_ids = torch.tensor([f.option_ids for f in eval_features], dtype=torch.long) all_positions = torch.tensor([f.position for f in eval_features], dtype=torch.long) all_tags = torch.tensor([f.tag for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_option_ids, all_positions, all_tags) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = {} all_results1 = {} all_results2 = {} # reader1 = pd.read_csv('test_ans.csv', usecols=[1], header=None) reader1 = pd.read_csv('./data/out_answer.csv', usecols=[1], header=None) #dev_answer1.csv # reader1 = pd.read_csv('dev_answer1.csv', usecols=[1], header=None) total_dev_loss = 0 logger.info("Start evaluating") for 
input_ids, input_mask, segment_ids, option_ids, positions, tags in \ tqdm(eval_dataloader, desc="Evaluating",disable=None): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) option_ids = option_ids.to(device) positions = positions.to(device) with torch.no_grad(): batch_logits, align = model(input_ids, option_ids, segment_ids, input_mask, positions) for i, tag in enumerate(tags): logits = batch_logits[i].detach().cpu().numpy() ans = np.argmax(logits) all_results["#idiom%06d#" % tag] = ans # matric = align[i].detach().cpu().numpy() # all_results1["#idiom%06d#" % tag] = matric[ans] # gr_logic = logits[:] # gr_logic = sorted(gr_logic, reverse=True) # all_results2["#idiom%06d#" % tag] = gr_logic output_prediction_file = os.path.join(args.output_dir, "testprediction.csv") # output_m_file = os.path.join(args.output_dir, "ealign.csv") # output_ma_file = os.path.join(args.output_dir, "sdmv.csv") with open(output_prediction_file, "w") as f: for each in all_results: f.write(each + ',' + str(all_results[each]) + "\n") # with open(output_m_file, "w") as f: # for each in all_results1: # f.write(each + ',' + str(all_results1[each]) + "\n") # with open(output_ma_file, "w") as f: # for each in all_results1: # f.write(each + ',' + str(all_results2[each]) + "\n") raw_test_data_pre.close() reader2 = pd.read_csv(output_prediction_file, usecols=[0, 1], header=None) pre_ac = 0 for index2, ans2 in zip(reader2[0], reader2[1]): num = index2[6:-1] # num = int(num)-1 # num = re.findall(r"\d+\.?\d*",index2) num = int(num) - 623377 # num = int(num) - 577157 ans1 = reader1[1][num] if ans1 == ans2: pre_ac += 1 print(pre_ac) # per = (pre_ac)/23011 # per = (pre_ac)/24948 per = (pre_ac) / 27704 pernum = per * 100 logger.info("accuracy:%f", pernum)
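# The --restore branch above reloads 'amp_checkpoint.pt', and the epoch loop
# saves it with model, optimizer, and amp state. A minimal sketch of that
# save/restore pattern (following the apex amp checkpointing recipe); the
# helper names and the "O1" opt_level are assumptions chosen to match the
# calls above:
import torch
from apex import amp


def save_amp_checkpoint(model, optimizer, path='amp_checkpoint.pt'):
    checkpoint = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'amp': amp.state_dict(),  # loss-scaler state
    }
    torch.save(checkpoint, path)


def restore_amp_checkpoint(model, optimizer, path='amp_checkpoint.pt'):
    # Initialize amp first, then load the saved states into the wrapped objects.
    model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    amp.load_state_dict(checkpoint['amp'])
    return model, optimizer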