def main():
    # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    torch.set_num_threads(1)
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=False,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=False,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval or not.")
    parser.add_argument("--eval_on", default="dev", help="Whether to run eval on the dev set or test set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=64, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 is set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # processors = FormationProcessor
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = FormationProcessor()
    tokenizer = BertTokenizer.from_pretrained(
        '/home/ypd-19-2/SpERT/model/bertbase-20210122T060007Z-001/bertbase')

    train_examples = None
    num_train_optimization_steps = 0
    if args.do_train:
        train_examples = processor.get_train_examples()
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Prepare model
    config = BertConfig.from_pretrained(args.bert_model, num_labels=1, finetuning_task=args.task_name)
    model = BertForSequenceClassification.from_pretrained(args.bert_model, from_tf=False, config=config)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    # label_map = {i: label for i, label in enumerate(label_list, 1)}
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids=input_ids, token_type_ids=segment_ids,
                             attention_mask=input_mask, labels=label_ids)[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        # label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {"bert_model": args.bert_model, "do_lower": args.do_lower_case,
                        "max_seq_length": args.max_seq_length, "num_labels": 1}
        json.dump(model_config, open(os.path.join(args.output_dir, "model_config.json"), "w"))
        # Load a trained model and config that you have fine-tuned
    else:
        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForSequenceClassification.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        loss_test = nn.L1Loss()
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples()
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples()
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                               attention_mask=input_mask)[0]
            # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            input_mask = input_mask.to('cpu').numpy()
            batch_loss = loss_test(logits, label_ids)
            eval_loss += batch_loss.item()  # .item() so we accumulate a float, not a graph-holding tensor
            y_true.append(label_ids)
            y_pred.append(logits)
        print('eval_loss')
        print(eval_loss / len(eval_dataloader))
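
# The WarmupLinearSchedule used above appears to be the pytorch-transformers 1.x
# scheduler. As a reference for readers, this is the multiplier it applies to the
# base learning rate (a minimal sketch of that 1.x behavior):
def warmup_linear_multiplier(step, warmup_steps, t_total):
    """Linear warmup to 1.0 over `warmup_steps`, then linear decay to 0.0 at `t_total`."""
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))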
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    processor = processors[args.task_name]()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    if args.warmup_pct is None:
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    else:
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=math.floor(args.warmup_pct * t_total),
                                         t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    # set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0],
                              mininterval=10, ncols=100)
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      # XLM doesn't use segment_ids
                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet', 'bert_mc'] else None,
                      'labels': batch[3]}
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, processor, eval_split="dev")
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
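
# Gradient accumulation above folds several forward/backward passes into one
# optimizer step, so the effective batch size is per_gpu_batch * n_gpu *
# accumulation_steps. A minimal sketch of the pattern (helper name hypothetical):
def accumulate_and_step(model, optimizer, batches, accumulation_steps):
    """Run forward/backward on each batch, stepping only every `accumulation_steps`."""
    model.zero_grad()
    for step, (inputs, labels) in enumerate(batches):
        loss = model(inputs, labels=labels)[0]
        (loss / accumulation_steps).backward()  # scale so summed grads match the mean
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            model.zero_grad()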
def main(parser):
    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    # data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    tok_path = get_tokenizer()  # ./tokenizer_78b3253a26.model
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    _, vocab_of_gluonnlp = get_pytorch_kobert_model()
    token_to_idx = vocab_of_gluonnlp.token_to_idx

    model_config.vocab_size = len(token_to_idx)
    vocab = Vocabulary(token_to_idx=token_to_idx)

    print("len(token_to_idx): ", len(token_to_idx))
    with open(model_dir / "token2idx_vocab.json", 'w', encoding='utf-8') as f:
        json.dump(token_to_idx, f, ensure_ascii=False, indent=4)

    # save vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'wb') as f:
        pickle.dump(vocab, f)

    # load vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)

    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    ner_formatter = NamedEntityRecognitionFormatter(vocab=vocab, tokenizer=tokenizer,
                                                    maxlen=model_config.maxlen, model_dir=model_dir)

    # Train & Val Datasets
    cwd = Path.cwd()
    data_in = cwd / "data_in"
    train_data_dir = data_in / "NER-master" / "말뭉치 - 형태소_개체명"
    print("model_config.batch_size: ", model_config.batch_size)
    tr_clf_ds = NamedEntityRecognitionDataset(train_data_dir=train_data_dir, model_dir=model_dir)
    tr_clf_ds.set_transform_fn(transform_source_fn=ner_formatter.transform_source_fn,
                               transform_target_fn=ner_formatter.transform_target_fn)
    tr_clf_dl = DataLoader(tr_clf_ds, batch_size=model_config.batch_size, shuffle=True,
                           num_workers=4, drop_last=False)

    # Model
    model = KobertBiGRUCRF(config=model_config, num_classes=len(tr_clf_ds.ner_to_index))
    model.train()

    # optim
    train_examples_len = len(tr_clf_ds)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    # num_train_optimization_steps = int(train_examples_len / model_config.batch_size / model_config.gradient_accumulation_steps) * model_config.epochs
    t_total = len(tr_clf_dl) // model_config.gradient_accumulation_steps * model_config.epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=model_config.learning_rate, eps=model_config.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=model_config.warmup_steps, t_total=t_total)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    n_gpu = torch.cuda.device_count()
    # if n_gpu > 1:
    #     model = torch.nn.DataParallel(model)
    model.to(device)

    # save
    tb_writer = SummaryWriter('{}/runs'.format(model_dir))
    checkpoint_manager = CheckpointManager(model_dir)
    summary_manager = SummaryManager(model_dir)
    best_val_loss = 1e+10
    best_train_acc = 0

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(tr_clf_ds))
    logger.info("  Num Epochs = %d", model_config.epochs)
    logger.info("  Instantaneous batch size per GPU = %d", model_config.batch_size)
    # logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
    #             args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", model_config.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_dev_acc, best_dev_loss = 0.0, 99999999999.0
    best_steps = 0
    model.zero_grad()
    set_seed()  # Added here for reproducibility (even between python 2 and 3)

    # Train
    train_iterator = trange(int(model_config.epochs), desc="Epoch")
    for _epoch, _ in enumerate(train_iterator):
        epoch_iterator = tqdm(tr_clf_dl, desc="Iteration")  # , disable=args.local_rank not in [-1, 0]
        epoch = _epoch
        for step, batch in enumerate(epoch_iterator):
            model.train()
            x_input, token_type_ids, y_real = map(lambda elm: elm.to(device), batch)
            log_likelihood, sequence_of_tags = model(x_input, token_type_ids, y_real)

            # loss: negative log-likelihood
            loss = -1 * log_likelihood

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if model_config.gradient_accumulation_steps > 1:
                loss = loss / model_config.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), model_config.max_grad_norm)
            tr_loss += loss.item()

            if (step + 1) % model_config.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                with torch.no_grad():
                    sequence_of_tags = torch.tensor(sequence_of_tags)
                    print("sequence_of_tags: ", sequence_of_tags)
                    print("y_real: ", y_real)
                    print("loss: ", loss)
                    print("(sequence_of_tags == y_real): ", (sequence_of_tags == y_real))
                    mb_acc = (sequence_of_tags == y_real).float()[y_real != vocab.PAD_ID].mean()

                tr_acc = mb_acc.item()
                tr_loss_avg = tr_loss / global_step
                tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}

                # if step % 50 == 0:
                print('epoch : {}, global_step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'.format(
                    epoch + 1, global_step, tr_summary['loss'], tr_summary['acc']))

                if model_config.logging_steps > 0 and global_step % model_config.logging_steps == 0:
                    # Log metrics
                    if model_config.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        pass
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / model_config.logging_steps, global_step)
                    logger.info("Average loss: %s at global step: %s",
                                str((tr_loss - logging_loss) / model_config.logging_steps), str(global_step))
                    logging_loss = tr_loss

                if model_config.save_steps > 0 and global_step % model_config.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(model_config.output_dir, 'epoch-{}'.format(epoch + 1))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)
                    state = {'global_step': global_step + 1,
                             'model_state_dict': model.state_dict(),
                             'opt_state_dict': optimizer.state_dict()}
                    summary = {'train': tr_summary}
                    summary_manager.update(summary)
                    summary_manager.save('summary.json')

                    # based on accuracy (ideally this should use val_acc, not train_acc)
                    is_best = tr_acc >= best_train_acc

                    # Save
                    if is_best:
                        best_train_acc = tr_acc
                        checkpoint_manager.save_checkpoint(
                            state, 'best-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1, global_step, tr_acc))
                    else:
                        torch.save(state, os.path.join(
                            output_dir, 'model-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1, global_step, tr_acc)))

    tb_writer.close()
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss / global_step)

    return global_step, tr_loss / global_step, best_steps
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        random_number = np.random.randint(10000)
        tb_writer = SummaryWriter(log_dir='./imdb_runs/bert_' + str(random_number))

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_dataloader = sample_loader(train_dataset, batch_size=args.train_batch_size, k=args.k,
                                     n_classes=2, seed=args.seed,
                                     pos_sampling_ratio=args.pos_sampling_ratio)
    # train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    # train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // len(train_dataloader) + 1
    else:
        t_total = len(train_dataloader) * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        # NOTE: `optimizer` is not defined at this point (the optimizers are built
        # inside the loop below), so enabling --fp16 would raise a NameError; the
        # fp16 path is also rejected with NotImplementedError further down.
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    N = 14  # for bert-base: embeddings + 12 layers + classifier
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            if global_step % args.layer_step == 0:
                step = global_step // args.layer_step
                layer_sep = max(min(int(N - (step % N + 1)), N - 1), 1)  # 1, ..., 13
                # update layer separating point
                layer_list_feature = ['.' + str(i) + '.' for i in range(layer_sep)]
                optimizer_grouped_parameters_feature = [
                    {'params': [p for (n, p) in model.named_parameters()
                                if any(l in n for l in ['embeddings'] + layer_list_feature)
                                and not any(nd in n for nd in no_decay)],
                     'weight_decay': args.weight_decay},
                    {'params': [p for (n, p) in model.named_parameters()
                                if any(l in n for l in ['embeddings'] + layer_list_feature)
                                and any(nd in n for nd in no_decay)],
                     'weight_decay': 0.0}
                ]
                optimizer_grouped_parameters_classifier = [
                    {'params': [p for (n, p) in model.named_parameters()
                                if not any(l in n for l in ['embeddings'] + layer_list_feature)
                                and not any(nd in n for nd in no_decay)],
                     'weight_decay': args.weight_decay},
                    {'params': [p for (n, p) in model.named_parameters()
                                if not any(l in n for l in ['embeddings'] + layer_list_feature)
                                and any(nd in n for nd in no_decay)],
                     'weight_decay': 0.0}
                ]
                optimizer_feature = AdamW(optimizer_grouped_parameters_feature,
                                          lr=args.learning_rate, eps=args.adam_epsilon)
                optimizer_classifier = AdamW(optimizer_grouped_parameters_classifier,
                                             lr=args.learning_rate, eps=args.adam_epsilon)
                # scheduler = WarmupLinearSchedule(optimizer_feature, warmup_steps=args.warmup_steps, t_total=t_total)

            # begin training
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      # XLM doesn't use segment_ids
                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                      'labels': None}

            '''Initialize'''
            if epoch == 0 and step == 0:
                outputs = model(**inputs)
                with torch.no_grad():
                    logits_1 = outputs[0]
                    logits_2 = outputs[0]

            '''Update the model'''
            # scheduler_feature.step()  # Update learning rate schedule
            # scheduler_classifier.step()
            # update classifier (applies the gradients accumulated by the previous iteration's backward pass)
            optimizer_classifier.step()
            with torch.no_grad():
                outputs = model(**inputs)
                logits_1 = outputs[0]
            # update feature
            optimizer_feature.step()
            outputs = model(**inputs)
            logits_2 = outputs[0]
            model.zero_grad()
            global_step += 1

            if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                # Log metrics
                if args.local_rank == -1 and args.evaluate_during_training:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    results = evaluate(args, model, tokenizer)
                    for key, value in results.items():
                        tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                # tb_writer.add_scalar('lr', scheduler_feature.get_lr()[0], global_step)
                tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                logging_loss = tr_loss

            if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                # Save model checkpoint
                output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                # Take care of distributed/parallel training
                model_to_save = model.module if hasattr(model, 'module') else model
                model_to_save.save_pretrained(output_dir)
                torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                logger.info("Saving model checkpoint to %s", output_dir)

            w = weight_schedule(global_step, ramp_up_epochs=t_total // 3, ramp_down_epochs=t_total // 10,
                                total_epochs=t_total, max_val=args.max_val, mult=-5., mult_down=-7.,
                                n_labeled=args.k, n_samples=75000)
            tb_writer.add_scalar('layer_sep', layer_sep, global_step)
            tb_writer.add_scalar('w', w, global_step)
            w = torch.autograd.Variable(torch.FloatTensor([w]).cuda(), requires_grad=False)

            '''Calculate the loss'''
            loss, sup_loss, unsup_loss, nbsup = gul_loss(logits_2, logits_1, w, batch[3])
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training

            '''Calculate the gradients'''
            if args.fp16:
                raise NotImplementedError()
            else:
                loss.backward(retain_graph=True)
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            tb_writer.add_scalar('total_loss', loss.item(), global_step)
            tb_writer.add_scalar('sup_loss', sup_loss.item(), global_step)
            tb_writer.add_scalar('unsup_loss', unsup_loss.item(), global_step)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
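
# weight_schedule above ramps the unsupervised weight w up and then back down
# over training; its repo-specific implementation is not shown here. A common
# choice consistent with the mult=-5 / mult_down=-7 arguments is the Gaussian
# ramp from temporal ensembling (a sketch under that assumption, name hypothetical):
import numpy as np

def gaussian_ramp(step, ramp_steps, max_val, mult=-5.0):
    """w(t) = max_val * exp(mult * (1 - t)^2), with t = step/ramp_steps clipped to [0, 1]."""
    t = np.clip(step / max(1, ramp_steps), 0.0, 1.0)
    return float(max_val * np.exp(mult * (1.0 - t) ** 2))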
def main():
    parser = argparse.ArgumentParser("")
    parser.add_argument("--model", type=str, default='')
    parser.add_argument("--resume", action='store_true')
    parser.add_argument("--eval", action='store_true')
    parser.add_argument("--batch_size", type=int, default=CFG.batch_size)
    parser.add_argument("--nepochs", type=int, default=CFG.num_train_epochs)
    parser.add_argument("--wsteps", type=int, default=CFG.warmup_steps)
    parser.add_argument("--nlayers", type=int, default=CFG.num_hidden_layers)
    parser.add_argument("--nahs", type=int, default=CFG.num_attention_heads)
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--lr", type=float, default=CFG.learning_rate)
    parser.add_argument("--dropout", type=float, default=CFG.dropout)
    parser.add_argument("--types", nargs='+', type=str,
                        default=['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN'],
                        help='3JHC,2JHC,1JHC,3JHH,2JHH,3JHN,2JHN,1JHN')
    parser.add_argument("--train_file", default="train_mute_cp")
    parser.add_argument("--test_file", default="test_mute_cp")
    parser.add_argument("--pseudo_path", default="")
    parser.add_argument("--pseudo", action='store_true')
    parser.add_argument("--gen_pseudo", action='store_true')
    parser.add_argument("--use_all", action='store_true')
    parser.add_argument("--structure_file", default="structures_mu")
    parser.add_argument("--contribution_file", default="scalar_coupling_contributions")
    args = parser.parse_args()
    print(args)

    CFG.batch_size = args.batch_size
    CFG.num_train_epochs = args.nepochs
    CFG.warmup_steps = args.wsteps
    CFG.num_hidden_layers = args.nlayers
    CFG.num_attention_heads = args.nahs
    CFG.learning_rate = args.lr
    CFG.dropout = args.dropout
    CFG.seed = args.seed
    print(CFG.__dict__)

    random.seed(CFG.seed)
    np.random.seed(CFG.seed)
    torch.manual_seed(CFG.seed)

    # if not args.eval:
    if True:
        train_df = load_csv(args.train_file)
        structures_df = load_csv(args.structure_file)
        structures_df[['x', 'y', 'z']] -= structures_df.groupby('molecule_name')[['x', 'y', 'z']].transform('mean')
        contributions_df = load_csv(args.contribution_file)
        train_df = train_df.merge(contributions_df, how='left')
        train_df = normalize_cols(train_df, ['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
        train_df = add_extra_features(train_df, structures_df)
        train_df = train_df.fillna(1e08)

        n_mols = train_df['molecule_name'].nunique()
        train_df, valid_df = train_test_split(train_df, 5000)

        # keep only molecules that contain the args.types
        print(train_df['molecule_name'].nunique())
        mol_names_with_at = train_df[train_df['type'].isin(args.types)]['molecule_name'].unique()
        train_df = train_df[train_df['molecule_name'].isin(mol_names_with_at)].reset_index(drop=True)
        print(train_df['molecule_name'].nunique())

        # Print the first 5 rows of valid_df to verify that it matches the previous experiment.
        print(valid_df.head(5))

        if args.pseudo:
            test_df = load_csv(args.test_file)
            logger.info(f'loading dataset - {args.pseudo_path} ...')
            test_pseudo_df = pd.read_csv(args.pseudo_path)
            # mol_names_jhn = train_df[test_df['type'].isin(['1JHN', '2JHN', '3JHN'])]['molecule_name'].unique()
            # test_df = test_df[test_df['molecule_name'].isin(mol_names_jhn)].reset_index(drop=True)
            test_df = add_extra_features(test_df, structures_df)
            test_df = test_df.set_index('id')
            test_pseudo_df = test_pseudo_df.set_index('id')
            test_df[['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']] = \
                test_pseudo_df[['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']]
            test_df = test_df.reset_index()
            # test_df = normalize_target(test_df)
            test_df = normalize_cols(test_df, ['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
            # test_df = test_df.assign(fc=1e08, sd=1e08, pso=1e08, dso=1e08)
            train_df['weight'] = 1.0
            valid_df['weight'] = 1.0
            test_df['weight'] = 1.0
            n_mols = test_df['molecule_name'].nunique()
            train_df = train_df.append(test_df).reset_index(drop=True)
        else:
            train_df['weight'] = 1.0
            valid_df['weight'] = 1.0

        if args.use_all:
            train_df = train_df.append(valid_df)
        print(f' n_train:{len(train_df)}, n_valid:{len(valid_df)}')

    config = BertConfig(
        3,  # not used
        hidden_size=CFG.hidden_size,
        num_hidden_layers=CFG.num_hidden_layers,
        num_attention_heads=CFG.num_attention_heads,
        intermediate_size=CFG.intermediate_size,
        hidden_dropout_prob=CFG.dropout,
        attention_probs_dropout_prob=CFG.dropout,
    )
    model = cust_model.SelfAttn(config)
    if args.model != "":
        print("=> loading checkpoint '{}'".format(args.model))
        checkpoint = torch.load(args.model)
        CFG.start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}' (epoch {})".format(args.model, checkpoint['epoch']))
    model.cuda()

    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('parameters: ', count_parameters(model))

    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # to produce the submission.csv
    if args.eval:
        test_df = load_csv(args.test_file)
        structures_df = load_csv(args.structure_file)
        structures_df[['x', 'y', 'z']] -= structures_df.groupby('molecule_name')[['x', 'y', 'z']].transform('mean')
        test_df = add_extra_features(test_df, structures_df)
        test_df = test_df.assign(fc=1e08, sd=1e08, pso=1e08, dso=1e08)
        test_df['scalar_coupling_constant'] = 0
        test_df['weight'] = 1.0
        test_db = db.MolDB(test_df, CFG.max_seq_length)
        test_loader = DataLoader(test_db, batch_size=CFG.batch_size, shuffle=False, num_workers=CFG.num_workers)
        res_df = validate(test_loader, model, args.types)
        res_df = unnormalize_cols(res_df, cols=['fc', 'sd', 'pso', 'dso'])
        res_df = unnormalize_target(res_df, 'prediction1')
        if args.gen_pseudo:
            res_df['scalar_coupling_constant'] = res_df['prediction1']
            res_df = res_df[res_df['id'] > -1].sort_values('id')
            res_df[['id', 'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']].to_csv(
                f'pseudo_{CFG.seed}.csv', index=False)
            return
        res_df['prediction4'] = res_df[['fc', 'sd', 'pso', 'dso']].sum(1)
        res_df['prediction'] = res_df[['prediction1', 'prediction4']].mean(1)
        res_df['scalar_coupling_constant'] = res_df['prediction']
        res_df = res_df[res_df['id'] > -1].sort_values('id')
        os.makedirs('output', exist_ok=True)
        res_df[['id', 'scalar_coupling_constant']].to_csv(f'output/submission_{CFG.seed}.csv', index=False)
        return

    train_db = db.MolDB(train_df, CFG.max_seq_length)
    print('preloading dataset ...')
    train_db = db.MolDB_FromDB(train_db, 10)
    valid_db = db.MolDB(valid_df, CFG.max_seq_length)
    num_train_optimization_steps = int(
        len(train_db) / CFG.batch_size / CFG.gradient_accumulation_steps) * (CFG.num_train_epochs - CFG.start_epoch)
    print('num_train_optimization_steps', num_train_optimization_steps)
    train_loader = DataLoader(train_db, batch_size=CFG.batch_size, shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True)
    val_loader = DataLoader(valid_db, batch_size=CFG.batch_size, shuffle=False, num_workers=CFG.num_workers)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=CFG.learning_rate, weight_decay=CFG.weight_decay)
    scheduler = WarmupLinearSchedule(optimizer, CFG.warmup_steps, t_total=num_train_optimization_steps)

    def get_lr():
        return scheduler.get_lr()[0]

    if args.model != "":
        if args.resume:
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            # for param_group in optimizer.param_groups:
            #     param_group['lr'] = CFG.learning_rate
        mae_log_df = checkpoint['mae_log']
        del checkpoint
    else:
        mae_log_df = pd.DataFrame(columns=(['EPOCH'] + ['LR'] + args.types + ['OVERALL']))

    os.makedirs('log', exist_ok=True)
    res_df = validate(val_loader, model, args.types)
    res_df = unnormalize_cols(res_df, cols=['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
    res_df = unnormalize_target(res_df, 'prediction1')
    res_df['prediction4'] = res_df[['fc', 'sd', 'pso', 'dso']].sum(1)
    res_df['prediction'] = res_df[['prediction1', 'prediction4']].mean(1)
    res_df.to_csv(f'log/valid_df_{"_".join(args.types)}.csv', index=False)
    overall_mae, maes = metric(res_df, args.types)
    print(overall_mae, maes)

    curr_lr = get_lr()
    print(f'initial learning rate:{curr_lr}')
    for epoch in range(CFG.start_epoch, CFG.num_train_epochs):
        # train for one epoch
        # print(adjust_learning_rate(optimizer, epoch))
        train(train_loader, model, optimizer, epoch, args.types, scheduler)

        if epoch % CFG.test_freq == 0:
            res_df = validate(val_loader, model, args.types)
            res_df = unnormalize_cols(res_df, cols=['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
            res_df = unnormalize_target(res_df, 'prediction1')
            res_df['prediction4'] = res_df[['fc', 'sd', 'pso', 'dso']].sum(1)
            res_df['prediction'] = res_df[['prediction1', 'prediction4']].mean(1)
            res_df.to_csv(f'log/valid_df_{"_".join(args.types)}.csv', index=False)
            overall_mae, maes = metric(res_df, args.types)

            # write log file
            mae_row = dict([(typ, [mae]) for typ, mae in maes.items() if typ in args.types])
            mae_row.update({'EPOCH': epoch, 'OVERALL': overall_mae, 'LR': curr_lr})
            mae_log_df = mae_log_df.append(pd.DataFrame(mae_row), sort=False)
            print(mae_log_df.tail(20))
            mae_log_df.to_csv(f'log/{"_".join(args.types)}.csv', index=False)

            # scheduler.step(overall_mae)
            curr_lr = get_lr()
            print(f'set the learning_rate: {curr_lr}')

            # evaluate on validation set
            batch_size = CFG.batch_size
            pseudo_path = '' if not args.pseudo else '_' + args.pseudo_path
            curr_model_name = (f'b{batch_size}_l{config.num_hidden_layers}_'
                               f'mh{config.num_attention_heads}_h{config.hidden_size}_'
                               f'd{CFG.dropout}_'
                               f'ep{epoch}_{"_".join(args.types)}_s{CFG.seed}{pseudo_path}.pt')
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the cust_model itself
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': 'transformer',
                'state_dict': model_to_save.state_dict(),
                'mae_log': mae_log_df,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
            }, FINETUNED_MODEL_PATH, curr_model_name)

    print('done')
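
# save_checkpoint above is defined elsewhere in this repo; a minimal sketch of
# what such a helper typically does, with the signature inferred from the call
# site (an assumption, not the repo's actual implementation):
import os
import torch

def save_checkpoint(state, model_path, model_filename):
    """Serialize a training-state dict to model_path/model_filename."""
    os.makedirs(model_path, exist_ok=True)
    torch.save(state, os.path.join(model_path, model_filename))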
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = range(int(args.num_train_epochs))
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)

    def _get_output(_batch):
        inputs = {'input_ids': _batch[0],
                  'attention_mask': _batch[1],
                  # XLM doesn't use segment_ids
                  'token_type_ids': _batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                  'labels': _batch[3]}
        return model(**inputs)

    for e in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc=f"Epoch {e + 1}/{int(args.num_train_epochs)}",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            positive_scores = softmax(_get_output(batch[:4])[1], dim=1)[:, 1]
            negative_scores = softmax(_get_output(batch[4:])[1], dim=1)[:, 1]
            cross_entropy_loss = -torch.log(positive_scores) - torch.log(1 - negative_scores)
            hinge_loss = torch.max(torch.tensor(0, dtype=torch.float).to(args.device),
                                   1 - positive_scores + negative_scores)
            loss = (0.5 * cross_entropy_loss + 0.5 * hinge_loss).sum()

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if 0 < args.max_steps < global_step:
                epoch_iterator.close()
                break
        if 0 < args.max_steps < global_step:
            break  # train_iterator is a plain range, so there is nothing to close

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
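
# The loss above combines a pairwise cross-entropy term with a margin (hinge)
# term on positive/negative scores. A tiny self-contained illustration with toy
# scores (values hypothetical):
import torch

pos = torch.tensor([0.9, 0.6])   # P(label=1) for positive examples
neg = torch.tensor([0.2, 0.5])   # P(label=1) for negative examples
ce = -torch.log(pos) - torch.log(1 - neg)        # push pos -> 1, neg -> 0
hinge = torch.clamp(1 - pos + neg, min=0)        # enforce margin pos - neg >= 1
loss = (0.5 * ce + 0.5 * hinge).sum()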
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default='/home/lsy2018/文本匹配/datapro/ubuntu/', type=str, required=False, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--model_type", default='bert', type=str, required=False, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default='/home/lsy2018/文本匹配/uncased_L-12_H-768_A-12', type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default='./result', type=str, required=False, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=256, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_test", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action='store_true', help="Rul evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." 
) parser.add_argument("--eval_steps", default=10, type=int, help="") parser.add_argument("--lstm_hidden_size", default=512, type=int, help="") parser.add_argument("--lstm_layers", default=1, type=int, help="") parser.add_argument("--lstm_dropout", default=0.1, type=float, help="") parser.add_argument("--train_steps", default=200, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=1, type=int, help="text split") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) try: os.makedirs(args.output_dir) except: pass tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=2) # Prepare model model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, args, config=config) if args.fp16: model.half() model.to(device) # if args.local_rank != -1: # try: # from apex.parallel import DistributedDataParallel as DDP # except ImportError: # raise ImportError( # "Please install apex from https://www.github.com/nvidia/apex to use distributed 
and fp16 training.") # # model = DDP(model) # elif args.n_gpu > 1: # model = torch.nn.DataParallel(model) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.do_train: # Prepare data loader train_examples = read_examples(os.path.join(args.data_dir, 'train.txt'), is_training=True) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, args.split_num, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps) num_train_optimization_steps = args.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps) train_dataloader = cycle(train_dataloader) for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if args.do_eval and (step + 1) % ( args.eval_steps * args.gradient_accumulation_steps) == 0: for file in ['dev.txt']: inference_labels = [] scores = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) ID1 = [x.sentence_ID1 for x in eval_examples] ID2 = [x.sentence_ID2 for x in eval_examples] # print(len(ID1),len(ID2)) # exit() all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 count = 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: # ID1_list_eachbatch = ID1[count*args.eval_batch_size:(count+1)*args.eval_batch_size] # ID2_list_eachbatch = ID2[count * args.eval_batch_size:(count + 1) * args.eval_batch_size] input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) scores.append(logits) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = 
np.concatenate(inference_logits, 0) scores = np.concatenate(scores, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) eval_mrr = compute_MRR(scores, gold_labels, ID1, ID2) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'mrr': eval_mrr, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model itself output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80)
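# --- Illustrative sketch (hypothetical names, not from the script above): the loop
# above measures training in optimizer steps rather than epochs by wrapping the
# DataLoader in itertools.cycle, and divides each loss by
# gradient_accumulation_steps so the accumulated gradients match one large batch.
import itertools

import torch.nn.functional as F


def step_based_training(model, dataloader, optimizer, num_steps, accum_steps=2):
    """Run num_steps micro-batches, updating the weights every accum_steps."""
    data_iter = itertools.cycle(dataloader)  # restarts the loader when exhausted
    model.train()
    for step in range(num_steps):
        inputs, labels = next(data_iter)
        loss = F.cross_entropy(model(inputs), labels)
        (loss / accum_steps).backward()  # scale so accumulated grads average
        if (step + 1) % accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()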
def train(self, model, model_name, B, N_for_train, N_for_eval, K, Q, na_rate=0, learning_rate=1e-1, lr_step_size=20000, weight_decay=1e-5, train_iter=30000, val_iter=1000, val_step=2000, test_iter=3000, load_ckpt=None, save_ckpt=None, pytorch_optim=optim.SGD, bert_optim=False, warmup=True, warmup_step=300, grad_iter=1, fp16=False, pair=False, adv_dis_lr=1e-1, adv_enc_lr=1e-1): ''' model: a FewShotREModel instance model_name: Name of the model B: Batch size N: Num of classes for each batch K: Num of instances for each class in the support set Q: Num of instances for each class in the query set ckpt_dir: Directory of checkpoints learning_rate: Initial learning rate lr_step_size: Decay learning rate every lr_step_size steps weight_decay: Rate of decaying weight train_iter: Num of iterations of training val_iter: Num of iterations of validating val_step: Validate every val_step steps test_iter: Num of iterations of testing ''' print("Start training...") # Init if bert_optim: print('Use bert optim!') parameters_to_optimize = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] parameters_to_optimize = [ {'params': [p for n, p in parameters_to_optimize if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in parameters_to_optimize if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(parameters_to_optimize, lr=2e-5, correct_bias=False) if self.adv: optimizer_encoder = AdamW(parameters_to_optimize, lr=1e-5, correct_bias=False) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_step, t_total=train_iter) else: optimizer = pytorch_optim(model.parameters(), learning_rate, weight_decay=weight_decay) if self.adv: optimizer_encoder = pytorch_optim(model.parameters(), lr=adv_enc_lr) scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=lr_step_size) if self.adv: optimizer_dis = pytorch_optim(self.d.parameters(), lr=adv_dis_lr) if load_ckpt: state_dict = self.__load_model__(load_ckpt)['state_dict'] own_state = model.state_dict() for name, param in state_dict.items(): if name not in own_state: continue own_state[name].copy_(param) start_iter = 0 else: start_iter = 0 if fp16: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') model.train() if self.adv: self.d.train() # Training best_acc = 0 not_best_count = 0 # Stop training after several epochs without improvement. 
iter_loss = 0.0 iter_loss_dis = 0.0 iter_right = 0.0 iter_right_dis = 0.0 iter_sample = 0.0 for it in range(start_iter, start_iter + train_iter): if pair: batch, label = next(self.train_data_loader) if torch.cuda.is_available(): for k in batch: batch[k] = batch[k].cuda() label = label.cuda() logits, pred = model(batch, N_for_train, K, Q * N_for_train + na_rate * Q) else: support, query, label = next(self.train_data_loader) if torch.cuda.is_available(): for k in support: support[k] = support[k].cuda() for k in query: query[k] = query[k].cuda() label = label.cuda() logits, pred = model(support, query, N_for_train, K, Q * N_for_train + na_rate * Q) loss = model.loss(logits, label) / float(grad_iter) right = model.accuracy(pred, label) if fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() # torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 10) else: loss.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), 10) if it % grad_iter == 0: optimizer.step() scheduler.step() optimizer.zero_grad() # Adv part if self.adv: support_adv = next(self.adv_data_loader) if torch.cuda.is_available(): for k in support_adv: support_adv[k] = support_adv[k].cuda() features_ori = model.sentence_encoder(support) features_adv = model.sentence_encoder(support_adv) features = torch.cat([features_ori, features_adv], 0) total = features.size(0) dis_labels = torch.cat([torch.zeros((total//2)).long().cuda(), torch.ones((total//2)).long().cuda()], 0) dis_logits = self.d(features) loss_dis = self.adv_cost(dis_logits, dis_labels) _, pred = dis_logits.max(-1) right_dis = float((pred == dis_labels).long().sum()) / float(total) loss_dis.backward(retain_graph=True) optimizer_dis.step() optimizer_dis.zero_grad() optimizer_encoder.zero_grad() loss_encoder = self.adv_cost(dis_logits, 1 - dis_labels) loss_encoder.backward(retain_graph=True) optimizer_encoder.step() optimizer_dis.zero_grad() optimizer_encoder.zero_grad() iter_loss_dis += self.item(loss_dis.data) iter_right_dis += right_dis iter_loss += self.item(loss.data) iter_right += self.item(right.data) iter_sample += 1 if self.adv: sys.stdout.write('step: {0:4} | loss: {1:2.6f}, accuracy: {2:3.2f}%, dis_loss: {3:2.6f}, dis_acc: {4:2.6f}' .format(it + 1, iter_loss / iter_sample, 100 * iter_right / iter_sample, iter_loss_dis / iter_sample, 100 * iter_right_dis / iter_sample) +'\r') else: sys.stdout.write('step: {0:4} | loss: {1:2.6f}, accuracy: {2:3.2f}%'.format(it + 1, iter_loss / iter_sample, 100 * iter_right / iter_sample) +'\r') sys.stdout.flush() if (it + 1) % val_step == 0: acc = self.eval(model, B, N_for_eval, K, Q, val_iter, na_rate=na_rate, pair=pair) model.train() if acc > best_acc: print('Best checkpoint') torch.save({'state_dict': model.state_dict()}, save_ckpt) best_acc = acc iter_loss = 0. iter_loss_dis = 0. iter_right = 0. iter_right_dis = 0. iter_sample = 0. print("\n####################\n") print("Finish training " + model_name)
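# --- Minimal sketch of the adversarial feature-alignment step above (hypothetical
# encoder/discriminator/optimizers; the original instead reuses one set of logits
# with retain_graph). The discriminator learns to separate original-support
# features from adversarial-support features, and the encoder is then trained on
# flipped labels (1 - dis_labels) so its features fool the updated discriminator.
import torch
import torch.nn.functional as F


def adversarial_alignment_step(encoder, discriminator, opt_enc, opt_dis,
                               batch_src, batch_adv):
    feats = torch.cat([encoder(batch_src), encoder(batch_adv)], 0)
    labels = torch.cat([torch.zeros(batch_src.size(0), dtype=torch.long),
                        torch.ones(batch_adv.size(0), dtype=torch.long)], 0)
    # 1) discriminator step: features detached so only the discriminator moves
    loss_dis = F.cross_entropy(discriminator(feats.detach()), labels)
    opt_dis.zero_grad()
    loss_dis.backward()
    opt_dis.step()
    # 2) encoder step: same classifier, flipped targets
    loss_enc = F.cross_entropy(discriminator(feats), 1 - labels)
    opt_enc.zero_grad()
    loss_enc.backward()
    opt_enc.step()
    return loss_dis.item(), loss_enc.item()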
def main(): parser = argparse.ArgumentParser() parser = add_xlmr_args(parser) args = parser.parse_args() if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) processor = NerProcessor() label_list = processor.get_labels() num_labels = len(label_list) + 1 # add one for IGNORE label train_examples = None num_train_optimization_steps = 0 if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs # preparing model configs hidden_size = 768 if 'base' in args.pretrained_path else 1024 # TODO: move this inside model.__init__ device = 'cuda' if (torch.cuda.is_available() and not args.no_cuda) else 'cpu' # creating model model = XLMRForTokenClassification(pretrained_path=args.pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=0.2, device=device) #-- dropout 0.2 model.to(device) no_decay = ['bias', 'final_layer_norm.weight'] params = list(model.named_parameters()) optimizer_grouped_parameters = [{ 'params': [p for n, p in params if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] warmup_steps = int(args.warmup_proportion * num_train_optimization_steps) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps) # freeze model if necessary if args.freeze_model: logger.info("Freezing XLM-R model...") for n, p in model.named_parameters(): if 'xlmr' in n and p.requires_grad: p.requires_grad = False if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) global_step = 0 nb_tr_steps = 0 tr_loss = 0 label_map = {i: label for i, label in enumerate(label_list, 1)} if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, model.encode_word) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) train_data = create_dataset(train_features) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) # getting validation samples val_examples = processor.get_dev_examples(args.data_dir) val_features = convert_examples_to_features(val_examples, label_list, args.max_seq_length, model.encode_word) val_data = create_dataset(val_features) best_val_f1 = 0.0 for _ in tqdm(range(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 tbar = tqdm(train_dataloader, desc="Iteration") model.train() for step, batch in enumerate(tbar): batch = tuple(t.to(device) for t in batch) input_ids, label_ids, l_mask, valid_ids, = batch loss = model(input_ids, label_ids, l_mask, valid_ids) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 tbar.set_description('Loss = %.4f' % (tr_loss / (step + 1))) if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 logger.info("\nTesting on validation set...") f1, report = evaluate_model(model, val_data, label_list, args.eval_batch_size, device) if f1 > best_val_f1: best_val_f1 = f1 logger.info( "\nFound better f1=%.4f on validation set. Saving model\n" % (f1)) logger.info("%s\n" % (report)) torch.save( model.state_dict(), open(os.path.join(args.output_dir, 'model.pt'), 'wb')) else: logger.info("\nNo better F1 score: {}\n".format(f1)) else: # load a saved model state_dict = torch.load( open(os.path.join(args.output_dir, 'model.pt'), 'rb')) model.load_state_dict(state_dict) logger.info("Loaded saved model") model.to(device) if args.do_eval: if args.eval_on == "dev": eval_examples = processor.get_dev_examples(args.data_dir) elif args.eval_on == "test": eval_examples = processor.get_test_examples(args.data_dir) else: raise ValueError("eval on dev or test set only") eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, model.encode_word) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) eval_data = create_dataset(eval_features) f1_score, report = evaluate_model(model, eval_data, label_list, args.eval_batch_size, device) logger.info("\n%s", report) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Writing results to file *****") writer.write(report) logger.info("Done.")
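# --- Sketch of the backward/clip pattern used in the loop above (requires NVIDIA
# apex when use_fp16=True): with amp, gradients must be clipped on
# amp.master_params(optimizer), because the fp32 master copies -- not the fp16
# model parameters -- are what the optimizer actually updates.
import torch


def backward_and_clip(loss, model, optimizer, max_grad_norm, use_fp16=False):
    if use_fp16:
        from apex import amp
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
    else:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)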
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) warmup_steps = args.warmup_steps if args.warmup_steps >= 1 else int(t_total * args.warmup_steps) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() # train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) train_iterator = range(int(args.num_train_epochs)) set_seed(args) # Added here for reproductibility (even between python 2 and 3) first_time = time.time() best_result = 0.0 for idx_epoch in train_iterator: # epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) epoch_iterator = train_dataloader preds = None out_label_ids = None for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) input_ids, attention_mask, token_type_ids, labels = batch[0], batch[1], batch[2], batch[3] inputs = {'input_ids': input_ids, 'attention_mask': attention_mask, 'token_type_ids': token_type_ids if args.model_type in ['bert', 'xlnet'] \ and not args.no_segment else None, # XLM and RoBERTa don't use segment_ids 'labels': labels} outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() total_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if preds is None: preds = outputs[1].detach().cpu().numpy() out_label_ids = inputs['labels'].detach().cpu().numpy() else: preds = np.append(preds, outputs[1].detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer, data_type="dev", prefix=global_step) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) # current loss cur_loss = (tr_loss - logging_loss) / args.logging_steps tb_writer.add_scalar('loss', cur_loss, global_step) logging_loss = tr_loss # print log log_string = "Job_{}:".format(args.job_id) log_string += " epoch={:<3d}".format(idx_epoch) log_string += " step={:<8d}".format(global_step) log_string += " batch={:<4d}".format(labels.shape[0]) log_string += " lr={:<10.7f}".format(scheduler.get_lr()[0]) log_string += " train_loss={:<8.5f}".format(cur_loss) log_string += " |g|={:<10.7f}".format(total_norm) # calculate accuracy if args.output_mode == "classification": preds = np.argmax(preds, axis=1) elif args.output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(args.task_name, preds, 
out_label_ids) for key in sorted(result.keys()): log_string += " {}_{}={:<8.5f}".format("train", key, result[key]) log_string += " mins={:<9.2f}".format(float(time.time() - first_time) / 60) logger.info(log_string) preds = None out_label_ids = None if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank == -1 and not args.evaluate_during_training and args.evaluate_after_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer, data_type="dev", prefix=global_step) metrics = result_for_sorting(args.task_name, results) if metrics >= best_result: best_result = metrics # Save model checkpoint output_dir = os.path.join(args.output_dir, 'best') if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
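# --- For reference, a hand-rolled equivalent of the WarmupLinearSchedule used
# throughout these scripts: the LR ramps linearly from 0 over warmup_steps, then
# decays linearly to 0 at t_total. Sketch only; the library class behaves the same.
import torch


def warmup_linear_lambda(warmup_steps, t_total):
    def lr_lambda(step):
        if step < warmup_steps:
            return float(step) / float(max(1, warmup_steps))
        return max(0.0, float(t_total - step) / float(max(1, t_total - warmup_steps)))
    return lr_lambda

# usage: scheduler = torch.optim.lr_scheduler.LambdaLR(
#            optimizer, warmup_linear_lambda(warmup_steps, t_total))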
def main(rank, args): #print('params: '," T_warm: ",T_warm," all_iteration: ",all_iteration," lr: ",lr) #writer = SummaryWriter('./model_snapshot_error') # cuda_list=range(cuda_num) #cuda_list=[2,1,0] #print('run') random.seed(1) np.random.seed(1) torch.manual_seed(1) torch.cuda.manual_seed(1) torch.set_num_threads(1) config = json.load(open(args.config_file, 'r', encoding="utf-8")) model = Model_Hotpot2(args, config) #model.network.to(cudaid) param_optimizer = list(model.network.named_parameters()) param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=config["training"]["learning_rate"]) scheduler = WarmupLinearSchedule( optimizer, warmup_steps=config["training"]["warmup_proportion"], t_total=config["training"]["total_training_steps"]) #cudaid=dist.get_rank() cudaid = rank model.network.to(cudaid) args.device = cudaid #model.cuda(cudaid) accumulation_steps = 16 #model = nn.DataParallel(model, device_ids=cuda_list) print('rank: ', rank) accum_batch_loss = 1 #train_file='train_ms_roberta_plain_pair_sample_shuffle.txt' train_file = 'train_ms_transformer_xh_shuffle.txt' #train_file='train_ms_roberta.txt' #iterator=NewsIterator(batch_size=24, npratio=4) #for epoch in range(0,100): batch_t = 0 iteration = 0 #print('train...',cuda_list) pre_batch_t = 177796 epoch = 0 model.train() for epoch in range(0, 10): #while True: all_loss = 0 all_batch = 0 data_batch = utils.get_batch_dist(train_file, 8, rank) for batch in data_batch: #g,imp_index,label #batch=imp_index , g , label #print('batch: ',batch) batch_t += 1 #batch[0] = batch[0].to(torch.device('cuda:0')) # g=batch[0].to(torch.device('cuda:'+str(cudaid))) g = batch[0].to(torch.device('cuda:' + str(cudaid))) logit = model.network((g, batch[-1]), cudaid) label = batch[-1].cuda(cudaid) #print('logit: ',logit.shape) #pos_node_idx = [i for i in range(batch[2].size(0)) if batch[1][i].item() != -1] #print('????label:',batch[0].ndata['label']) #pos_node_idx=[i for i in range(batch[0].ndata['label'].size(0)) if batch[0].ndata['label'][i].item()!=-1 ] #print('pos_node_idx: ',pos_node_idx) # logit=logit[pos_node_idx].reshape(-1,2) logit = logit.reshape(-1, 2) #print('logit: ',logit.shape) loss = F.nll_loss( F.log_softmax( logit.view(-1, logit.size(-1)), dim=-1, dtype=torch.float32, ), label.view(-1), reduction='sum', #ignore_index=self.padding_idx, ) #sample_size=float(sample_size.sum()) #loss=loss.sum()/sample_size/math.log(2) loss = loss / len(batch[1]) / math.log(2) # sample_size=float(sample_size) # loss=loss/sample_size/math.log(2) #print(' batch_t: ',batch_t, ' epoch: ',epoch,' loss: ',float(loss)) accum_batch_loss += float(loss) all_loss += float(loss) all_batch += 1 loss = loss / accumulation_steps loss.backward() if (batch_t) % accumulation_steps == 0: #print('candidate_id: ',candidate_id) # total_norm=0 # for p in model.network.parameters(): # if p.grad==None: # print('error: ',index,p.size(),p.grad) # param_norm = p.grad.data.norm(2) # total_norm += param_norm.item() ** 2 # total_norm = total_norm ** (1. 
/ 2) # total_clip_norm=0 # for p in model.network.parameters(): # if p.grad==None: # print('error: ',index,p.size(),p.grad) # param_norm = p.grad.data.norm(2) # total_clip_norm += param_norm.item() ** 2 # total_clip_norm = total_clip_norm ** (1. / 2) iteration += 1 #adjust_learning_rate(optimizer,iteration) average_gradients(model) torch.nn.utils.clip_grad_norm_(model.network.parameters(), 1.0) optimizer.step() scheduler.step() optimizer.zero_grad() #print(' batch_t: ',batch_t, ' iteration: ', iteration, ' epoch: ',epoch,' accum_batch_loss: ',accum_batch_loss/accumulation_steps, " total_norm: ", total_norm,' clip_norm: ',total_clip_norm) if rank == 0: print(' batch_t: ', batch_t, ' iteration: ', iteration, ' epoch: ', epoch, ' accum_batch_loss: ', accum_batch_loss / accumulation_steps) torch.save(model.network.state_dict(), './models/transformer_xh' + str(epoch) + '.pkl') accum_batch_loss = 0 dist.barrier()
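# --- The helper average_gradients() called above is not shown here; one plausible
# implementation (the classic manual alternative to DistributedDataParallel) is to
# all-reduce every gradient and divide by the world size before optimizer.step().
# Assumes it is handed the raw nn.Module (here, model.network).
import torch.distributed as dist


def average_gradients(network):
    world_size = float(dist.get_world_size())
    for param in network.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size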
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default='/hdd/lujunyu/dataset/multi_turn_corpus/douban/', type=str, required=False, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--task_name", default='ubuntu', type=str, required=False, help="The name of the task to train.") parser.add_argument( "--output_dir", default='/hdd/lujunyu/model/chatbert/douban_roberta_si_aug/', type=str, required=False, help="The output directory where the model checkpoints will be written." ) parser.add_argument( "--dialog_augmentation_path", default= '/hdd/lujunyu/dataset/multi_turn_corpus/douban/train_augment_3.txt', ## train_augment_3.txt type=str, help="Whether to use augmentation") ## Other parameters parser.add_argument( "--init_model_path", default='/hdd/lujunyu/dataset/bert/chinese_roberta_small_pytorch/', type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument( "--max_seq_length", default=256, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=True, action='store_true', help="Whether to run training.") parser.add_argument("--do_test", default=True, action='store_true', help="Whether to run eval on the test set.") parser.add_argument("--train_batch_size", default=500, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=100, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-6, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=10.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_steps", default=0.0, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--weight_decay", default=1e-3, type=float, help="weight_decay") parser.add_argument("--save_checkpoints_steps", default=10000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=10, help= "Number of updates steps to accumualte before performing a backward/update pass." 
) args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_pretrained(args.init_model_path + 'config.json', num_labels=2) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): if args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.". format(args.output_dir)) else: os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.init_model_path + 'vocab.txt', do_lower_case=args.do_lower_case) if args.dialog_augmentation_path: train_dataset = DoubanDataset(file_path=args.dialog_augmentation_path, max_seq_length=args.max_seq_length, tokenizer=tokenizer) else: train_dataset = DoubanDataset(file_path=os.path.join( args.data_dir, "train.txt"), max_seq_length=args.max_seq_length, tokenizer=tokenizer) eval_dataset = DoubanDataset(file_path=os.path.join( args.data_dir, "dev.txt"), max_seq_length=args.max_seq_length, tokenizer=tokenizer) train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=args.train_batch_size, sampler=RandomSampler(train_dataset), num_workers=9) eval_dataloader = torch.utils.data.DataLoader( eval_dataset, batch_size=args.eval_batch_size, sampler=SequentialSampler(eval_dataset), num_workers=9) model = BertForSequenceClassification.from_pretrained( '/hdd/lujunyu/model/chatbert/douban_roberta_si_aug_beifen/model.pt', config=bert_config) model.to(device) num_train_steps = None if args.do_train: num_train_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare optimizer param_optimizer = list(model.named_parameters()) # remove pooler, which is not used thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_steps) else: optimizer = None scheduler = None if args.local_rank != -1: model = 
torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 best_acc = 0.0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients scheduler.step() model.zero_grad() global_step += 1 if (step + 1) % args.save_checkpoints_steps == 0: ### Evaluate at the end of epoches model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 logits_all = [] for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model( input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() for i in range(len(logits)): logits_all += [logits[i]] tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1)) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy } output_eval_file = os.path.join(args.output_dir, "eval_results_dev.txt") with open(output_eval_file, "a") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) output_eval_file = os.path.join(args.output_dir, "logits_dev.txt") with open(output_eval_file, "w") as f: for i in range(len(logits_all)): for j in range(len(logits_all[i])): f.write(str(logits_all[i][j])) if j == len(logits_all[i]) - 1: f.write("\n") else: f.write(" ") ### Save the best checkpoint if best_acc < eval_accuracy: try: ### Remove 'module' prefix when using DataParallel state_dict = model.module.state_dict() except AttributeError: state_dict = model.state_dict() torch.save(state_dict, os.path.join(args.output_dir, "model.pt")) best_acc = eval_accuracy logger.info('Saving the best model in {}'.format( os.path.join(args.output_dir, "model.pt"))) model.train()
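# --- Sketch of the checkpointing convention above: under DataParallel the weights
# live beneath a `module.` prefix, so saving model.module.state_dict() keeps the
# checkpoint loadable by an unwrapped model later.
import torch


def save_unwrapped(model, path):
    state_dict = model.module.state_dict() if hasattr(model, 'module') \
        else model.state_dict()
    torch.save(state_dict, path)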
def train(data, model, args): if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(data.train_ids) // args.gradient_accumulation_steps) + 1 else: t_total = ( len(data.train_ids) // args.batch_size ) // args.gradient_accumulation_steps * args.num_train_epochs no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=int(t_total*args.warmup_proportion), t_total=t_total) scheduler = WarmupConstantSchedule(optimizer, warmup_steps=int( t_total * args.warmup_proportion)) best_dev = -1 o_label = data.label_alphabet.get_index("O") for idx in range(args.num_train_epochs): epoch_start = time.time() temp_start = epoch_start print("Epoch: %s/%s" % (idx, args.num_train_epochs)) instance_count = 0 batch_loss = 0 sample_loss = 0 total_loss = 0 random.shuffle(data.train_ids) model.train() model.zero_grad() batch_size = args.batch_size train_num = len(data.train_ids) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = data.train_ids[start:end] if not instance: continue model.zero_grad() input_ids, attention_mask, label_seq_tensor, loss_mask, crf_mask, scope = batchify( instance, args, o_label) loss, best_path = model.neg_log_likelihood(input_ids, attention_mask, label_seq_tensor, crf_mask, scope) instance_count += 1 sample_loss += loss.item() total_loss += loss.item() batch_loss += loss loss.backward() if args.use_clip: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) if (batch_id + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() if end % 100 == 0: temp_time = time.time() temp_cost = temp_time - temp_start temp_start = temp_time print(" Instance: %s; Time: %.2fs; loss: %.4f" % (end, temp_cost, sample_loss)) sys.stdout.flush() sample_loss = 0 temp_time = time.time() temp_cost = temp_time - temp_start print(" Instance: %s; Time: %.2fs; loss: %.4f" % (end, temp_cost, sample_loss)) epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start print( "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s" % (idx, epoch_cost, train_num / epoch_cost, total_loss)) speed, acc, p, r, f, _ = evaluate(data, model, args, "test") dev_finish = time.time() dev_cost = dev_finish - epoch_finish current_score = f print( "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (dev_cost, speed, acc, p, r, f)) if current_score > best_dev: print("Exceed previous best f score:", best_dev) if not os.path.exists(args.param_stored_directory + args.dataset_name + "_param"): os.makedirs(args.param_stored_directory + args.dataset_name + "_param") model_name = "{}epoch_{}_f1_{}.model".format( args.param_stored_directory + args.dataset_name + "_param/", idx, current_score) # torch.save(model.state_dict(), model_name) best_dev = current_score gc.collect()
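# --- Sketch of the manual batching pattern in train() above (illustrative names):
# the id list is shuffled once per epoch and sliced into fixed windows, with the
# final short batch kept and empty slices skipped.
import random


def iter_batches(examples, batch_size):
    random.shuffle(examples)
    for start in range(0, len(examples), batch_size):
        batch = examples[start:start + batch_size]
        if batch:
            yield batch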
def __init__(self, params: dict, dataloader: Dataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module): logger.info('Initializing Distiller') self.params = params self.dump_path = params.dump_path self.multi_gpu = params.multi_gpu self.fp16 = params.fp16 self.student = student self.teacher = teacher self.dataloader = dataloader if self.params.n_gpu > 1: self.dataloader.split() self.get_iterator(seed=params.seed) self.temperature = params.temperature assert self.temperature > 0. self.alpha_ce = params.alpha_ce self.alpha_mlm = params.alpha_mlm self.alpha_mse = params.alpha_mse assert self.alpha_ce >= 0. assert self.alpha_mlm >= 0. assert self.alpha_mse >= 0. assert self.alpha_ce + self.alpha_mlm + self.alpha_mse > 0. self.mlm_mask_prop = params.mlm_mask_prop assert 0.0 <= self.mlm_mask_prop <= 1.0 assert params.word_mask + params.word_keep + params.word_rand == 1.0 self.pred_probs = torch.FloatTensor( [params.word_mask, params.word_keep, params.word_rand]) self.pred_probs = self.pred_probs.to( f'cuda:{params.local_rank}' ) if params.n_gpu > 0 else self.pred_probs self.token_probs = token_probs.to( f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs if self.fp16: self.pred_probs = self.pred_probs.half() self.token_probs = self.token_probs.half() self.epoch = 0 self.n_iter = 0 self.n_total_iter = 0 self.n_sequences_epoch = 0 self.total_loss_epoch = 0 self.last_loss = 0 self.last_loss_ce = 0 self.last_loss_mlm = 0 self.last_loss_mse = 0 self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean') self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1) self.mse_loss_fct = nn.MSELoss(reduction='sum') logger.info('--- Initializing model optimizer') assert params.gradient_accumulation_steps >= 1 self.num_steps_epoch = int( len(self.dataloader) / params.batch_size) + 1 num_train_optimization_steps = int( self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1 warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad ], 'weight_decay': params.weight_decay }, { 'params': [ p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad ], 'weight_decay': 0.0 }] logger.info( "------ Number of trainable parameters (student): %i" % sum([ p.numel() for p in self.student.parameters() if p.requires_grad ])) logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()])) self.optimizer = AdamW(optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98)) self.scheduler = WarmupLinearSchedule( self.optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps) if self.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) logger.info( f"Using fp16 training: {self.params.fp16_opt_level} level") self.student, self.optimizer = amp.initialize( self.student, self.optimizer, opt_level=self.params.fp16_opt_level) self.teacher = self.teacher.half() if self.multi_gpu: if self.fp16: from apex.parallel import DistributedDataParallel logger.info( "Using apex.parallel.DistributedDataParallel for distributed training." 
) self.student = DistributedDataParallel(self.student) else: from torch.nn.parallel import DistributedDataParallel logger.info( "Using nn.parallel.DistributedDataParallel for distributed training." ) self.student = DistributedDataParallel( self.student, device_ids=[params.local_rank], output_device=params.local_rank) self.is_master = params.is_master if self.is_master: logger.info('--- Initializing Tensorboard') self.tensorboard = SummaryWriter( log_dir=os.path.join(self.dump_path, 'log', 'train')) self.tensorboard.add_text(tag='config', text_string=str(self.params), global_step=0)
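# --- Sketch of the objective this Distiller combines (the standard distillation
# form, not copied from the class): a temperature-softened KL term between student
# and teacher logits, scaled by T**2 so gradients stay comparable across
# temperatures, mixed with the hard-label MLM cross-entropy via the alpha weights.
import torch.nn.functional as F


def distillation_loss(student_logits, teacher_logits, labels,
                      T=2.0, alpha_ce=0.5, alpha_mlm=0.5):
    loss_ce = F.kl_div(F.log_softmax(student_logits / T, dim=-1),
                       F.softmax(teacher_logits / T, dim=-1),
                       reduction='batchmean') * (T ** 2)
    loss_mlm = F.cross_entropy(student_logits.view(-1, student_logits.size(-1)),
                               labels.view(-1), ignore_index=-1)
    return alpha_ce * loss_ce + alpha_mlm * loss_mlm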
def train(model, criterion, dataset, logger, train_csv_logger, val_csv_logger, test_csv_logger, args, epoch_offset): model = model.cuda() # process generalization adjustment stuff adjustments = [float(c) for c in args.generalization_adjustment.split(',')] assert len(adjustments) in (1, dataset['train_data'].n_groups) if len(adjustments)==1: adjustments = np.array(adjustments* dataset['train_data'].n_groups) else: adjustments = np.array(adjustments) train_loss_computer = LossComputer( criterion, is_robust=args.robust, dataset=dataset['train_data'], alpha=args.alpha, gamma=args.gamma, adj=adjustments, step_size=args.robust_step_size, normalize_loss=args.use_normalized_loss, btl=args.btl, min_var_weight=args.minimum_variational_weight) # BERT uses its own scheduler and optimizer if args.model == 'bert': no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW( optimizer_grouped_parameters, lr=args.lr, eps=args.adam_epsilon) t_total = len(dataset['train_loader']) * args.n_epochs print(f'\nt_total is {t_total}\n') scheduler = WarmupLinearSchedule( optimizer, warmup_steps=args.warmup_steps, t_total=t_total) else: optimizer = torch.optim.SGD( filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) # TODO: strong L2 if args.scheduler: scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, 'min', factor=0.1, patience=5, threshold=0.0001, min_lr=0, eps=1e-08) else: scheduler = None best_val_acc = 0 for epoch in range(epoch_offset, epoch_offset+args.n_epochs): logger.write('\nEpoch [%d]:\n' % epoch) logger.write(f'Training:\n') run_epoch( epoch, model, optimizer, dataset['train_loader'], train_loss_computer, logger, train_csv_logger, args, is_training=True, show_progress=args.show_progress, log_every=args.log_every, scheduler=scheduler) logger.write(f'\nValidation:\n') val_loss_computer = LossComputer( criterion, is_robust=args.robust, dataset=dataset['val_data'], step_size=args.robust_step_size, alpha=args.alpha) run_epoch( epoch, model, optimizer, dataset['val_loader'], val_loss_computer, logger, val_csv_logger, args, is_training=False) # Test set; don't print to avoid peeking if dataset['test_data'] is not None: test_loss_computer = LossComputer( criterion, is_robust=args.robust, dataset=dataset['test_data'], step_size=args.robust_step_size, alpha=args.alpha) run_epoch( epoch, model, optimizer, dataset['test_loader'], test_loss_computer, None, test_csv_logger, args, is_training=False) # Inspect learning rates if (epoch+1) % 1 == 0: for param_group in optimizer.param_groups: curr_lr = param_group['lr'] logger.write('Current lr: %f\n' % curr_lr) if args.scheduler and args.model != 'bert': if args.robust: val_loss, _ = val_loss_computer.compute_robust_loss_greedy(val_loss_computer.avg_group_loss, val_loss_computer.avg_group_loss) else: val_loss = val_loss_computer.avg_actual_loss scheduler.step(val_loss) #scheduler step to update lr at the end of epoch if epoch % args.save_step == 0: torch.save(model, os.path.join(args.log_dir, '%d_model.pth' % epoch)) if args.save_last: torch.save(model, os.path.join(args.log_dir, 'last_model.pth')) if args.save_best: if args.robust or args.reweight_groups: curr_val_acc = min(val_loss_computer.avg_group_acc) else: curr_val_acc = 
val_loss_computer.avg_acc logger.write(f'Current validation accuracy: {curr_val_acc}\n') if curr_val_acc > best_val_acc: best_val_acc = curr_val_acc torch.save(model, os.path.join(args.log_dir, 'best_model.pth')) logger.write(f'Best model saved at epoch {epoch}\n') if args.automatic_adjustment: gen_gap = val_loss_computer.avg_group_loss - train_loss_computer.exp_avg_loss adjustments = gen_gap * torch.sqrt(train_loss_computer.group_counts) train_loss_computer.adj = adjustments logger.write('Adjustments updated\n') for group_idx in range(train_loss_computer.n_groups): logger.write( f' {train_loss_computer.get_group_name(group_idx)}:\t' f'adj = {train_loss_computer.adj[group_idx]:.3f}\n') logger.write('\n')
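# --- Simplified sketch of the group-robust objective behind LossComputer (not the
# repo's implementation): per-group mean losses receive multiplicative-weight
# updates, so the currently worst group dominates the loss the model minimizes.
import torch


def group_dro_loss(per_sample_loss, group_ids, adv_probs, step_size=0.01):
    n_groups = adv_probs.numel()
    group_loss = per_sample_loss.new_zeros(n_groups)
    for g in range(n_groups):
        mask = group_ids == g
        if mask.any():
            group_loss[g] = per_sample_loss[mask].mean()
    adv_probs = adv_probs * torch.exp(step_size * group_loss.detach())
    adv_probs = adv_probs / adv_probs.sum()  # renormalize the group weights
    return group_loss @ adv_probs, adv_probs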
def load_or_create_model(args, config_class, model_class, label_count_info, n_gpu, device, num_train_optimization_steps): """ Load model or create model from scratch. Params: `args`: arguments for model and task information `label_count_info`: a dictionary mapping task name to its label count. `n_gpu`: an integer denoting the number of GPUs available `device`: either cuda:n or cpu `num_train_optimization_steps`: an integer denoting the total number of iterations (batches) """ model = _build_pretrained_model(args, config_class, model_class, label_count_info) # for transfer learning setup, load pretrained weights in specified path if args.transfer_from is not None: transfer_path = data_utils.TRANSFER_PATH[args.transfer_from] logger.info("Transfer from %s" % args.transfer_from) transfer_model = torch.load(transfer_path) if "model_state" in transfer_model: transfer_model = transfer_model["model_state"] # TODO: needs work to be able to determine whether only loading the # BertModel part (as opposed to loading everything including the # classifier) model.load_state_dict(transfer_model) model.to(device) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) if n_gpu > 1: model = torch.nn.DataParallel(model) return model, optimizer, scheduler
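# --- Hypothetical sketch of the partial loading the TODO above asks for: keep only
# checkpoint tensors whose names and shapes match the destination model (e.g. the
# BertModel encoder) and load with strict=False, leaving the classifier at its
# fresh initialization.
def load_matching_weights(model, checkpoint_state):
    own = model.state_dict()
    filtered = {k: v for k, v in checkpoint_state.items()
                if k in own and v.shape == own[k].shape}
    model.load_state_dict(filtered, strict=False)
    return sorted(set(own) - set(filtered))  # names left at their fresh init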
def event_tagger(): # Read event data en_train = read_event_data('en/train.txt') en_dev = read_event_data('en/dev.txt') en_test = read_event_data('en/test.txt') it_train = read_event_data('it/train.txt') it_dev = read_event_data('it/dev.txt') it_test = read_event_data('it/test.txt') print('English TimeML:', len(en_train), len(en_dev), len(en_test)) print('Italian News:', len(it_train), len(it_dev), len(it_test)) tags = list(set(word_label[1] for sent in it_train for word_label in sent)) print(len(tags)) # By convention, the 0'th slot is reserved for padding. tags = ["<pad>"] + tags tag2idx = {tag: idx for idx, tag in enumerate(tags)} idx2tag = {idx: tag for idx, tag in enumerate(tags)} print(tag2idx) print(idx2tag) device = 'cuda' if torch.cuda.is_available() else 'cpu' tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False) model = Net(vocab_size=len(tag2idx), device=device) model.to(device) model = nn.DataParallel(model) # One fine-tuning step train_dataset = EventDataset(en_train, tokenizer, tag2idx) train_iter = data.DataLoader(dataset=train_dataset, batch_size=8, shuffle=True, num_workers=1, collate_fn=pad) eval_dataset = EventDataset(it_test, tokenizer, tag2idx) test_iter = data.DataLoader(dataset=eval_dataset, batch_size=8, shuffle=False, num_workers=1, collate_fn=pad) criterion = nn.CrossEntropyLoss(ignore_index=0) num_epoch = 1 base_lr = 0.001 decay_factor = 0.2 discriminative_fine_tuning = True gradual_unfreezing = False # params order top to bottom group_to_discriminate = ['classifier', 'bert'] no_decay = ['bias', 'LayerNorm.weight'] if discriminative_fine_tuning: optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and not 'bert' in n ], 'layers': [ n for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and not 'bert' in n ], 'lr': 0.001, 'name': 'classifier.decay', 'weight_decay': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and not 'bert' in n ], 'layers': [ n for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and not 'bert' in n ], 'lr': 0.001, 'name': 'classifier.no_decay', 'weight_decay': 0.0 }, { 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and 'bert' in n ], 'layers': [ n for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and 'bert' in n ], 'lr': 0.00002, 'name': 'bert.decay', 'weight_decay': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and 'bert' in n ], 'layers': [ n for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and 'bert' in n ], 'lr': 0.00002, 'name': 'bert.no_decay', 'weight_decay': 0.0 }] else: optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=len(train_iter) * num_epoch // 10, t_total=len(train_iter) * num_epoch) for e in range(num_epoch): unfreeze = (True, False)[e != 0] if discriminative_fine_tuning and gradual_unfreezing: for pg in optimizer.param_groups: layers = '' for layer in pg['layers']: layers += layer + ';' # print('epoch: {}, Layers: {}'.format(e, layers)) if 'bert' in pg['name']: for param in 
pg['params']: param.requires_grad = unfreeze loss = train(model, train_iter, optimizer, scheduler, criterion) acc = eval(model, test_iter, idx2tag) print("epoch: {}, loss: {}".format(e, loss)) print("epoch: {}, acc: {}".format(e, acc)) ''' ## Second fine-tuning step (epoch=1) train_dataset = EventDataset(it_train, tokenizer, tag2idx) for e in range(num_epoch): unfreeze = (True, False)[e != 0] if discriminative_fine_tuning and gradual_unfreezing: for pg in optimizer.param_groups: layers = '' for layer in pg['layers']: layers += layer + ';' # print('epoch: {}, Layers: {}'.format(e, layers)) if 'bert' in pg['name']: for param in pg['params']: param.requires_grad = unfreeze loss = train(model, train_iter, optimizer, scheduler, criterion) acc = eval(model, test_iter, idx2tag) print("epoch: {}, loss: {}".format(e, loss)) print("epoch: {}, acc: {}".format(e, acc)) ''' calculate_acc() calculate_f1()
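# --- Sketch of the discriminative fine-tuning set up above: AdamW honors a
# per-group 'lr', so the freshly initialized classifier head can train orders of
# magnitude faster (1e-3) than the pretrained encoder (2e-5). Illustrative helper
# only; the script builds its groups inline with extra bookkeeping keys.
def discriminative_param_groups(model, head_lr=1e-3, encoder_lr=2e-5):
    head, encoder = [], []
    for name, param in model.named_parameters():
        (encoder if 'bert' in name else head).append(param)
    return [{'params': head, 'lr': head_lr},
            {'params': encoder, 'lr': encoder_lr}]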
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model_name', type=str, default='gpt2-medium', help='pretrained model name') parser.add_argument("--do_train", action='store_true', default=True, help="Whether to run training.") parser.add_argument( "--output_dir", default='fintuned_gpt', type=str, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument('--dataset', type=str, default='', required=True) parser.add_argument('--seed', type=int, default=42) parser.add_argument('--opt_level', type=str, default='O1') parser.add_argument('--num_train_epochs', type=int, default=3) parser.add_argument('--train_batch_size', type=int, default=8) parser.add_argument('--eval_batch_size', type=int, default=8) parser.add_argument('--num_prior', type=int, default=2) parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument('--max_grad_norm', type=int, default=1) parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training \ steps to perform. Override num_train_epochs.") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before\ performing a backward/update pass.") parser.add_argument('--learning_rate', type=float, default=6.25e-5) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument('--lr_schedule', type=str, default='warmup_linear') parser.add_argument('--weight_decay', type=float, default=0.01) parser.add_argument('--lm_coef', type=float, default=0.9) parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() print(args) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}".format(device, n_gpu)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Load tokenizer and model # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned on the RocStories dataset. 
# start_token, delimiter_token, clf_token special_tokens_dict = { 'cls_token': '<|cls|>', 'unk_token': '<|unk|>', 'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'sep_token': '<|endoftext|>' } tokenizer = GPT2Tokenizer.from_pretrained(args.model_name) num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) print('We have added', num_added_toks, 'tokens') #start_token, delimiter_token, clf_token special_tokens_ids = list( tokenizer.convert_tokens_to_ids(token) for token in ['<|endoftext|>', '<|endoftext|>', '<|cls|>']) model = GPT2DoubleHeadsModel.from_pretrained(args.model_name) model.resize_token_embeddings(len(tokenizer)) model.to(device) def tokenize_and_encode(obj): """ Tokenize and encode a nested object """ if isinstance(obj, str): return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj)) elif isinstance(obj, int): return obj return list(tokenize_and_encode(o) for o in obj) logger.info("Encoding dataset...") train_dataset = load_dataset(tokenizer, args.dataset, num_prior=args.num_prior) eval_dataset = load_dataset(tokenizer, args.dataset, num_prior=args.num_prior) datasets = (train_dataset, eval_dataset) encoded_datasets = tokenize_and_encode(datasets) # Compute the max input length for the Transformer max_length = model.config.n_positions // 2 - 2 input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 \ for dataset in encoded_datasets for story, cont1, cont2, _ in dataset) input_length = min(input_length, model.config.n_positions ) # Max size of input for the pre-trained model # Prepare inputs tensors and dataloaders tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids) train_tensor_dataset, eval_tensor_dataset = tensor_datasets[ 0], tensor_datasets[1] train_data = TensorDataset(*train_tensor_dataset) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) eval_data = TensorDataset(*eval_tensor_dataset) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Prepare optimizer if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps //\ (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader)\ // args.gradient_accumulation_steps * args.num_train_epochs param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, verbosity=1) nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None model.train() for i, _ in enumerate(range(int(args.num_train_epochs))): print('Starting Epoch: {} of {}'.format( str(i + 1), str(int(args.num_train_epochs)))) tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels = batch losses = model(input_ids, 
                           mc_token_ids, lm_labels, mc_labels)
            loss = args.lm_coef * losses[0] + losses[1]
            if args.gradient_accumulation_steps > 1:
                # average the loss over accumulated steps so the effective
                # gradient matches a larger batch
                loss = loss / args.gradient_accumulation_steps
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                # step the optimizer before the scheduler (required since PyTorch 1.1)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            tr_loss += loss.item()
            exp_average_loss = loss.item() if exp_average_loss is None \
                else 0.7 * exp_average_loss + 0.3 * loss.item()
            nb_tr_steps += 1
            tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                exp_average_loss, scheduler.get_lr()[0])
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Save the trained model, configuration and tokenizer
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(args.output_dir)

    # Load the trained model and vocabulary that you have fine-tuned
    model = GPT2DoubleHeadsModel.from_pretrained(args.output_dir)
    tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir)
    model.to(device)
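# --- Added sketch (not from the original script) ---
# Once the fine-tuned model is reloaded, inference reduces to scoring each
# candidate ending and taking the argmax of the multiple-choice logits. A
# minimal sketch, assuming the pytorch-transformers-era GPT2DoubleHeadsModel
# whose forward returns (lm_logits, mc_logits, ...) when called without labels,
# and the '<|cls|>' token added above; `pick_ending` itself is illustrative.
import torch

@torch.no_grad()
def pick_ending(model, tokenizer, story, endings, device):
    cls_id = tokenizer.convert_tokens_to_ids('<|cls|>')
    pad_id = tokenizer.convert_tokens_to_ids('<|endoftext|>')
    choices = [tokenizer.encode(story) + tokenizer.encode(e) + [cls_id] for e in endings]
    max_len = max(len(c) for c in choices)
    # input_ids: (batch=1, num_choices, seq_len); mc_token_ids marks each cls position
    input_ids = torch.full((1, len(choices), max_len), pad_id, dtype=torch.long)
    mc_token_ids = torch.zeros((1, len(choices)), dtype=torch.long)
    for i, c in enumerate(choices):
        input_ids[0, i, :len(c)] = torch.tensor(c)
        mc_token_ids[0, i] = len(c) - 1
    model.eval()
    outputs = model(input_ids.to(device), mc_token_ids=mc_token_ids.to(device))
    mc_logits = outputs[1]  # (1, num_choices)
    return int(mc_logits.argmax(dim=-1))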
def main(args): def worker_init_fn(worker_id): np.random.seed(args.random_seed + worker_id) n_gpu = 0 if torch.cuda.is_available(): n_gpu = torch.cuda.device_count() np.random.seed(args.random_seed) random.seed(args.random_seed) rng = random.Random(args.random_seed) torch.manual_seed(args.random_seed) if n_gpu > 0: torch.cuda.manual_seed(args.random_seed) torch.cuda.manual_seed_all(args.random_seed) torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True if not os.path.exists(args.out_dir): os.mkdir(args.out_dir) ontology = json.load(open(args.ontology_data)) slot_meta, ontology = make_slot_meta(ontology) op2id = OP_SET[args.op_code] tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True) print(op2id) if os.path.exists(args.train_data_path + ".pk"): train_data_raw = load_data(args.train_data_path + ".pk") else: train_data_raw = prepare_dataset(data_path=args.train_data_path, tokenizer=tokenizer, slot_meta=slot_meta, n_history=args.n_history, max_seq_length=args.max_seq_length, op_code=args.op_code) train_data = MultiWozDataset(train_data_raw, tokenizer, slot_meta, args.max_seq_length, rng, ontology, args.word_dropout, args.shuffle_state, args.shuffle_p) print("# train examples %d" % len(train_data_raw)) print(len(train_data)) if os.path.exists(args.dev_data_path + ".pk"): dev_data_raw = load_data(args.dev_data_path + ".pk") else: dev_data_raw = prepare_dataset(data_path=args.dev_data_path, tokenizer=tokenizer, slot_meta=slot_meta, n_history=args.n_history, max_seq_length=args.max_seq_length, op_code=args.op_code) print("# dev examples %d" % len(dev_data_raw)) if os.path.exists(args.test_data_path + ".pk"): test_data_raw = load_data(args.test_data_path + ".pk") else: test_data_raw = prepare_dataset(data_path=args.test_data_path, tokenizer=tokenizer, slot_meta=slot_meta, n_history=args.n_history, max_seq_length=args.max_seq_length, op_code=args.op_code) print("# test examples %d" % len(test_data_raw)) model_config = BertConfig.from_json_file(args.bert_config_path) model_config.dropout = args.dropout model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob model_config.hidden_dropout_prob = args.hidden_dropout_prob model = SomDST(model_config, len(op2id), len(domain2id), op2id['update'], args.exclude_domain) if not os.path.exists(args.bert_ckpt_path): args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path, args.bert_config_path, 'assets') ckpt = torch.load(args.bert_ckpt_path, map_location='cpu') model.encoder.bert.load_state_dict(ckpt) # re-initialize added special tokens ([SLOT], [NULL], [EOS]) model.encoder.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02) model.encoder.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02) model.encoder.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02) model.to(device) num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] enc_param_optimizer = list(model.encoder.named_parameters()) enc_optimizer_grouped_parameters = [ {'params': [p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr) enc_scheduler = WarmupLinearSchedule(enc_optimizer, int(num_train_steps * args.enc_warmup), t_total=num_train_steps) dec_param_optimizer = 
list(model.decoder.parameters())
    dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
    dec_scheduler = WarmupLinearSchedule(dec_optimizer, int(num_train_steps * args.dec_warmup),
                                         t_total=num_train_steps)

    # if n_gpu > 1:
    #     model = torch.nn.DataParallel(model)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn, num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)

    loss_fnc = nn.CrossEntropyLoss()
    best_score = {'epoch': float("-inf"), 'joint_acc_score': float("-inf"),
                  'op_acc': float("-inf"), 'final_slot_f1': float("-inf")}
    for epoch in range(args.n_epochs):
        batch_loss = []
        model.train()
        pbar = tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc="training", ncols=0)
        for step, batch in pbar:
            batch = [b.to(device) if not isinstance(b, int) else b for b in batch]
            input_ids, input_mask, segment_ids, state_position_ids, op_ids, \
                domain_ids, gen_ids, max_value, max_update = batch
            if rng.random() < args.decoder_teacher_forcing:  # teacher forcing
                teacher = gen_ids
            else:
                teacher = None
            domain_scores, state_scores, gen_scores = model(input_ids=input_ids,
                                                            token_type_ids=segment_ids,
                                                            state_positions=state_position_ids,
                                                            attention_mask=input_mask,
                                                            max_value=max_value,
                                                            op_ids=op_ids,
                                                            max_update=max_update,
                                                            teacher=teacher)
            loss_s = loss_fnc(state_scores.view(-1, len(op2id)), op_ids.view(-1))
            loss_g = masked_cross_entropy_for_value(gen_scores.contiguous(),
                                                    gen_ids.contiguous(),
                                                    tokenizer.vocab['[PAD]'])
            loss = loss_s + loss_g
            if args.exclude_domain is not True:
                loss_d = loss_fnc(domain_scores.view(-1, len(domain2id)), domain_ids.view(-1))
                loss = loss + loss_d
            batch_loss.append(loss.item())

            loss.backward()
            enc_optimizer.step()
            enc_scheduler.step()
            dec_optimizer.step()
            dec_scheduler.step()
            model.zero_grad()

            if step % 100 == 0:
                if args.exclude_domain is not True:
                    print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f"
                          % (epoch + 1, args.n_epochs, step, len(train_dataloader),
                             np.mean(batch_loss), loss_s.item(), loss_g.item(), loss_d.item()))
                else:
                    print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f"
                          % (epoch + 1, args.n_epochs, step, len(train_dataloader),
                             np.mean(batch_loss), loss_s.item(), loss_g.item()))
                batch_loss = []

        if (epoch + 1) % args.eval_epoch == 0:
            eval_res, res_per_domain, pred = model_evaluation(model, dev_data_raw, tokenizer, slot_meta,
                                                              epoch + 1, args.op_code)
            if eval_res['joint_acc_score'] > best_score['joint_acc_score']:
                best_score['joint_acc_score'] = eval_res['joint_acc_score']
                best_score['epoch'] = epoch + 1  # record the best epoch (was never set in the original)
                model_to_save = model.module if hasattr(model, 'module') else model
                save_path = os.path.join(args.out_dir, args.filename + '.bin')
                torch.save(model_to_save.state_dict(), save_path)
            print("Best Score : ", best_score)
            print("\n")

    print("Test using best model...")
    best_epoch = best_score['epoch']
    ckpt_path = os.path.join(args.out_dir, args.filename + '.bin')
    model = SomDST(model_config, len(op2id), len(domain2id), op2id['update'], args.exclude_domain)
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)
    model.to(device)

    eval_res, res_per_domain, pred = model_evaluation(model, test_data_raw, tokenizer, slot_meta,
                                                      best_epoch, args.op_code)
    # save to file
    save_result_to_file(args.out_dir + "/" + args.filename + ".res", eval_res, res_per_domain)
    json.dump(pred, open('%s.pred' % (args.out_dir + "/" + args.filename), 'w'))
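# --- Added sketch (not from the original script) ---
# `masked_cross_entropy_for_value` above averages the negative log-likelihood of
# the generated value tokens while ignoring [PAD] positions. A sketch of one
# common implementation, assuming `logits` already holds probabilities (as in
# SOM-DST's copy-augmented decoder); the project's own helper may differ.
import torch

def masked_cross_entropy_for_value(logits, target, pad_idx=0):
    mask = target.ne(pad_idx)                                    # True where a real token exists
    log_probs_flat = torch.log(logits.view(-1, logits.size(-1)))
    target_flat = target.view(-1, 1)
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    losses = losses_flat.view(*target.size()) * mask.float()     # zero out [PAD] positions
    return losses.sum() / mask.float().sum()                     # mean over non-pad tokens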
def main(): EPOCHS = 500 EARLY_STOP_EPOCHS = 30 SPLIT_WORDS = 'first' # Init Train Dataset posdataset = POSTrainDataset(data_dir, unk_chance=0) loader = DataLoader(posdataset) # Criterion to for loss (weighted) weighted_loss = torch.ones(len(posdataset.ttoi)) for key in posdataset.entities: weighted_loss[posdataset.ttoi[key]] = posdataset.entities[key] weighted_loss = weighted_loss.to(device) criterion = nn.CrossEntropyLoss(weight=weighted_loss) if model_choice == 0: # Hyper Parameters HIDDEN_DIM = 1024 N_LAYERS = 1 LEARNING_RATE = 5e-5 ADAMEPS = 1e-8 SCHEDULER_GAMMA = 0.95 model = BertLSTM(HIDDEN_DIM, N_LAYERS, len(posdataset.ttoi)) # No grad Bert layer for p in model.model.parameters(): p.requires_grad = False model.to(device) optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=ADAMEPS) scheduler = optim.lr_scheduler.ExponentialLR(optimizer, SCHEDULER_GAMMA) # Implement Early stop early_stop = 0 # Model training and eval best_loss = sys.maxsize train_losses = [] eval_losses = [] eval_recall = [] eval_precision = [] for epoch in range(1, EPOCHS + 1): scheduler.step() # Toggle Train set posdataset.train = True trainloss = train(loader, model, optimizer, criterion, device, SPLIT_WORDS) train_losses.append(trainloss) # Toggle Validation Set posdataset.train = False loss, accuracy, precision, recall = eval(loader, model, criterion, device, SPLIT_WORDS) eval_losses.append(loss) eval_recall.append(recall) eval_precision.append(precision) print( 'Epoch {}, Training Loss: {}, Evaluation Loss: {}, Evaluation Accuracy: {}, Evaluation Precision: {}, Evaluation Recall: {}' .format(epoch, trainloss, loss, accuracy, precision, recall)) # Check if current loss is better than previous if loss < best_loss: best_loss = loss torch.save( model, output_model_dir / '{}.pt'.format(models_set[model_choice])) early_stop = 0 # If loss has stagnated, early stop else: early_stop += 1 if early_stop >= EARLY_STOP_EPOCHS: print('Early Stopping') break # Plot respective graphs for visualisation plt.figure() plt.title('{} Model Training'.format(models_set[model_choice])) plt.xlabel('Epoch') plt.ylabel('Loss') plt.plot(train_losses) plt.savefig('{}Training.png'.format(models_set[model_choice])) plt.figure() plt.title('{} Model Evaluation'.format(models_set[model_choice])) plt.xlabel('Epoch') plt.ylabel('Loss') plt.plot(eval_losses) plt.savefig('{}EvalLoss.png'.format(models_set[model_choice])) plt.figure() plt.title('{} Model Evaluation'.format(models_set[model_choice])) plt.xlabel('Epoch') plt.ylabel('Precision') plt.plot(eval_precision) plt.savefig('{}EvalPrec.png'.format(models_set[model_choice])) plt.figure() plt.title('{} Model Evaluation'.format(models_set[model_choice])) plt.xlabel('Epoch') plt.ylabel('Recall') plt.plot(eval_recall) plt.savefig('{}EvalRecall.png'.format(models_set[model_choice]))
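# --- Added sketch (not from the original script) ---
# The loop above tracks the best eval loss and a stagnation counter inline; the
# same logic as a small reusable helper, for clarity. Names are illustrative.
class EarlyStopper:
    def __init__(self, patience=30):
        self.patience = patience
        self.best = float('inf')
        self.bad_epochs = 0

    def step(self, loss):
        """Return True when training should stop (no improvement for `patience` epochs)."""
        if loss < self.best:
            self.best = loss
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1
        return self.bad_epochs >= self.patience

# usage: stopper = EarlyStopper(EARLY_STOP_EPOCHS)
#        if stopper.step(eval_loss): break  # save the model whenever bad_epochs resets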
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name') parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument('--train_dataset', type=str, default='') parser.add_argument('--eval_dataset', type=str, default='') parser.add_argument('--seed', type=int, default=42) parser.add_argument('--num_train_epochs', type=int, default=3) parser.add_argument('--train_batch_size', type=int, default=8) parser.add_argument('--eval_batch_size', type=int, default=16) parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument('--max_grad_norm', type=int, default=1) parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training \ steps to perform. Override num_train_epochs.") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before\ performing a backward/update pass.") parser.add_argument('--learning_rate', type=float, default=6.25e-5) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument('--lr_schedule', type=str, default='warmup_linear') parser.add_argument('--weight_decay', type=float, default=0.01) parser.add_argument('--lm_coef', type=float, default=0.9) parser.add_argument('--n_valid', type=int, default=374) parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}".format(device, n_gpu)) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Load tokenizer and model # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned on the RocStories dataset special_tokens = ['_start_', '_delimiter_', '_classify_'] tokenizer = OpenAIGPTTokenizer.from_pretrained( args.model_name, special_tokens=special_tokens) special_tokens_ids = list( tokenizer.convert_tokens_to_ids(token) for token in special_tokens) model = OpenAIGPTDoubleHeadsModel.from_pretrained( args.model_name, num_special_tokens=len(special_tokens)) model.to(device) # Load and encode the datasets if not args.train_dataset and not args.eval_dataset: roc_stories = cached_path(ROCSTORIES_URL) def tokenize_and_encode(obj): """ Tokenize and encode a nested object """ if isinstance(obj, str): return 
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
                       for dataset in encoded_datasets
                       for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                # step the optimizer before the scheduler (required since PyTorch 1.1)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None \
                    else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss, scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save the trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load the trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids, lm_labels, mc_labels)
            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
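# --- Added sketch (not from the original script) ---
# `accuracy` is defined elsewhere; since eval_accuracy above is normalized by
# nb_eval_examples, it must return a *count* of correct predictions rather than
# a rate. A sketch of the conventional definition:
import numpy as np

def accuracy(out, labels):
    """Count how many argmax predictions match the gold labels."""
    return np.sum(np.argmax(out, axis=1) == labels)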
        1e-8
    }, {
        'params': [param for name, param in model.named_parameters()
                   if not any(identifier in name for identifier in bert_identifiers)],
        'lr': custom_learning_rate,
        'betas': (0.9, 0.999),
        'weight_decay': 0.0,
        'eps': 1e-8
    }]
    # Define optimizer
    optimizer = AdamW(grouped_model_parameters)
else:
    # Define optimizer
    optimizer = optim.Adam(params=model.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-8)

# Place model & loss function on GPU
model, criterion = model.to(DEVICE), criterion.to(DEVICE)

# Start actual training, check test loss after each epoch
best_test_loss = float('inf')
for epoch in range(NUM_EPOCHS):
    print("EPOCH NO: %d" % (epoch + 1))
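# --- Added sketch (not from the original script) ---
# The grouped-parameters fragment above relies on PyTorch's per-group overrides:
# any key in a group dict ('lr', 'betas', 'weight_decay', 'eps') replaces the
# optimizer-wide default for that group. A self-contained example with an
# illustrative two-layer model (not the model used above):
import torch.nn as nn
from torch.optim import AdamW

toy = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 2))
grouped = [
    {'params': toy[0].parameters(), 'lr': 2e-5, 'weight_decay': 0.01},  # e.g. pretrained encoder
    {'params': toy[1].parameters(), 'lr': 1e-3, 'weight_decay': 0.0},   # e.g. fresh task head
]
optimizer = AdamW(grouped, betas=(0.9, 0.999), eps=1e-8)  # defaults for keys a group omits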
def train(args, train_dataset, model_vae, encoder_tokenizer, decoder_tokenizer, table_name): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) # model_encoder, model_decoder, model_connector = model_vae.encoder, model_vae.decoder, model_vae.linear no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model_vae.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model_vae.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model_vae, optimizer = amp.initialize(model_vae, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model_vae = torch.nn.DataParallel(model_vae, device_ids=range(args.n_gpu)).to(args.device) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model_vae = torch.nn.parallel.DistributedDataParallel(model_vae, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) model_vae = model_vae.module if hasattr(model_vae, 'module') else model_vae # Take care of distributed/parallel training # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model_vae.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) n_iter = int(args.num_train_epochs) * len(train_dataloader) beta_t_list = frange_cycle_zero_linear(n_iter, start=0.0, stop=args.beta, n_cycle=1, ratio_increase=args.ratio_increase, ratio_zero=args.ratio_zero) tmp_list = [] set_seed(args) # Added here for reproducibility (even between python 2 and 3) for epoch in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): tokenized_text0, tokenized_text1, tokenized_text_lengths = batch # tokenized_text0 = tokenized_text0.to(args.device) # tokenized_text1 = tokenized_text1.to(args.device) # prepare input-output data for reconstruction # pdb.set_trace() max_len_values, _ = tokenized_text_lengths.max(0) tokenized_text0 = tokenized_text0[:,:max_len_values[0]] tokenized_text1 = tokenized_text1[:,:max_len_values[1]] inputs, labels = mask_tokens(tokenized_text0, encoder_tokenizer, args) if args.mlm else (tokenized_text0, tokenized_text1) labels = tokenized_text1 tokenized_text1 = tokenized_text1.to(args.device) inputs = inputs.to(args.device) labels = labels.to(args.device) model_vae.train() beta_t = beta_t_list[step + epoch*len(epoch_iterator)] model_vae.args.beta = beta_t if beta_t == 0.0: model_vae.args.fb_mode = 0 else: model_vae.args.fb_mode = 1 if args.use_deterministic_connect: model_vae.args.fb_mode = 2 loss_rec, loss_kl, loss = model_vae(inputs, labels) # pdb.set_trace() # Chunyuan: loss_rec size is [4], while latent_z size is [12] if args.n_gpu > 1: loss_rec = loss_rec.mean() # mean() to average on multi-gpu parallel training loss_kl = loss_kl.mean() loss = loss.mean() if args.use_philly: print("PROGRESS: {}%".format(round(100 * (step + epoch*len(epoch_iterator) ) /(int(args.num_train_epochs) * len(epoch_iterator)) , 4))) print("EVALERR: {}%".format(loss_rec)) epoch_iterator.set_description( ( f'iter: {step + epoch*len(epoch_iterator) }; loss: {loss.item():.3f}; ' f'loss_rec: {loss_rec.item():.3f}; loss_kl: {loss_kl.item():.3f}; ' f'beta: {model_vae.args.beta:.3f}' ) ) if global_step % 5 == 0: row = { 'PartitionKey': 'MILU_Rule_Rule_Template', 'RowKey': str(datetime.now()), 'ExpName' : args.ExpName, 'iter': str( step + epoch*len(epoch_iterator) ), 'loss': str( loss.item()), 'loss_rec': str(loss_rec.item()), 'loss_kl': str(loss_kl.item()), 'beta': str(model_vae.args.beta) } # pdb.set_trace() ts.insert_entity(table_name, row) # pdb.set_trace() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model_vae.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model_vae.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and 
global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model_vae, encoder_tokenizer, decoder_tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save encoder model checkpoint output_encoder_dir = os.path.join(args.output_dir, 'checkpoint-encoder-{}'.format(global_step)) if not os.path.exists(output_encoder_dir): os.makedirs(output_encoder_dir) model_encoder_to_save = model_vae.module.encoder if hasattr(model_vae, 'module') else model_vae.encoder # Take care of distributed/parallel training if args.use_philly: save_solid = False while not save_solid: try: model_encoder_to_save.save_pretrained(output_encoder_dir) torch.save(args, os.path.join(output_encoder_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_encoder_dir) save_solid = True except: pass else: model_encoder_to_save.save_pretrained(output_encoder_dir) torch.save(args, os.path.join(output_encoder_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_encoder_dir) # Save decoder model checkpoint output_decoder_dir = os.path.join(args.output_dir, 'checkpoint-decoder-{}'.format(global_step)) if not os.path.exists(output_decoder_dir): os.makedirs(output_decoder_dir) model_decoder_to_save = model_vae.module.decoder if hasattr(model_vae, 'module') else model_vae.decoder # Take care of distributed/parallel training if args.use_philly: save_solid = False while not save_solid: try: model_decoder_to_save.save_pretrained(output_decoder_dir) torch.save(args, os.path.join(output_decoder_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_decoder_dir) save_solid = True except: pass else: model_decoder_to_save.save_pretrained(output_decoder_dir) torch.save(args, os.path.join(output_decoder_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_decoder_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
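# --- Added sketch (not from the original script) ---
# `frange_cycle_zero_linear` above produces the cyclical KL-annealing schedule
# for beta: within each cycle, beta stays at `start` for `ratio_zero` of the
# cycle, rises linearly to `stop` over `ratio_increase`, then plateaus. A sketch
# consistent with how it is called above; the original helper may differ.
import numpy as np

def frange_cycle_zero_linear(n_iter, start=0.0, stop=1.0, n_cycle=4,
                             ratio_increase=0.5, ratio_zero=0.3):
    L = np.ones(n_iter) * stop
    period = n_iter / n_cycle
    step = (stop - start) / (period * ratio_increase)  # slope of the linear ramp
    for c in range(n_cycle):
        v, i = start, 0
        while v <= stop and (int(i + c * period) < n_iter):
            if i < period * ratio_zero:
                L[int(i + c * period)] = start   # zero phase
            else:
                L[int(i + c * period)] = v       # ramp phase
                v += step
            i += 1
    return L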
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file", default=None, type=str)
    parser.add_argument("--eval_file", default=None, type=str)
    parser.add_argument("--test_file", default=None, type=str)
    parser.add_argument("--inference_file", default=None, type=str)
    parser.add_argument("--model_name_or_path", default=None, type=str)
    parser.add_argument("--output_dir", default=None, type=str)

    ## other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=256, type=int)
    parser.add_argument("--do_train", default=False, type=boolean_string)
    parser.add_argument("--do_eval", default=False, type=boolean_string)
    parser.add_argument("--do_test", default=False, type=boolean_string)
    parser.add_argument("--resume", default=False, type=boolean_string)
    parser.add_argument("--do_inference", default=False, type=boolean_string)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument("--eval_batch_size", default=8, type=int)
    parser.add_argument("--learning_rate", default=3e-5, type=float)
    parser.add_argument("--num_train_epochs", default=10, type=float)
    parser.add_argument("--warmup_proportion", default=0.1, type=float)
    parser.add_argument("--use_weight", default=1, type=int)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--seed", type=int, default=2019)
    parser.add_argument("--fp16", default=False)
    parser.add_argument("--loss_scale", type=float, default=0)
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument("--warmup_steps", default=0, type=int)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--max_steps", default=-1, type=int)
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument("--logging_steps", default=500, type=int)
    parser.add_argument("--clean", default=False, type=boolean_string, help="clean the output dir")
    parser.add_argument("--need_birnn", default=False, type=boolean_string)
    parser.add_argument("--rnn_dim", default=128, type=int)

    args = parser.parse_args()

    device = torch.device("cuda")
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_
    args.device = device
    n_gpu = torch.cuda.device_count()
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger.info(f"device: {device} n_gpu: {n_gpu}")

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # now_time = datetime.datetime.now().strftime('%Y-%m-%d_%H')
    # tmp_dir = args.output_dir + '/' + str(now_time) + '_ernie'
    # if not os.path.exists(tmp_dir):
    #     os.makedirs(tmp_dir)
    # args.output_dir = tmp_dir

    if args.clean and args.do_train and not args.resume:
        # logger.info("cleaning up")
        if os.path.exists(args.output_dir):
            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    print(c_path)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                        os.rmdir(c_path)
                    else:
                        os.remove(c_path)
            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.resume:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if not os.path.exists(os.path.join(args.output_dir, "eval")):
        os.makedirs(os.path.join(args.output_dir, "eval"))

    writer = SummaryWriter(logdir=os.path.join(args.output_dir, "eval"), comment="Linear")

    processor = NerProcessor()
    label_list = processor.get_labels(args)
    num_labels = len(label_list)
    args.label_list = label_list

    if os.path.exists(os.path.join(args.output_dir, "label2id.pkl")):
        with open(os.path.join(args.output_dir, "label2id.pkl"), "rb") as f:
            label2id = pickle.load(f)
    else:
        label2id = {l: i for i, l in enumerate(label_list)}
        with open(os.path.join(args.output_dir, "label2id.pkl"), "wb") as f:
            pickle.dump(label2id, f)
    id2label = {value: key for key, value in label2id.items()}

    # Prepare optimizer and schedule (linear warmup and decay)
    if args.do_train:
        if args.resume:
            tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
            config = BertConfig.from_pretrained(args.output_dir, num_labels=num_labels)
            model = BERT_BiLSTM_CRF.from_pretrained(args.output_dir, config=config,
                                                    need_birnn=args.need_birnn, rnn_dim=args.rnn_dim)
        else:
            tokenizer = BertTokenizer.from_pretrained(
                args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
                do_lower_case=args.do_lower_case)
            config = BertConfig.from_pretrained(
                args.config_name if args.config_name else args.model_name_or_path,
                num_labels=num_labels)
            model = BERT_BiLSTM_CRF.from_pretrained(
                args.cache_dir if args.cache_dir else args.model_name_or_path,
                config=config, need_birnn=args.need_birnn, rnn_dim=args.rnn_dim)
        model.to(device)

        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        train_examples, train_features, train_data = get_Dataset(args, processor, tokenizer, mode="train")
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        if args.do_eval:
            eval_examples, eval_features, eval_data = get_Dataset(args, processor, tokenizer, mode="eval")

        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                    num_training_steps=t_total)

        # Train!
logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_data)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Total optimization steps = %d", t_total) model.train() global_step = 0 tr_loss, logging_loss = 0.0, 0.0 best_f1 = 0.0 for ep in trange(int(args.num_train_epochs), desc="Epoch"): model.train() for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch outputs = model(input_ids, label_ids, segment_ids, input_mask) loss = outputs if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: tr_loss_avg = (tr_loss-logging_loss)/args.logging_steps writer.add_scalar("Train/loss", tr_loss_avg, global_step) logging_loss = tr_loss if args.do_eval: all_ori_tokens_eval = [f.ori_tokens for f in eval_features] overall, by_type = evaluate(args, eval_data, model, id2label, all_ori_tokens_eval) # add eval result to tensorboard f1_score = overall.fscore writer.add_scalar("Eval/precision", overall.prec, ep) writer.add_scalar("Eval/recall", overall.rec, ep) writer.add_scalar("Eval/f1_score", overall.fscore, ep) # save the best performs model if f1_score >= best_f1: logger.info(f"----------the best f1 is {f1_score}---------") best_f1 = f1_score model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) # logger.info(f'epoch {ep}, train loss: {tr_loss}') # writer.add_graph(model) writer.close() # model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training # model_to_save.save_pretrained(args.output_dir) # tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model # torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) if args.do_test: # model = BertForTokenClassification.from_pretrained(args.output_dir) # model.to(device) label_map = {i : label for i, label in enumerate(label_list)} tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) #args = torch.load(os.path.join(args.output_dir, 'training_args.bin')) model = BERT_BiLSTM_CRF.from_pretrained(args.output_dir, need_birnn=args.need_birnn, rnn_dim=args.rnn_dim) model.to(device) test_examples, test_features, test_data = get_Pred_Dataset(args, processor, tokenizer, mode="test") logger.info("***** Running test *****") logger.info(f" Num examples = {len(test_examples)}") logger.info(f" Batch size = {args.eval_batch_size}") all_ori_tokens = [f.ori_tokens for f in test_features] all_ori_labels = [e.label.split(" ") for e in test_examples] test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) model.eval() pred_labels = [] for b_i, (input_ids, input_mask, segment_ids, label_ids) in enumerate(tqdm(test_dataloader, 
desc="Predicting")): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model.predict(input_ids, segment_ids, input_mask) # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) # logits = logits.detach().cpu().numpy() for l in logits: pred_label = [] for idx in l: pred_label.append(id2label[idx]) pred_labels.append(pred_label) assert len(pred_labels) == len(all_ori_tokens) == len(all_ori_labels) print(len(pred_labels)) with open(os.path.join(args.output_dir, "token_labels_.txt"), "w", encoding="utf-8") as f: for ori_tokens, ori_labels,prel in zip(all_ori_tokens, all_ori_labels, pred_labels): for ot,ol,pl in zip(ori_tokens, ori_labels, prel): if ot in ["[CLS]", "[SEP]"]: continue else: f.write(f"{ot} {ol} {pl}\n") f.write("\n") if args.do_inference: label_map = {i : label for i, label in enumerate(label_list)} tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) #args = torch.load(os.path.join(args.output_dir, 'training_args.bin')) model = BERT_BiLSTM_CRF.from_pretrained(args.output_dir, need_birnn=args.need_birnn, rnn_dim=args.rnn_dim) model.to(device) test_examples, test_features, test_data = get_Pred_Dataset(args, processor, tokenizer, mode="inference") logger.info("***** Running test *****") logger.info(f" Num examples = {len(test_examples)}") logger.info(f" Batch size = {args.eval_batch_size}") all_ori_tokens = [f.ori_tokens for f in test_features] all_ori_labels = [e.label.split(" ") for e in test_examples] test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) model.eval() pred_labels = [] for b_i, (input_ids, input_mask, segment_ids, label_ids) in enumerate(tqdm(test_dataloader, desc="Predicting")): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model.predict(input_ids, segment_ids, input_mask) # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) # logits = logits.detach().cpu().numpy() for l in logits: pred_label = [] for idx in l: pred_label.append(id2label[idx]) pred_labels.append(pred_label) assert len(pred_labels) == len(all_ori_tokens) == len(all_ori_labels) print(len(pred_labels)) with open(os.path.join(args.output_dir, "token_labels_.txt"), "w", encoding="utf-8") as f: for ori_tokens, ori_labels,prel in zip(all_ori_tokens, all_ori_labels, pred_labels): for ot,ol,pl in zip(ori_tokens, ori_labels, prel): if ot in ["[CLS]", "[SEP]"]: continue else: f.write(f"{ot} {ol} {pl}\n") f.write("\n")
def run_train(args): # --------- data processor = XlnetProcessor(vocab_path=str(config['xlnet_vocab_path']), do_lower_case=args.do_lower_case) label_list = processor.get_labels() label2id = {label: i for i, label in enumerate(label_list)} id2label = {i: label for i, label in enumerate(label_list)} train_data = processor.get_train(config['data_dir'] / f"{args.data_name}.train.pkl") train_examples = processor.create_examples(lines=train_data, example_type='train', cached_examples_file=config[ 'data_dir'] / f"cached_train_examples_{args.arch}") train_features = processor.create_features(examples=train_examples, max_seq_len=args.train_max_seq_len, cached_features_file=config[ 'data_dir'] / "cached_train_features_{}_{}".format( args.train_max_seq_len, args.arch )) train_dataset = processor.create_dataset(train_features, is_sorted=args.sorted) if args.sorted: train_sampler = SequentialSampler(train_dataset) else: train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) valid_data = processor.get_dev(config['data_dir'] / f"{args.data_name}.valid.pkl") valid_examples = processor.create_examples(lines=valid_data, example_type='valid', cached_examples_file=config[ 'data_dir'] / f"cached_valid_examples_{args.arch}") valid_features = processor.create_features(examples=valid_examples, max_seq_len=args.eval_max_seq_len, cached_features_file=config[ 'data_dir'] / "cached_valid_features_{}_{}".format( args.eval_max_seq_len, args.arch )) valid_dataset = processor.create_dataset(valid_features) valid_sampler = SequentialSampler(valid_dataset) valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.eval_batch_size) # ------- model logger.info("initializing model") if args.resume_path: args.resume_path = Path(args.resume_path) model = XlnetForMultiLable.from_pretrained(args.resume_path, num_labels=len(label_list)) else: model = XlnetForMultiLable.from_pretrained(config['xlnet_model_dir'], num_labels=len(label_list)) t_total = int(len(train_dataloader) / args.gradient_accumulation_steps * args.epochs) # Prepare optimizer and schedule (linear warmup and decay) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] warmup_steps = int(t_total * args.warmup_proportion) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # ---- callbacks logger.info("initializing callbacks") train_monitor = TrainingMonitor(file_dir=config['figure_dir'], arch=args.arch) model_checkpoint = ModelCheckpoint(checkpoint_dir=config['checkpoint_dir'], mode=args.mode, monitor=args.monitor, arch=args.arch, save_best_only=args.save_best) # **************************** training model *********************** logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Num Epochs = %d", args.epochs) logger.info(" Total train 
batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * ( torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) trainer = Trainer(n_gpu=args.n_gpu, model=model, epochs=args.epochs, logger=logger, criterion=BCEWithLogLoss(), optimizer=optimizer, lr_scheduler=lr_scheduler, early_stopping=None, training_monitor=train_monitor, fp16=args.fp16, resume_path=args.resume_path, grad_clip=args.grad_clip, model_checkpoint=model_checkpoint, gradient_accumulation_steps=args.gradient_accumulation_steps, batch_metrics=[AccuracyThresh(thresh=0.5)], epoch_metrics=[AUC(average='micro', task_type='binary'), MultiLabelReport(id2label=id2label)]) trainer.train(train_data=train_dataloader, valid_data=valid_dataloader, seed=args.seed)
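# --- Added sketch (not from the original script) ---
# AccuracyThresh, used as a batch metric above, scores multi-label predictions
# by thresholding sigmoid outputs. A sketch of the idea; the project's metric
# class may differ in interface.
import torch

def accuracy_thresh(logits, targets, thresh=0.5):
    """Element-wise accuracy of thresholded sigmoid outputs for multi-label targets."""
    preds = (torch.sigmoid(logits) > thresh).float()
    return (preds == targets.float()).float().mean().item()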
def main(): parser = argparse.ArgumentParser() parser.add_argument("--model_name", type=str, help="pretrained_model.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_probe", action='store_true', help="Whether to probe the representation we got.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( '--data_dir', type=str, default= '/home/xiongyi/dataxyz/repos/SemSynLSTM/word_language_model/data/wikitext-2/' ) parser.add_argument('--seed', type=int, default=42) parser.add_argument('--num_train_epochs', type=int, default=3) parser.add_argument('--train_batch_size', type=int, default=8) parser.add_argument('--eval_batch_size', type=int, default=16) parser.add_argument('--max_grad_norm', type=int, default=1) parser.add_argument('--learning_rate', type=float, default=6.25e-5) parser.add_argument('--warmup_proportion', type=float, default=0.002) parser.add_argument('--lr_schedule', type=str, default='warmup_linear') parser.add_argument('--weight_decay', type=float, default=0.01) parser.add_argument('--lm_coef', type=float, default=0.9) parser.add_argument('--n_valid', type=int, default=374) timenow = datetime.datetime.now().strftime("%b%d%H%M") model_option = 'adv' outdir = model_option + timenow args = parser.parse_args( ['--output_dir', outdir, '--do_probe', '--num_train_epochs', '50']) #args = parser.parse_args(['--output_dir', './tmp', '--do_eval', '--model_name', 'gpt2']) print(args) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}".format(device, n_gpu)) if not args.do_train and not args.do_eval and not args.do_probe: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Compute the max input length for the Transformer # Todo: Where is this used? 
input_length = 128 data_dir = '../SemSynLSTM/word_language_model/data/wikitext-2/' if args.data_dir is None else args.data_dir train_set, val_set, test_set, dictionary, pos_dictionary = load_tokenize_and_batchify( data_dir, input_length) # Prepare inputs tensors and dataloaders train_data = TensorDataset(*train_set) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32) eval_data = TensorDataset(*val_set) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=32) # TODO: Load tokenizer and model # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned on the RocStories dataset #special_tokens = ['_start_', '_delimiter_'] #special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens) # TODO: Add config config = GPT2Config(n_positions=input_length, n_ctx=input_length, n_layer=6, n_head=8, n_embd=384) config.vocab_size = dictionary.__len__() config.pos_vocab_size = pos_dictionary.__len__() if args.model_name: model = GPT2LMHeadModel.from_pretrained(args.model_name) else: model = GPT2_adverse(config=config) model.to(device) # TODO: Load and encode the datasets logger.info("Encoding dataset...") # Prepare optimizer if args.do_train: all_param = list(model.named_parameters()) param_optimizer = [(n, p) for n, p in all_param if 'pos_head_adv' not in n] param_optimizer_adv = [(n, p) for n, p in all_param if 'pos_head_adv' in n] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer_adv_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer_adv if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer_adv if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] num_train_optimization_steps = len( train_dataloader) * args.num_train_epochs optimizer = AdamW( optimizer_grouped_parameters, lr=args.learning_rate, #max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay) #t_total=num_train_optimization_steps) optimizer_adv = AdamW( optimizer_adv_grouped_parameters, lr=args.learning_rate, #max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay) if args.do_train: train_results = {} nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None for epoch in trange(int(args.num_train_epochs), desc="Epoch"): ###eval on eval set model.eval() nb_eval_steps, nb_eval_examples = 0, 0 perp = 0 average_loss = np.asanyarray([0, 0, 0, 0], dtype='float') for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, input_pos_ids = batch with torch.no_grad(): #breakpoint() loss = model( input_ids, labels=input_ids, pos_ids=input_pos_ids)[0].detach().cpu().numpy() loss_syn = model( input_ids, labels=input_ids, pos_ids=input_pos_ids)[1].detach().cpu().numpy() loss_sem = model( input_ids, labels=input_ids, pos_ids=input_pos_ids)[2].detach().cpu().numpy() loss_lm = model( input_ids, labels=input_ids, pos_ids=input_pos_ids)[3].detach().cpu().numpy() perp_batch = np.exp(loss_lm) perp += perp_batch average_loss += np.asanyarray( [loss, loss_syn, loss_sem, loss_lm]) nb_eval_steps += 1 perp /= nb_eval_steps average_loss /= 
nb_eval_steps print('loss,loss_syn,loss_sem,loss_lm', average_loss, 'perp ', perp, 'epoch ', epoch) train_results[epoch] = (perp, average_loss) model.train() tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, input_pos_ids = batch loss = model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[0] loss_lm = model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[3] loss_sem = model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[2] #breakpoint() #loss = args.lm_coef * losses[0] + losses[1] loss.backward() optimizer.step() optimizer.zero_grad() loss_sem.backward() optimizer_adv.step() optimizer_adv.zero_grad() tr_loss += loss.item() exp_average_loss = loss.item( ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item( ) nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e} sem: {:.2e} lm: {:.2e}".format( exp_average_loss, loss_sem.item(), loss_lm.item()) print(train_results) # Save a trained model if args.do_train: all_param = list(model.named_parameters()) param_optimizer = [(n, p) for n, p in all_param if 'pos_head_adv' not in n] param_optimizer_adv = [(n, p) for n, p in all_param if 'pos_head_adv' in n] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer_adv_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer_adv if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer_adv if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] num_train_optimization_steps = len( train_dataloader) * args.num_train_epochs optimizer = AdamW( optimizer_grouped_parameters, lr=args.learning_rate, #max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay) #t_total=num_train_optimization_steps) optimizer_adv = AdamW( optimizer_adv_grouped_parameters, lr=args.learning_rate, #max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay) if args.do_train: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) #tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = GPT2LMHeadModel.from_pretrained(args.output_dir) #tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir) model.to(device) if args.do_eval: model.eval() nb_eval_steps, nb_eval_examples = 0, 0 log_probs_sum = 0 perp = 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, input_pos_ids = batch with torch.no_grad(): loss = model(input_ids, labels=input_ids)[0].detach().cpu().numpy() perp_batch = np.exp(loss) perp += perp_batch nb_eval_steps += 1 perp /= nb_eval_steps # perp_word = perp / 128 print(perp) result = {'eval_perp': perp} logger.info("***** Eval results *****") logger.info("'eval_perp' = %s", str(result['eval_perp'])) if args.do_probe: 
##load model (how???) model_path = '/home/xiongyi/dataxyz/repos/pytorch-pretrained-BERT/examples/advJul232307/pytorch_model.bin' model.load_state_dict(torch.load(model_path)) ##Add a mlp to the representation probe_model = ProbeModel(model, config) probe_model.to(device) ##train and eval all_param = list(probe_model.named_parameters()) param_probe = [(n, p) for n, p in all_param if 'probe_cls' in n] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_probe if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_probe if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW( optimizer_grouped_parameters, lr=args.learning_rate, # max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay) # t_total=num_train_optimization_steps) train_results = {} nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None for epoch in trange(int(args.num_train_epochs), desc="Epoch"): ###eval on eval set probe_model.eval() nb_eval_steps, nb_eval_examples = 0, 0 average_loss = 0 average_acc = 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, input_pos_ids = batch with torch.no_grad(): #breakpoint() loss = probe_model( input_ids, labels=input_ids, pos_ids=input_pos_ids)[0].detach().cpu().numpy() pos_logits = probe_model( input_ids, labels=input_ids, pos_ids=input_pos_ids)[1].detach().cpu().numpy() predicted_labels = np.argmax(pos_logits, -1) correct_rate = np.mean(predicted_labels == input_pos_ids. detach().cpu().numpy()[:, 1:]) average_acc += correct_rate average_loss += loss nb_eval_steps += 1 average_loss /= nb_eval_steps ##TODO Hard CODED! average_acc /= nb_eval_steps print('loss', average_loss, ' acc_rate ', average_acc, ' epoch ', epoch) train_results[epoch] = (average_loss, average_acc) probe_model.train() tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, input_pos_ids = batch loss = probe_model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[0] # breakpoint() # loss = args.lm_coef * losses[0] + losses[1] loss.backward() optimizer.step() optimizer.zero_grad() tr_loss += loss.item() exp_average_loss = loss.item( ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item( ) nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e}".format( exp_average_loss)
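# --- Added sketch (not from the original script) ---
# The probe above optimizes only the `probe_cls` parameters on top of a trained
# language model. The generic pattern, with illustrative names: freeze the
# backbone and train a linear classifier on its hidden states.
import torch
import torch.nn as nn

class LinearProbe(nn.Module):
    def __init__(self, backbone, hidden_size, num_tags):
        super().__init__()
        self.backbone = backbone
        for p in self.backbone.parameters():
            p.requires_grad = False              # the representation stays fixed
        self.probe_cls = nn.Linear(hidden_size, num_tags)

    def forward(self, input_ids):
        with torch.no_grad():
            hidden = self.backbone(input_ids)[0]  # (batch, seq, hidden); assumes HF-style outputs
        return self.probe_cls(hidden)             # per-token tag logits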
def train(args, train_dataset, model, tokenizer, label_2test_array):
    """ Train the model """
    num_labels = len(label_2test_array)
    print('num_labels {}'.format(num_labels))

    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank],
            output_device=args.local_rank, find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps
                * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])

    # Track the best loss on the eval set for early stopping.
    eval_loss = np.inf
    last_best = 0
    break_early = False

    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for epoch_counter in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # NOTE: we are not training a masked LM here, so no mask_tokens() call.
            max_len_in_batch = int(torch.max(torch.sum(batch[3], 1)))  # only need max len of AA
            input_ids_aa = batch[1][:, 0:max_len_in_batch].to(args.device)
            input_ids_label = batch[2].to(args.device)  # also pass in SEP
            attention_mask = torch.cat(
                (batch[3][:, 0:max_len_in_batch],
                 torch.ones(input_ids_label.shape, dtype=torch.long)), dim=1).to(args.device)
            labels = batch[0].to(args.device)  # already in batch_size x num_label
            # Prepend zeros so that the AA positions are masked out on the label side.
            labels_mask = torch.cat(
                (torch.zeros(input_ids_aa.shape),
                 torch.ones(input_ids_label.shape)), dim=1).to(args.device)
            # SEP sits at the last position on the label side; if it should be
            # masked, uncomment the next line:
            # labels_mask[:, -1] = 0

            model.train()
            # Signature of the model's forward:
            # forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
            #         position_ids=None, head_mask=None, attention_mask_label=None)
            outputs = model(0,
                            input_ids_aa=input_ids_aa,
                            input_ids_label=input_ids_label,
                            token_type_ids=None,
                            attention_mask=attention_mask,
                            labels=labels,
                            position_ids=None,
                            attention_mask_label=labels_mask)
            loss = outputs[0]  # model outputs are always tuples in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        # End of one epoch.
        print('\n\neval end epoch {}'.format(epoch_counter))

        # To save some time, save only at the end of each epoch.
        output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        torch.save(args, os.path.join(output_dir, 'training_args.bin'))
        logger.info("Saving model checkpoint to %s", output_dir)

        results = evaluate(args, model, tokenizer, label_2test_array, prefix=str(global_step))
        if results['eval_loss'] < eval_loss:
            eval_loss = results['eval_loss']
            last_best = epoch_counter
            break_early = False
            print('\nupdate lowest loss on epoch {}, {}\nreset break_early to False, see break_early variable {}'.format(
                epoch_counter, eval_loss, break_early))
        else:
            if epoch_counter - last_best > 5:  # stop if no improvement for 5 epochs
                break_early = True
                print('epoch {} set break_early to True, see break_early variable {}'.format(
                    epoch_counter, break_early))

        if break_early:
            train_iterator.close()
            print("**** break early ****")
            break

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
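# --- Added example (not from the original scripts) ---
# The epoch loop above stops training once the eval loss has not improved for
# more than 5 epochs. A minimal sketch of that patience-based early stopping,
# isolated from the rest of the loop; `evaluate_fn` and `patience` are
# illustrative stand-ins, not names from this repo.
import numpy as np

def fit_with_early_stopping(num_epochs, evaluate_fn, patience=5):
    best_loss, last_best = np.inf, 0
    for epoch in range(num_epochs):
        # ... one epoch of training would go here ...
        loss = evaluate_fn(epoch)
        if loss < best_loss:
            best_loss, last_best = loss, epoch   # improvement: reset the counter
        elif epoch - last_best > patience:       # no improvement for `patience` epochs
            print("stopping early at epoch", epoch)
            break
    return best_loss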
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=4,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true',
                        help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="",
                        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for the main (resp. auxiliary) process:
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    # Can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed
    # (order is important: distributed should be last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        (lm_loss), (mc_loss), *_ = model(input_ids,
                                         token_type_ids=token_type_ids,
                                         mc_token_ids=mc_token_ids,
                                         mc_labels=mc_labels,
                                         lm_labels=lm_labels)
        loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()
    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # If we don't send labels to the model, it doesn't return losses
            lm_logits, mc_logits, *_ = model(input_ids,
                                             token_type_ids=token_type_ids,
                                             mc_token_ids=mc_token_ids)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
    evaluator = Engine(inference)

    # Attach evaluation to trainer: evaluate at the end of each epoch
    # (and optionally once before training starts)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED,
                                    lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
                           output_transform=lambda x: (x[0][0], x[1][0])),
               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
                    "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints,
    # and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED,
            lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        tb_logger = TensorboardLogger(log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
        # "getattr" takes care of distributed encapsulation
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        # TODO: PR in ignite to have better access to saved file paths (cleaner)
        os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(log_dir, WEIGHTS_NAME))
        tb_logger.close()
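# --- Added example (not from the original scripts) ---
# A toy illustration of the pytorch-ignite pattern the script above relies on:
# an Engine wraps a per-batch update function, metrics and handlers hook into
# its event lifecycle. The data and "model" here are stand-ins, not part of
# the original training script.
import torch
from ignite.engine import Engine, Events
from ignite.metrics import RunningAverage

weights = torch.randn(3, requires_grad=True)
optimizer = torch.optim.SGD([weights], lr=0.1)

def update(engine, batch):
    optimizer.zero_grad()
    loss = ((weights - batch) ** 2).mean()  # dummy quadratic loss
    loss.backward()
    optimizer.step()
    return loss.item()

trainer = Engine(update)
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")

@trainer.on(Events.EPOCH_COMPLETED)
def log_epoch(engine):
    print("epoch", engine.state.epoch, "loss", engine.state.metrics["loss"])

data = [torch.randn(3) for _ in range(16)]
trainer.run(data, max_epochs=3)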
    def train(self):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        # Prepare model
        config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=self.num_labels)
        model = BertForSequenceClassification.from_pretrained(self.model_name_or_path, self.args, config=config)
        model.to(self.device)

        train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader()
        num_train_optimization_steps = self.train_steps

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': self.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps)

        global_step = 0
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", self.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        train_dataloader = cycle(train_dataloader)

        for step in range(num_train_optimization_steps):
            batch = next(train_dataloader)
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids=input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         labels=label_ids)
            tr_loss += loss.item()
            train_loss = round(tr_loss / (nb_tr_steps + 1), 4)
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            loss.backward()
            if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:
                # Step the optimizer before the scheduler; the original code
                # called scheduler.step() first, which skips the first LR value.
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

            if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                inference_labels = []
                gold_labels = []
                inference_logits = []
                scores = []
                questions = [x.text_a for x in eval_examples]
                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", self.eval_batch_size)

                # Run prediction for full data
                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    segment_ids = segment_ids.to(self.device)
                    label_ids = label_ids.to(self.device)
                    with torch.no_grad():
                        tmp_eval_loss = model(input_ids=input_ids,
                                              token_type_ids=segment_ids,
                                              attention_mask=input_mask,
                                              labels=label_ids)
                        logits = model(input_ids=input_ids,
                                       token_type_ids=segment_ids,
                                       attention_mask=input_mask)
                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    inference_labels.append(np.argmax(logits, axis=1))
                    scores.append(logits)
                    gold_labels.append(label_ids)
                    inference_logits.append(logits)
                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

                gold_labels = np.concatenate(gold_labels, 0)
                inference_logits = np.concatenate(inference_logits, 0)
                scores = np.concatenate(scores, 0)
                model.train()
                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = accuracyCQA(inference_logits, gold_labels)
                eval_mrr = compute_MRR_CQA(scores, gold_labels, questions)
                result = {'eval_loss': eval_loss,
                          'eval_F1': eval_accuracy,
                          'eval_MRR': eval_mrr,
                          'global_step': global_step,
                          'loss': train_loss}

                output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    writer.write('*' * 80)
                    writer.write('\n')

                if eval_accuracy > best_acc:
                    print("=" * 80)
                    print("Best F1", eval_accuracy)
                    print("Saving Model......")
                    best_acc = eval_accuracy
                    # Save a trained model
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = os.path.join(self.output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                    print("=" * 80)
                else:
                    print("=" * 80)
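# --- Added example (not from the original scripts) ---
# The evaluation above reports an MRR via the repo's compute_MRR_CQA helper,
# whose source is not shown here. A plausible, illustrative mean-reciprocal-rank
# computation under the assumption that candidates sharing the same question
# string form one ranking group; this is a sketch, not the repo's implementation.
import numpy as np
from collections import defaultdict

def mean_reciprocal_rank(scores, gold_labels, questions, positive_label=1):
    groups = defaultdict(list)
    for score, gold, q in zip(scores, gold_labels, questions):
        groups[q].append((score[positive_label], gold))  # score of the "relevant" class
    rr = []
    for pairs in groups.values():
        ranked = sorted(pairs, key=lambda p: -p[0])      # best-scored candidate first
        for rank, (_, gold) in enumerate(ranked, start=1):
            if gold == positive_label:                   # first relevant candidate
                rr.append(1.0 / rank)
                break
        else:
            rr.append(0.0)                               # no relevant candidate in this group
    return float(np.mean(rr))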
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--ernie_model", default="bert-base-cased", type=str,
                        help="Ernie pre-trained model")
    parser.add_argument("--embedding", default="wikipedia2vec-base-cased", type=str,
                        help="Entity embeddings")
    parser.add_argument("--mapper", default="wikipedia2vec-base-cased.bert-base-cased.linear", type=str,
                        help="Mapper from entity embeddings to the model's input space")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=256, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--ent", required=True, choices=("none", "concat", "replace"),
                        help="How to use entity embeddings.")
    parser.add_argument("--do_eval", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test", default=False, action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--do_lower_case", default=False, action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=4, type=int,
                        help="Total batch size for evaluation.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=10.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--threshold', type=float, default=.3)

    args = parser.parse_args()

    processors = FewrelProcessor
    num_labels_task = 80

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval and not args.do_test:
        raise ValueError("At least one of `do_train`, `do_eval` or `do_test` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    processor = processors()
    num_labels = num_labels_task
    label_list = None

    embedding = MappedEmbedding(load_embedding(args.embedding), load_mapper(args.mapper))
    model_embedding = load_embedding(args.ernie_model)
    tokenizer = BertTokenizer.from_pretrained(args.ernie_model)

    train_examples, label_list = processor.get_train_examples(args.data_dir)
    label_list = sorted(label_list, key=lambda x: int(x[1:]))
    num_train_steps = int(len(train_examples) / args.train_batch_size
                          / args.gradient_accumulation_steps * args.num_train_epochs)

    model = EmbInputBertForSequenceClassification.from_pretrained(
        args.ernie_model, num_labels=num_labels, output_attentions=True)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_proportion * t_total,
                                     t_total=t_total)

    global_step = 0
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    patterns = ["# {name} #", "$ {name} $"]

    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer,
                                                      args.threshold, patterns=patterns,
                                                      ent_type=args.ent, embedding=embedding)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        output_loss_file = os.path.join(args.output_dir, "loss")
        loss_fout = open(output_loss_file, 'w')
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                # Replace token ids with pre-computed input vectors: entity tokens
                # come from the mapped entity embedding, all others from the
                # model's own embedding table.
                input_words = tokenizer.convert_ids_to_tokens(batch[0].cpu().numpy().flatten())
                input_vecs = [embedding[w] if w.startswith("ENTITY/") else model_embedding[w]
                              for w in input_words]
                input_vecs = np.array(input_vecs).reshape(batch[0].shape + (-1,))
                batch[0] = torch.tensor(input_vecs)
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids,
                             attention_mask=input_mask,
                             token_type_ids=segment_ids,
                             labels=label_ids)[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                loss_fout.write("{}\n".format(loss.item()))
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

            # Save a checkpoint at the end of every epoch.
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file_step = os.path.join(args.output_dir, "pytorch_model.bin_{}".format(global_step))
            torch.save(model_to_save.state_dict(), output_model_file_step)

        # Save the final trained model (only the model itself).
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_eval or args.do_test:
        del model
        output_model_files = [f for f in os.listdir(args.output_dir)
                              if f.startswith("pytorch_model.bin")]
        for output_model_file in output_model_files:
            model = EmbInputBertForSequenceClassification.from_pretrained(
                args.ernie_model, num_labels=num_labels)
            model.load_state_dict(torch.load(os.path.join(args.output_dir, output_model_file)))
            if args.fp16:
                model.half()
            model.to(device)
            model.eval()

            dsets = []
            if args.do_eval:
                dsets.append((processor.get_dev_examples, "eval"))
            if args.do_test:
                dsets.append((processor.get_test_examples, "test"))

            for dset_func, dset_name in dsets:
                features = convert_examples_to_features(dset_func(args.data_dir), label_list,
                                                        args.max_seq_length, tokenizer,
                                                        args.threshold, patterns=patterns,
                                                        ent_type=args.ent, embedding=embedding)
                step = output_model_file.replace("pytorch_model.bin", "")
                fpred = open(os.path.join(args.output_dir, dset_name + f"_pred{step}.txt"), "w")
                fgold = open(os.path.join(args.output_dir, dset_name + f"_gold{step}.txt"), "w")
                fwords = open(os.path.join(args.output_dir, dset_name + f"_words{step}.txt"), "w")

                all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
                all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
                all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
                all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
                data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
                dataloader = DataLoader(data, sampler=None, batch_size=args.eval_batch_size)

                acc = []
                all_probs = []
                for step, batch in enumerate(tqdm(dataloader,
                                                  desc="Evaluation {} {}".format(output_model_file, dset_name))):
                    input_words = tokenizer.convert_ids_to_tokens(batch[0].cpu().numpy().flatten())
                    input_vecs = [embedding[w] if w.startswith("ENTITY/") else model_embedding[w]
                                  for w in input_words]
                    input_vecs = np.array(input_vecs).reshape(batch[0].shape + (-1,))
                    batch[0] = torch.tensor(input_vecs)
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids = batch
                    logits = model(input_ids,
                                   attention_mask=input_mask,
                                   token_type_ids=segment_ids)[0]
                    prob = torch.softmax(logits, -1)
                    all_probs.append(prob.detach().cpu().numpy())
                    predictions = prob.argmax(-1)
                    for a, b in zip(predictions, label_ids):
                        fgold.write("{}\n".format(label_list[b]))
                        fpred.write("{}\n".format(label_list[a]))
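# --- Added example (not from the original scripts) ---
# Minimal sketch of the id-to-vector substitution done in the loops above:
# tokens that name entities are looked up in one embedding table, everything
# else in the model's own table, and the model receives vectors rather than
# ids. The two dict-backed tables here are toy stand-ins for the outputs of
# MappedEmbedding / load_embedding.
import numpy as np
import torch

dim = 4
entity_emb = {"ENTITY/Paris": np.ones(dim, dtype=np.float32)}
word_emb = {"the": np.zeros(dim, dtype=np.float32),
            "capital": np.full(dim, 0.5, dtype=np.float32)}

def ids_to_vectors(token_ids, id_to_token):
    tokens = [id_to_token[i] for i in token_ids.flatten().tolist()]
    vecs = [entity_emb[t] if t.startswith("ENTITY/") else word_emb[t] for t in tokens]
    # Reshape back to (batch, seq, dim) so it can replace the input-id tensor.
    return torch.tensor(np.array(vecs).reshape(tuple(token_ids.shape) + (dim,)))

id_to_token = {0: "the", 1: "capital", 2: "ENTITY/Paris"}
batch_ids = torch.tensor([[0, 1, 2]])
print(ids_to_vectors(batch_ids, id_to_token).shape)  # torch.Size([1, 3, 4])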