def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        random_number = np.random.randint(10000)
        tb_writer = SummaryWriter(log_dir='./imdb_runs/bert_' + str(random_number))

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_dataloader = sample_loader(train_dataset,
                                     batch_size=args.train_batch_size,
                                     k=args.k,
                                     n_classes=2,
                                     seed=args.seed,
                                     pos_sampling_ratio=args.pos_sampling_ratio)
    # train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    # train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // len(train_dataloader) + 1
    else:
        t_total = len(train_dataloader) * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    if args.fp16:
        # The feature/classifier optimizers are rebuilt inside the training loop, so
        # there is no single optimizer to hand to apex amp here; fp16 is unsupported.
        raise NotImplementedError("fp16 training is not supported by this training loop.")

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    N = 14  # for bert-base: embeddings + 12 layers + classifier
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            if global_step % args.layer_step == 0:
                step = global_step // args.layer_step  # NOTE: deliberately shadows the enumerate `step`
                layer_sep = max(min(int(N - (step % N + 1)), N - 1), 1)  # 1, ..., 13
                # update layer separating point
                layer_list_feature = ['.' + str(i) + '.' for i in range(layer_sep)]
                optimizer_grouped_parameters_feature = [{
                    'params': [p for (n, p) in model.named_parameters()
                               if any(l in n for l in ['embeddings'] + layer_list_feature)
                               and not any(nd in n for nd in no_decay)],
                    'weight_decay': args.weight_decay
                }, {
                    'params': [p for (n, p) in model.named_parameters()
                               if any(l in n for l in ['embeddings'] + layer_list_feature)
                               and any(nd in n for nd in no_decay)],
                    'weight_decay': 0.0
                }]
                optimizer_grouped_parameters_classifier = [{
                    'params': [p for (n, p) in model.named_parameters()
                               if not any(l in n for l in ['embeddings'] + layer_list_feature)
                               and not any(nd in n for nd in no_decay)],
                    'weight_decay': args.weight_decay
                }, {
                    'params': [p for (n, p) in model.named_parameters()
                               if not any(l in n for l in ['embeddings'] + layer_list_feature)
                               and any(nd in n for nd in no_decay)],
                    'weight_decay': 0.0
                }]
                optimizer_feature = AdamW(optimizer_grouped_parameters_feature,
                                          lr=args.learning_rate, eps=args.adam_epsilon)
                optimizer_classifier = AdamW(optimizer_grouped_parameters_classifier,
                                             lr=args.learning_rate, eps=args.adam_epsilon)
                # scheduler = WarmupLinearSchedule(optimizer_feature, warmup_steps=args.warmup_steps, t_total=t_total)

            # begin training
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM doesn't use segment_ids
                'labels': None
            }

            # Initialize
            if epoch == 0 and step == 0:
                outputs = model(**inputs)
                with torch.no_grad():
                    logits_1 = outputs[0]
                    logits_2 = outputs[0]

            # Update the model (applies the gradients computed at the end of the previous step)
            # scheduler_feature.step()  # Update learning rate schedule
            # scheduler_classifier.step()
            # update classifier
            optimizer_classifier.step()
            with torch.no_grad():
                outputs = model(**inputs)
                logits_1 = outputs[0]
            # update feature
            optimizer_feature.step()
            outputs = model(**inputs)
            logits_2 = outputs[0]
            model.zero_grad()
            global_step += 1

            if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                # Log metrics
                if args.local_rank == -1 and args.evaluate_during_training:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    results = evaluate(args, model, tokenizer)
                    for key, value in results.items():
                        tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                # tb_writer.add_scalar('lr', scheduler_feature.get_lr()[0], global_step)
                tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                logging_loss = tr_loss

            if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                # Save model checkpoint
                output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                # Take care of distributed/parallel training
                model_to_save = model.module if hasattr(model, 'module') else model
                model_to_save.save_pretrained(output_dir)
                torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                logger.info("Saving model checkpoint to %s", output_dir)

            w = weight_schedule(global_step,
                                ramp_up_epochs=t_total // 3,
                                ramp_down_epochs=t_total // 10,
                                total_epochs=t_total,
                                max_val=args.max_val,
                                mult=-5.,
                                mult_down=-7.,
                                n_labeled=args.k,
                                n_samples=75000)
            tb_writer.add_scalar('layer_sep', layer_sep, global_step)
            tb_writer.add_scalar('w', w, global_step)
            # replaces the deprecated torch.autograd.Variable wrapper
            w = torch.tensor([w], dtype=torch.float, device=args.device, requires_grad=False)

            # Calculate the loss
            loss, sup_loss, unsup_loss, nbsup = gul_loss(logits_2, logits_1, w, batch[3])
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training

            # Calculate the gradients
            if args.fp16:
                raise NotImplementedError()
            else:
                loss.backward(retain_graph=True)
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            tb_writer.add_scalar('total_loss', loss.item(), global_step)
            tb_writer.add_scalar('sup_loss', sup_loss.item(), global_step)
            tb_writer.add_scalar('unsup_loss', unsup_loss.item(), global_step)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
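# The ramp-up/ramp-down helper `weight_schedule` used above is not defined in this
# file. Below is a minimal sketch, assuming the Gaussian ramp of Laine & Aila's
# temporal ensembling (exp(mult * (1 - t)^2)) and the call signature used in train();
# the shape of the original helper may differ.
import numpy as np

def weight_schedule(step, ramp_up_epochs, ramp_down_epochs, total_epochs,
                    max_val, mult, mult_down, n_labeled, n_samples):
    # Scale the ceiling by the labeled fraction, as is common for consistency losses.
    max_val = max_val * (float(n_labeled) / n_samples)
    if step < ramp_up_epochs:
        # Gaussian ramp-up from 0 to max_val.
        p = 1.0 - float(step) / ramp_up_epochs
        return max_val * np.exp(mult * p * p)
    if step > total_epochs - ramp_down_epochs:
        # Gaussian ramp-down back toward 0 at the end of training.
        p = float(step - (total_epochs - ramp_down_epochs)) / ramp_down_epochs
        return max_val * np.exp(mult_down * p * p)
    return max_val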
def _train(args):
    # initialization
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
    config = config_class.from_pretrained(args['model_name'], num_labels=2,
                                          finetuning_task=args['task_name'])
    tokenizer = tokenizer_class.from_pretrained(args['model_name'])
    model = model_class.from_pretrained(args['model_name'], config=config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        print("Training: use", torch.cuda.device_count(), "GPUs!")
        model = torch.nn.DataParallel(model)
    model.to(device)

    logger.info("Loading dataset")
    train_dataset = load_and_cache_examples(args, tokenizer, False)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args['train_batch_size'])
    print("len(train_dataloader) " + str(len(train_dataloader)))
    t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': args['weight_decay']
    }, {
        'params': [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'],
                      eps=args['adam_epsilon'])
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args['warmup_steps'],
                                     t_total=t_total)
    if args['fp16']:
        # amp must be imported and initialized before amp.scale_loss is used below.
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args['fp16_opt_level'])

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args['num_train_epochs'])
    logger.info("  Total train batch size = %d", args['train_batch_size'])
    logger.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    for _ in range(args['num_train_epochs']):
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,  # XLM doesn't use segment_ids
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[0].mean()  # model outputs are always tuples in pytorch-transformers (see doc)
            print("\r%f" % loss, end='')

            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']

            if args['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args['max_grad_norm'])
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])

            tr_loss += loss.item()
            if (step + 1) % args['gradient_accumulation_steps'] == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule after the optimizer step
                model.zero_grad()
                global_step += 1

                if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                    logging_loss = tr_loss

                if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args['output_dir'],
                                              'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)

    logger.info("starting evaluating")
    checkpoints = list(
        os.path.dirname(c)
        for c in sorted(glob.glob(args['output_dir'] + '/**/' + WEIGHTS_NAME, recursive=True)))
    logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
    logger.info("Evaluate the following checkpoints: %s", checkpoints)
    best_result = None
    best_checkpoint = None
    results = []
    for checkpoint in checkpoints:
        global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
        model = model_class.from_pretrained(checkpoint)
        model.to(device)
        result = evaluate(model, tokenizer, prefix=global_step)
        logger.info("  result,{%s}", result)
        if best_result is None or result['matthews_corrcoef'] > best_result['matthews_corrcoef']:
            best_result = result
            best_checkpoint = checkpoint
            logger.info("best result, Saving model checkpoint to %s", best_checkpoint)
        result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
        results.append(result)

    # save best model
    model = model_class.from_pretrained(best_checkpoint)
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(args['model_dir'])
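# `evaluate` is defined elsewhere; _train only relies on it returning a dict with a
# 'matthews_corrcoef' key. A minimal sketch is given below, assuming an
# `eval_dataloader` and `device` set up the same way as in _train (both names here
# are illustrative, not the original code):
from sklearn.metrics import matthews_corrcoef

def evaluate(model, tokenizer, prefix=""):
    model.eval()
    preds, labels = [], []
    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            # without labels, pytorch-transformers classification models return logits first
            logits = model(input_ids=batch[0], attention_mask=batch[1],
                           token_type_ids=batch[2])[0]
        preds.extend(logits.argmax(dim=-1).cpu().tolist())
        labels.extend(batch[3].cpu().tolist())
    return {'matthews_corrcoef': matthews_corrcoef(labels, preds)}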
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                              disable=args.local_rank not in [-1, 0], ncols=8)
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM doesn't use segment_ids
                'labels': batch[3]
            }
            # print(inputs)
            # from pdb import set_trace; set_trace()
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuples in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            # logging
            epoch_iterator.desc = "[{}] Loss:{:.2f} lr:{:.1e}".format(
                epoch, tr_loss / (step + 1), scheduler.get_lr()[0])

            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule after the optimizer step
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    # output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    output_dir = os.path.join(args.output_dir, 'checkpoint')
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    # logger.info("Saving model checkpoint to %s", output_dir)
                    # TODO (1) validation, (2) early stopping, (3) save multiple checkpoints

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
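# `set_seed` is referenced above but not defined in this file. The pytorch-transformers
# examples define it essentially as follows (a sketch; the original may differ slightly):
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)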
#print("Source:", row["source"]) #print("Label:", row["source_label"]) #print("") print("Bert...") start_time = time() max_val_accs = defaultdict(list) max_test_accs = defaultdict(list) for test_fold in range(args.n_folds): print("Fold", test_fold) model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=3) model.to(torch.device("cuda")) optimizer = AdamW(model.parameters(), lr=args.learn_rate, eps=args.adam_epsilon) test_data = fold2data[test_fold] val_fold = (test_fold + 1) % args.n_folds val_data = fold2data[val_fold] train_data = [row for fold, data in fold2data.items() for row in data \ if fold not in [test_fold, val_fold]] acc = defaultdict(lambda: None) for epoch in range(args.n_epochs): print("Epoch:", epoch + 1) model.train() train_loss = val_loss = test_loss = 0 for row in train_data:
def main():
    parser = argparse.ArgumentParser(description='openGPT-2 analysis')
    parser.add_argument('--mode',
                        choices=['train', 'eval-singletoken', 'eval-completion', 'eval-both'],
                        default='eval-singletoken')
    parser.add_argument('--eval-split', choices=['train', 'valid', 'test'])
    parser.add_argument('--model-name',
                        choices=['gpt2', 'gpt2-medium', 'gpt2-large'],
                        default='gpt2-medium')
    parser.add_argument('--model-load-dir', type=str, default=None)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--data-base', type=str)
    parser.add_argument('--num-train-epochs', type=int, default=1)
    parser.add_argument('--batch-size-singletoken', type=int, default=1024)
    parser.add_argument('--batch-size-completion', type=int, default=300)
    parser.add_argument("--output-dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    # eval-completion
    parser.add_argument('--prefix-length', type=int, default=50)
    parser.add_argument('--continuation-length', type=int, default=100)
    parser.add_argument('--top-k', type=int, default=1)
    parser.add_argument('--top-p', type=float, default=0.0)

    # custom training
    parser.add_argument('--sequence-tune-rate', type=float, default=0.5)
    parser.add_argument('--train-batch-size', type=int, default=300)
    parser.add_argument('--report-metrics-every', type=int, default=10)
    parser.add_argument('--save-every', type=int, default=1000)
    parser.add_argument('--sequence-ngram-n', type=int, default=4)
    parser.add_argument('--train-n-steps', type=int, default=10000)
    parser.add_argument('--validate-every', type=int, default=10000)

    # training loop
    parser.add_argument("--adam-epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max-grad-norm', type=int, default=1)
    parser.add_argument("--max-steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument('--gradient-accumulation-steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--learning-rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup-steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr-schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight-decay', type=float, default=0.01)
    parser.add_argument('--lm-coef', type=float, default=0.9)
    args = parser.parse_args()
    print(args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    dataset_paths = {
        'train': os.path.join(args.data_base, 'train_tokens_bpe_gpt2.pt'),
        'valid': os.path.join(args.data_base, 'valid_tokens_bpe_gpt2.pt'),
        'test': os.path.join(args.data_base, 'test_tokens_bpe_gpt2.pt'),
    }

    if args.model_load_dir:
        model = GPT2LMHeadModel.from_pretrained(args.model_load_dir)
    else:
        model = GPT2LMHeadModel.from_pretrained(args.model_name)
    model.to(device)

    if args.mode == 'eval-singletoken' or args.mode == 'eval-both':
        eval_singletoken(model, args, dataset_paths)

    if args.mode == 'eval-completion' or args.mode == 'eval-both':
        datasets = get_datasets(dataset_paths, max_len=args.batch_size_completion)
        eval_sampler = SequentialSampler(datasets[args.eval_split])
        eval_dataloader = DataLoader(datasets[args.eval_split], sampler=eval_sampler, batch_size=1)

        model.eval()
        with torch.no_grad():
            all_text_completions = []
            bpe_ngram_metrics = Metrics(pad=-1)
            word_ngram_metrics = Metrics(pad=-1)

            for i, batch in tqdm(enumerate(eval_dataloader), desc="Evaluating",
                                 total=len(eval_dataloader)):
                input_sequence = batch[0].cuda()
                if input_sequence.size(1) < args.prefix_length:
                    continue

                # Predict the completions.
                batch = batch_input_sequence_by_prefix_length(input_sequence, args.prefix_length)
                bpe_completions, _ = sample_sequence(model, batch, args.prefix_length,
                                                     args.continuation_length,
                                                     args.top_k, args.top_p)
                bpe_completions = bpe_completions.tolist()

                # Extract continuations from the predicted completions.
                bpe_continuations = []
                text_continuations = []
                for bpe_completion in bpe_completions:
                    bpe_continuations.append(bpe_completion[args.prefix_length:])
                    text_continuations.append(get_text_continuation(bpe_completion, tokenizer, args))
                    all_text_completions.append(tokenizer.decode(bpe_completion))

                # Only keep continuations with at least one 4-gram
                # (A short continuation may occur due to predicted whitespace, then tokenizing,
                # despite being normal length in BPE tokens).
                text_continuations = [c for c in text_continuations if len(c) > 3]

                # Update metrics with this batch of continuations.
                bpe_ngram_metrics.update(bpe_continuations)
                word_ngram_metrics.update(text_continuations)

            # Save the (possibly intermediate) metrics.
            save_completion_metrics(bpe_metrics=bpe_ngram_metrics.report('bpe_%s' % args.eval_split),
                                    word_metrics=word_ngram_metrics.report('word_%s' % args.eval_split),
                                    text_completions=all_text_completions,
                                    config=model.config.to_dict(),
                                    args=args)

    if args.mode == 'train':
        if not os.path.exists(os.path.join(args.output_dir, 'best')):
            os.makedirs(os.path.join(args.output_dir, 'best'))

        token_loss = mle_loss

        datasets = get_datasets(dataset_paths, max_len=args.train_batch_size)
        train_sampler = RandomSampler(datasets['train'])
        train_seq_dataloader = DataLoader(datasets['train'], sampler=train_sampler, batch_size=1)

        # Setup optimizer
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (
                len(train_seq_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_seq_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

        total_steps = 0
        best_ppl = 1e20
        for _ in trange(args.num_train_epochs, desc="Epoch"):
            logging_outputs = []
            epoch_loss = 0
            epoch_steps = 0
            tqdm_bar = tqdm(train_seq_dataloader, desc="Training", total=args.train_n_steps)
            for step, batch in enumerate(tqdm_bar):
                optimizer.zero_grad()

                # Sequence loss
                if torch.rand(1).item() < args.sequence_tune_rate:
                    if batch[0].size(1) < args.prefix_length:
                        continue
                    loss, batch_metrics = ul_seq(model, batch, args)
                # Token loss
                else:
                    loss, batch_metrics = token_loss(model, batch, args)

                loss.backward()
                optimizer.step()
                scheduler.step()
                epoch_loss += loss.item()
                epoch_steps += 1
                total_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    epoch_loss / epoch_steps, scheduler.get_lr()[0])

                logging_outputs.append(batch_metrics)
                if epoch_steps % args.report_metrics_every == 0:
                    logging_average = CrossEntropyCriterionWCustomMetrics.aggregate_logging_outputs(logging_outputs)
                    temp = SequencePenaltyCriterion.aggregate_logging_outputs(logging_outputs)
                    for k, v in temp.items():
                        logging_average[k] = v
                    logging_average['ppl'] = 2 ** logging_average['loss']
                    print(logging_average)
                    logging_outputs = []

                if step == args.train_n_steps:
                    break

                if epoch_steps % args.save_every == 0:
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    model_to_save.config.to_json_file(output_config_file)
                    tokenizer.save_vocabulary(args.output_dir)

                if total_steps % args.validate_every == 0:
                    print("Validating...")
                    validation_outputs = eval_singletoken(model, args, dataset_paths,
                                                          train_iter=total_steps)
                    if validation_outputs['ppl'] < best_ppl:
                        best_ppl = validation_outputs['ppl']
                        model_to_save = model.module if hasattr(model, 'module') else model
                        output_model_file = os.path.join(args.output_dir, 'best', WEIGHTS_NAME)
                        output_config_file = os.path.join(args.output_dir, 'best', CONFIG_NAME)
                        torch.save(model_to_save.state_dict(), output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(os.path.join(args.output_dir, 'best'))
                        save_singletoken_metrics(validation_outputs, model.config.to_dict(), args,
                                                 train_iter=total_steps, best=True)
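# `sample_sequence` above draws completions with top-k/top-p filtering. The filtering
# step is typically implemented as below, following the well-known
# top_k_top_p_filtering helper from the pytorch-transformers examples (a sketch; the
# repo's own version may differ in details):
import torch
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('inf')):
    if top_k > 0:
        # Remove all tokens whose logit is below the k-th largest logit.
        threshold = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < threshold] = filter_value
    if top_p > 0.0:
        # Keep the smallest set of tokens whose cumulative probability exceeds top_p.
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift right so the first token above the threshold is kept.
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        # Map the removal mask from sorted order back to vocabulary order.
        indices_to_remove = sorted_indices_to_remove.scatter(
            -1, sorted_indices, sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits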
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default='/hdd/lujunyu/dataset/multi_turn_corpus/ubuntu/',
                        type=str, required=False,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name", default='ubuntu', type=str, required=False,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default='/hdd/lujunyu/model/chatbert/ubuntu_base_sp/',
                        type=str, required=False,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--init_model_name", default='bert-base-uncased', type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--data_augmentation", default=False, action='store_true',
                        help="Whether to use augmentation")
    parser.add_argument("--max_seq_length", default=256, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=True, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test", default=True, action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--train_batch_size", default=500, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=100, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=10.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps", default=0.0, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay", default=1e-3, type=float,
                        help="weight_decay")
    parser.add_argument("--save_checkpoints_steps", default=5000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=10,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r",
                device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # NOTE: this script defines --do_test (not --do_eval), so check do_test here.
    if not args.do_train and not args.do_test:
        raise ValueError("At least one of `do_train` or `do_test` must be True.")

    bert_config = BertConfig.from_pretrained(args.init_model_name, num_labels=2)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
                args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        if args.do_train:
            raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.init_model_name, do_lower_case=args.do_lower_case)
    if args.data_augmentation:
        train_dataset = UbuntuDatasetForSP(file_path=os.path.join(args.data_dir, "train_augment_3.txt"),
                                           max_seq_length=args.max_seq_length,
                                           tokenizer=tokenizer)
    else:
        train_dataset = UbuntuDatasetForSP(file_path=os.path.join(args.data_dir, "train.txt"),
                                           max_seq_length=args.max_seq_length,
                                           tokenizer=tokenizer)
    eval_dataset = UbuntuDatasetForSP(file_path=os.path.join(args.data_dir, "valid.txt"),
                                      max_seq_length=args.max_seq_length,
                                      tokenizer=tokenizer)

    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.train_batch_size,
                                                   sampler=RandomSampler(train_dataset),
                                                   num_workers=4)
    eval_dataloader = torch.utils.data.DataLoader(eval_dataset,
                                                  batch_size=args.eval_batch_size,
                                                  sampler=SequentialSampler(eval_dataset),
                                                  num_workers=4)

    model = BertForSequenceClassification.from_pretrained(args.init_model_name, config=bert_config)
    model.to(device)

    num_train_steps = None
    if args.do_train:
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        # remove pooler, which is not used; it produces None grads that break apex
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                         t_total=num_train_steps)
    else:
        optimizer = None
        scheduler = None

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    best_metric = 0.0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, token_type_ids=segment_ids,
                                attention_mask=input_mask, labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

                if (step + 1) % args.save_checkpoints_steps == 0:
                    model.eval()
                    f = open(os.path.join(args.output_dir, 'logits_dev.txt'), 'w')
                    eval_loss = 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    logits_all = []
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss, logits = model(input_ids,
                                                          token_type_ids=segment_ids,
                                                          attention_mask=input_mask,
                                                          labels=label_ids)

                        logits = logits.detach().cpu().numpy()
                        logits_all.append(logits)
                        label_ids = label_ids.cpu().numpy()

                        for logit, label in zip(logits, label_ids):
                            logit = '{},{}'.format(logit[0], logit[1])
                            f.write('_\t{}\t{}\n'.format(logit, label))

                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    f.close()
                    logits_all = np.concatenate(logits_all, axis=0)
                    eval_loss = eval_loss / nb_eval_steps

                    result = evaluate(os.path.join(args.output_dir, 'logits_dev.txt'))
                    result.update({'eval_loss': eval_loss})

                    output_eval_file = os.path.join(args.output_dir, "eval_results_dev.txt")
                    with open(output_eval_file, "a") as writer:
                        logger.info("***** Eval results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))

                    ### Save the best checkpoint
                    if best_metric < result['R10@1'] + result['R10@2']:
                        try:
                            ### Remove 'module' prefix when using DataParallel
                            state_dict = model.module.state_dict()
                        except AttributeError:
                            state_dict = model.state_dict()
                        torch.save(state_dict, os.path.join(args.output_dir, "model.pt"))
                        best_metric = result['R10@1'] + result['R10@2']
                        logger.info('Saving the best model in {}'.format(
                            os.path.join(args.output_dir, "model.pt")))

                        ### visualize bad cases of the best model
                        logger.info('Saving Bad cases...')
                        visualize_bad_cases(
                            logits=logits_all,
                            input_file_path=os.path.join(args.data_dir, 'valid.txt'),
                            output_file_path=os.path.join(args.output_dir, 'valid_bad_cases.txt'))

                    model.train()
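# The `evaluate` call above reads logits_dev.txt and computes Ubuntu-corpus ranking
# metrics (R10@1, R10@2, ...). A minimal sketch of recall@k over groups of 10
# candidates, assuming each context comes with its true response's score first
# (the usual Ubuntu layout; the original script may parse and group differently):
def recall_at_k(score_groups, k):
    """score_groups: list of 10-candidate score lists, true response's score at index 0."""
    hits = 0
    for scores in score_groups:
        # Rank of the true response = 1 + number of candidates that strictly beat it.
        rank = 1 + sum(s > scores[0] for s in scores[1:])
        if rank <= k:
            hits += 1
    return hits / len(score_groups)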
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval or not.")
    parser.add_argument("--eval_on", default="dev",
                        help="Whether to run eval on the dev set or test set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay", default=0.01, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # n_gpu = torch.cuda.device_count()
        # n_gpu = len([1, 2, 3, 4, 5, 6, 7])
        n_gpu = 1  # disable multi gpu training by me
        print(n_gpu, device)
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = 0
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    # Prepare model
    config = BertConfig.from_pretrained(args.bert_model, num_labels=num_labels,
                                        finetuning_task=args.task_name)
    model = Ner.from_pretrained(args.bert_model, from_tf=False, config=config)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=[1, 2, 3, 4, 5, 6, 7])

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_label_ids, all_valid_ids, all_lmask_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {"bert_model": args.bert_model,
                        "do_lower": args.do_lower_case,
                        "max_seq_length": args.max_seq_length,
                        "num_labels": len(label_list) + 1,
                        "label_map": label_map}
        json.dump(model_config, open(os.path.join(args.output_dir, "model_config.json"), "w"))
        # Load a trained model and config that you have fine-tuned
    else:
        # Load a trained model and vocabulary that you have fine-tuned
        model = Ner.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples(args.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                  all_label_ids, all_valid_ids, all_lmask_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            l_mask = l_mask.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask,
                               valid_ids=valid_ids, attention_mask_label=l_mask)

            logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            input_mask = input_mask.to('cpu').numpy()

            for i, label in enumerate(label_ids):
                temp_1 = []
                temp_2 = []
                for j, m in enumerate(label):
                    if j == 0:
                        continue
                    elif label_ids[i][j] == len(label_map):
                        y_true.append(temp_1)
                        y_pred.append(temp_2)
                        break
                    else:
                        temp_1.append(label_map[label_ids[i][j]])
                        temp_2.append(label_map[logits[i][j]])

        report = classification_report(y_true, y_pred, digits=4)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
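# The extra `valid_ids` / `l_mask` tensors above align WordPiece sub-tokens with
# word-level NER labels: only the first sub-token of each word is marked valid, and
# labels/predictions are read at those positions. A sketch of how such a mask is
# typically built during feature conversion (illustrative, not the original helper):
def build_valid_ids(words, tokenizer):
    tokens, valid_ids = [], []
    for word in words:
        sub_tokens = tokenizer.tokenize(word)
        for k, sub_token in enumerate(sub_tokens):
            tokens.append(sub_token)
            valid_ids.append(1 if k == 0 else 0)  # label only the first sub-token
    return tokens, valid_ids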
def train(model, criterion, dataset,
          logger, train_csv_logger, val_csv_logger, test_csv_logger,
          args, epoch_offset):
    model = model.cuda()

    # process generalization adjustment stuff
    adjustments = [float(c) for c in args.generalization_adjustment.split(',')]
    assert len(adjustments) in (1, dataset['train_data'].n_groups)
    if len(adjustments) == 1:
        adjustments = np.array(adjustments * dataset['train_data'].n_groups)
    else:
        adjustments = np.array(adjustments)

    train_loss_computer = LossComputer(
        criterion,
        is_robust=args.robust,
        dataset=dataset['train_data'],
        alpha=args.alpha,
        gamma=args.gamma,
        adj=adjustments,
        step_size=args.robust_step_size,
        normalize_loss=args.use_normalized_loss,
        btl=args.btl,
        sp=args.sp,
        sup=args.sup,
        half=args.half,
        min_var_weight=args.minimum_variational_weight)

    # BERT uses its own scheduler and optimizer
    if args.model == 'bert':
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args.lr,
            eps=args.adam_epsilon)
        t_total = len(dataset['train_loader']) * args.n_epochs
        print(f'\nt_total is {t_total}\n')
        scheduler = WarmupLinearSchedule(
            optimizer,
            warmup_steps=args.warmup_steps,
            t_total=t_total)
    else:
        if args.adam:
            optimizer = torch.optim.Adam(
                filter(lambda p: p.requires_grad, model.parameters()),
                lr=args.lr,
                weight_decay=args.weight_decay)
        else:
            optimizer = torch.optim.SGD(
                filter(lambda p: p.requires_grad, model.parameters()),
                lr=args.lr,
                momentum=0.9,
                weight_decay=args.weight_decay)
        if args.scheduler:
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                'min',
                factor=0.1,
                patience=5,
                threshold=0.0001,
                min_lr=0,
                eps=1e-08)
        else:
            scheduler = None

    best_val_acc = 0
    for epoch in range(epoch_offset, epoch_offset + args.n_epochs):
        logger.write('\nEpoch [%d]:\n' % epoch)
        logger.write(f'Training:\n')
        run_epoch(
            epoch, model, optimizer,
            dataset['train_loader'],
            train_loss_computer,
            logger, train_csv_logger, args,
            is_training=True,
            show_progress=args.show_progress,
            log_every=args.log_every,
            scheduler=scheduler)

        logger.write(f'\nValidation:\n')
        val_loss_computer = LossComputer(
            criterion,
            is_robust=args.robust,
            dataset=dataset['val_data'],
            step_size=args.robust_step_size,
            alpha=args.alpha)
        run_epoch(
            epoch, model, optimizer,
            dataset['val_loader'],
            val_loss_computer,
            logger, val_csv_logger, args,
            is_training=False)

        # Test set; don't print to avoid peeking
        if dataset['test_data'] is not None:
            test_loss_computer = LossComputer(
                criterion,
                is_robust=args.robust,
                dataset=dataset['test_data'],
                step_size=args.robust_step_size,
                alpha=args.alpha)
            run_epoch(
                epoch, model, optimizer,
                dataset['test_loader'],
                test_loss_computer,
                None, test_csv_logger, args,
                is_training=False)

        # Inspect learning rates
        if (epoch + 1) % 1 == 0:
            for param_group in optimizer.param_groups:
                curr_lr = param_group['lr']
                logger.write('Current lr: %f\n' % curr_lr)

        if args.scheduler and args.model != 'bert':
            if args.robust:
                val_loss, _ = val_loss_computer.compute_robust_loss_greedy(
                    val_loss_computer.avg_group_loss, val_loss_computer.avg_group_loss)
            else:
                val_loss = val_loss_computer.avg_actual_loss
            scheduler.step(val_loss)  # scheduler step to update lr at the end of epoch

        if epoch % args.save_step == 0:
            torch.save(model, os.path.join(args.log_dir, '%d_model.pth' % epoch))

        if args.save_last:
            torch.save(model, os.path.join(args.log_dir, 'last_model.pth'))

        if args.save_best:
            if args.robust or args.reweight_groups:
                curr_val_acc = min(val_loss_computer.avg_group_acc)
            else:
                curr_val_acc = val_loss_computer.avg_acc
            logger.write(f'Current validation accuracy: {curr_val_acc}\n')
            if curr_val_acc > best_val_acc:
                best_val_acc = curr_val_acc
                torch.save(model, os.path.join(args.log_dir, 'best_model.pth'))
                logger.write(f'Best model saved at epoch {epoch}\n')

        if args.automatic_adjustment:
            gen_gap = val_loss_computer.avg_group_loss - train_loss_computer.exp_avg_loss
            adjustments = gen_gap * torch.sqrt(train_loss_computer.group_counts)
            train_loss_computer.adj = adjustments
            logger.write('Adjustments updated\n')
            for group_idx in range(train_loss_computer.n_groups):
                logger.write(
                    f'  {train_loss_computer.get_group_name(group_idx)}:\t'
                    f'adj = {train_loss_computer.adj[group_idx]:.3f}\n')
        logger.write('\n')
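# When `is_robust` is set, LossComputer follows the group-DRO objective of
# Sagawa et al.: per-group losses are reweighted by exponentiated-gradient weights.
# A minimal sketch of that update, assuming per-group mean losses and a step size
# (the actual LossComputer also handles adjustments, EMAs, and normalization):
def robust_loss(group_losses, adv_probs, step_size):
    # group_losses: tensor [n_groups]; adv_probs: current weights summing to 1.
    adv_probs = adv_probs * torch.exp(step_size * group_losses.detach())
    adv_probs = adv_probs / adv_probs.sum()        # renormalize to a distribution
    return group_losses @ adv_probs, adv_probs     # weighted loss + updated weights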
def run_train(args): # --------- data processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case) label_list = processor.get_labels() label2id = {label: i for i, label in enumerate(label_list)} id2label = {i: label for i, label in enumerate(label_list)} train_data = processor.get_train(config['data_dir'] / f"{args.data_name}.train.pkl") train_examples = processor.create_examples(lines=train_data, example_type='train', cached_examples_file=config[ 'data_dir'] / f"cached_train_examples_{args.arch}") train_features = processor.create_features(examples=train_examples, max_seq_len=args.train_max_seq_len, cached_features_file=config[ 'data_dir'] / "cached_train_features_{}_{}".format( args.train_max_seq_len, args.arch )) train_dataset = processor.create_dataset(train_features, is_sorted=args.sorted) if args.sorted: train_sampler = SequentialSampler(train_dataset) else: train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) valid_data = processor.get_dev(config['data_dir'] / f"{args.data_name}.valid.pkl") valid_examples = processor.create_examples(lines=valid_data, example_type='valid', cached_examples_file=config[ 'data_dir'] / f"cached_valid_examples_{args.arch}") valid_features = processor.create_features(examples=valid_examples, max_seq_len=args.eval_max_seq_len, cached_features_file=config[ 'data_dir'] / "cached_valid_features_{}_{}".format( args.eval_max_seq_len, args.arch )) valid_dataset = processor.create_dataset(valid_features) valid_sampler = SequentialSampler(valid_dataset) valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.eval_batch_size) # ------- model logger.info("initializing model") if args.resume_path: args.resume_path = Path(args.resume_path) model = BertForMultiLable.from_pretrained(args.resume_path, num_labels=len(label_list)) else: model = BertForMultiLable.from_pretrained(config['bert_model_dir'], num_labels=len(label_list)) t_total = int(len(train_dataloader) / args.gradient_accumulation_steps * args.epochs) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay': args.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] warmup_steps = int(t_total * args.warmup_proportion) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # ---- callbacks logger.info("initializing callbacks") train_monitor = TrainingMonitor(file_dir=config['figure_dir'], arch=args.arch) model_checkpoint = ModelCheckpoint(checkpoint_dir=config['checkpoint_dir'],mode=args.mode, monitor=args.monitor,arch=args.arch, save_best_only=args.save_best) # **************************** training model *********************** logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Num Epochs = %d", args.epochs) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * ( torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) trainer = Trainer(n_gpu=args.n_gpu, model=model, epochs=args.epochs, logger=logger, criterion=BCEWithLogLoss(), optimizer=optimizer, lr_scheduler=lr_scheduler, early_stopping=None, training_monitor=train_monitor, fp16=args.fp16, resume_path=args.resume_path, grad_clip=args.grad_clip, model_checkpoint=model_checkpoint, gradient_accumulation_steps=args.gradient_accumulation_steps, batch_metrics=[AccuracyThresh(thresh=0.5)], epoch_metrics=[AUC(average='micro', task_type='binary'), MultiLabelReport(id2label=id2label)]) trainer.train(train_data=train_dataloader, valid_data=valid_dataloader, seed=args.seed)
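# WarmupLinearSchedule, as used above, ramps the learning rate linearly from 0
# over warmup_steps and then decays it linearly to 0 at t_total. A minimal
# LambdaLR equivalent, assuming those semantics (a sketch, not the library code):
from torch.optim.lr_scheduler import LambdaLR

def warmup_linear(optimizer, warmup_steps, t_total):
    def lr_lambda(step):
        if step < warmup_steps:
            return float(step) / float(max(1, warmup_steps))
        return max(0.0, float(t_total - step) / float(max(1.0, t_total - warmup_steps)))
    return LambdaLR(optimizer, lr_lambda)

# With the proportional warmup used above: warmup_steps = int(t_total * warmup_proportion).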
def train( *, input_dir, output_dir, model, config, tokenizer, train_batch_size, valid_batch_size, num_train_epochs, device, n_gpu, max_seq_length, labels, lr, adam_eps, ): """ Train the passed model on the given data. Return training/validation metrics and save the model to the specified output directory. """ # Prepare optimizer and scheduler # Adapted from https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py optimizer = AdamW(model.parameters(), lr=lr, eps=adam_eps) scheduler = WarmupLinearSchedule( optimizer, warmup_steps=0, t_total=num_train_epochs * num_batches(input_dir / "train.tsv", train_batch_size), ) model.zero_grad() for epoch in range(num_train_epochs): print(f"Start epoch {epoch}") model.train() train_loss = 0.0 train_count = 0 for X, y in tsv_to_encoded_batches(input_dir / "train.tsv", tokenizer, labels, train_batch_size, max_seq_length): X = X.to(device) y = y.to(device) outputs = model(input_ids=X, labels=y) loss = outputs[0] if n_gpu > 1: loss = loss.mean() # average loss on parallel training loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) train_loss += loss.detach().item() train_count += X.size(0) scheduler.step() optimizer.step() model.zero_grad() print(f"Epoch {epoch} train loss: {train_loss / train_count}") model.eval() valid_loss = 0.0 valid_count = 0 valid_correct = 0 for X, y in tsv_to_encoded_batches(input_dir / "dev.tsv", tokenizer, labels, valid_batch_size, max_seq_length): X = X.to(device) y = y.to(device) with torch.no_grad(): outputs = model(input_ids=X, labels=y) loss = outputs[0] logits = outputs[1] # Keep preds on the device rather than moving to CPU, since we'll compare to y, # which is on the device preds = torch.argmax(logits.detach(), 1) if n_gpu > 1: loss = loss.mean() # average loss on parallel training valid_loss += loss.item() valid_count += X.size(0) valid_correct += (y == preds).sum().item() print(f"Epoch {epoch} validation loss: {valid_loss / valid_count}") checkpoint_dir = output_dir / "checkpoint" checkpoint_dir.mkdir(exist_ok=True, parents=True) if hasattr(model, "module"): # DataParallel object -- unpack the module before saving model.module.save_pretrained(checkpoint_dir) else: # Plain pytorch_transformers model model.save_pretrained(checkpoint_dir) tokenizer.save_pretrained(checkpoint_dir) config.save_pretrained(checkpoint_dir) return { "mean_train_loss": train_loss / train_count, "mean_valid_loss": valid_loss / valid_count, "valid_accuracy": valid_correct / valid_count, }
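# `num_batches` is referenced above but not defined in this snippet. A plausible
# minimal implementation (an assumption, not the original helper) counts the TSV
# rows and rounds up by the batch size:
import math

def num_batches(tsv_path, batch_size):
    with open(tsv_path, encoding="utf-8") as f:
        n_rows = sum(1 for _ in f)
    return math.ceil(n_rows / batch_size)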
param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=t_total) # PyTorch scheduler if args.do_train: nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch)
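# The bias/LayerNorm weight-decay grouping above recurs throughout this file; a
# small reusable helper (a sketch, not part of the original code) that builds
# the same two parameter groups:
def grouped_parameters(model, weight_decay,
                       no_decay=('bias', 'LayerNorm.bias', 'LayerNorm.weight')):
    named = list(model.named_parameters())
    return [
        {'params': [p for n, p in named if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in named if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]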
def main(parser): # Config args = parser.parse_args() data_dir = Path(args.data_dir) model_dir = Path(args.model_dir) # data_config = Config(json_path=data_dir / 'config.json') model_config = Config(json_path=model_dir / 'config.json') # Vocab & Tokenizer tok_path = get_tokenizer() # ./tokenizer_78b3253a26.model ptr_tokenizer = SentencepieceTokenizer(tok_path) _, vocab_of_gluonnlp = get_pytorch_kobert_model() token_to_idx = vocab_of_gluonnlp.token_to_idx model_config.vocab_size = len(token_to_idx) vocab = Vocabulary(token_to_idx=token_to_idx) print("len(token_to_idx): ", len(token_to_idx)) with open(model_dir / "token2idx_vocab.json", 'w', encoding='utf-8') as f: json.dump(token_to_idx, f, ensure_ascii=False, indent=4) # save vocab & tokenizer with open(model_dir / "vocab.pkl", 'wb') as f: pickle.dump(vocab, f) # load vocab & tokenizer with open(model_dir / "vocab.pkl", 'rb') as f: vocab = pickle.load(f) tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen) ner_formatter = NamedEntityRecognitionFormatter(vocab=vocab, tokenizer=tokenizer, maxlen=model_config.maxlen, model_dir=model_dir) # Train & Val Datasets cwd = Path.cwd() data_in = cwd / "data_in" train_data_dir = data_in / "NER-master" / "말뭉치 - 형태소_개체명" print("model_config.batch_size: ", model_config.batch_size) tr_clf_ds = NamedEntityRecognitionDataset(train_data_dir=train_data_dir, model_dir=model_dir) tr_clf_ds.set_transform_fn(transform_source_fn=ner_formatter.transform_source_fn, transform_target_fn=ner_formatter.transform_target_fn) tr_clf_dl = DataLoader(tr_clf_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4, drop_last=False) # Model model = KobertBiGRUCRF(config=model_config, num_classes=len(tr_clf_ds.ner_to_index)) model.train() # optim train_examples_len = len(tr_clf_ds) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}] # num_train_optimization_steps = int(train_examples_len / model_config.batch_size / model_config.gradient_accumulation_steps) * model_config.epochs t_total = len(tr_clf_dl) // model_config.gradient_accumulation_steps * model_config.epochs optimizer = AdamW(optimizer_grouped_parameters, lr=model_config.learning_rate, eps=model_config.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=model_config.warmup_steps, t_total=t_total) device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') n_gpu = torch.cuda.device_count() # if n_gpu > 1: # model = torch.nn.DataParallel(model) model.to(device) # save tb_writer = SummaryWriter('{}/runs'.format(model_dir)) checkpoint_manager = CheckpointManager(model_dir) summary_manager = SummaryManager(model_dir) best_val_loss = 1e+10 best_train_acc = 0 # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(tr_clf_ds)) logger.info(" Num Epochs = %d", model_config.epochs) logger.info(" Instantaneous batch size per GPU = %d", model_config.batch_size) # logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", # args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", model_config.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 best_dev_acc, best_dev_loss = 0.0, 99999999999.0 best_steps = 0 model.zero_grad() set_seed() # Added here for reproducibility (even between python 2 and 3) # Train train_iterator = trange(int(model_config.epochs), desc="Epoch") for _epoch, _ in enumerate(train_iterator): epoch_iterator = tqdm(tr_clf_dl, desc="Iteration") # , disable=args.local_rank not in [-1, 0] epoch = _epoch for step, batch in enumerate(epoch_iterator): model.train() x_input, token_type_ids, y_real = map(lambda elm: elm.to(device), batch) log_likelihood, sequence_of_tags = model(x_input, token_type_ids, y_real) # loss: negative log-likelihood loss = -1 * log_likelihood if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if model_config.gradient_accumulation_steps > 1: loss = loss / model_config.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), model_config.max_grad_norm) tr_loss += loss.item() if (step + 1) % model_config.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 with torch.no_grad(): sequence_of_tags = torch.tensor(sequence_of_tags) print("sequence_of_tags: ", sequence_of_tags) print("y_real: ", y_real) print("loss: ", loss) print("(sequence_of_tags == y_real): ", (sequence_of_tags == y_real)) mb_acc = (sequence_of_tags == y_real).float()[y_real != vocab.PAD_ID].mean() tr_acc = mb_acc.item() tr_loss_avg = tr_loss / global_step tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc} # if step % 50 == 0: print('epoch : {}, global_step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'.format(epoch + 1, global_step, tr_summary['loss'], tr_summary['acc'])) if model_config.logging_steps > 0 and global_step % model_config.logging_steps == 0: # Log metrics if model_config.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well pass tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / model_config.logging_steps, global_step) logger.info("Average loss: %s at global step: %s", str((tr_loss - logging_loss) / model_config.logging_steps), str(global_step)) logging_loss = tr_loss if model_config.save_steps > 0 and global_step % model_config.save_steps == 0: # Save model checkpoint output_dir = os.path.join(model_config.output_dir, 'epoch-{}'.format(epoch + 1)) if not os.path.exists(output_dir): os.makedirs(output_dir) logger.info("Saving model checkpoint to %s", output_dir) state = {'global_step': global_step + 1, 'model_state_dict': model.state_dict(), 'opt_state_dict': optimizer.state_dict()} summary = {'train': tr_summary} summary_manager.update(summary) summary_manager.save('summary.json') is_best = tr_acc >= best_train_acc # selected by train acc (should really use val_acc here, not train_acc) # Save if is_best: best_train_acc = tr_acc checkpoint_manager.save_checkpoint(state, 'best-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1, global_step, tr_acc)) else: torch.save(state, os.path.join(output_dir, 'model-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1, global_step, tr_acc))) tb_writer.close() logger.info(" global_step = %s, average
loss = %s", global_step, tr_loss / global_step) return global_step, tr_loss / global_step, best_steps
def main(): parser = argparse.ArgumentParser("") parser.add_argument("--model", type=str, default='') parser.add_argument("--resume", action='store_true') parser.add_argument("--eval", action='store_true') parser.add_argument("--batch_size", type=int, default=CFG.batch_size) parser.add_argument("--nepochs", type=int, default=CFG.num_train_epochs) parser.add_argument("--wsteps", type=int, default=CFG.warmup_steps) parser.add_argument("--nlayers", type=int, default=CFG.num_hidden_layers) parser.add_argument("--nahs", type=int, default=CFG.num_attention_heads) parser.add_argument("--seed", type=int, default=7) parser.add_argument("--lr", type=float, default=CFG.learning_rate) parser.add_argument("--dropout", type=float, default=CFG.dropout) parser.add_argument("--types", nargs='+', type=str, default=['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN'], help='3JHC,2JHC,1JHC,3JHH,2JHH,3JHN,2JHN,1JHN') parser.add_argument("--train_file", default="train_mute_cp") parser.add_argument("--test_file", default="test_mute_cp") parser.add_argument("--pseudo_path", default="") parser.add_argument("--pseudo", action='store_true') parser.add_argument("--gen_pseudo", action='store_true') parser.add_argument("--use_all", action='store_true') parser.add_argument("--structure_file", default="structures_mu") parser.add_argument("--contribution_file", default="scalar_coupling_contributions") args = parser.parse_args() print(args) CFG.batch_size=args.batch_size CFG.num_train_epochs=args.nepochs CFG.warmup_steps=args.wsteps CFG.num_hidden_layers=args.nlayers CFG.num_attention_heads=args.nahs CFG.learning_rate=args.lr CFG.dropout=args.dropout CFG.seed = args.seed print(CFG.__dict__) random.seed(CFG.seed) np.random.seed(CFG.seed) torch.manual_seed(CFG.seed) #if not args.eval: if True: train_df = load_csv(args.train_file) structures_df = load_csv(args.structure_file) structures_df[['x', 'y', 'z']] -= structures_df.groupby('molecule_name')[['x', 'y', 'z']].transform('mean') contributions_df = load_csv(args.contribution_file) train_df = train_df.merge(contributions_df, how='left') train_df = normalize_cols(train_df, ['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']) train_df = add_extra_features(train_df, structures_df) train_df = train_df.fillna(1e08) n_mols = train_df['molecule_name'].nunique() train_df, valid_df = train_test_split(train_df, 5000 ) # only molecules with the args.types print(train_df['molecule_name'].nunique()) mol_names_with_at = train_df[train_df['type'].isin(args.types)]['molecule_name'].unique() train_df = train_df[train_df['molecule_name'].isin(mol_names_with_at)].reset_index(drop=True) print(train_df['molecule_name'].nunique()) # Print the 5 rows of valid_df to verify whether the valid_df is the same as the previous experiment. 
print(valid_df.head(5)) if args.pseudo: test_df = load_csv(args.test_file) logger.info(f'loading dataset - {args.pseudo_path} ...') test_pseudo_df = pd.read_csv(args.pseudo_path) #mol_names_jhn = train_df[test_df['type'].isin(['1JHN', '2JHN', '3JHN'])]['molecule_name'].unique() #test_df = test_df[test_df['molecule_name'].isin(mol_names_jhn)].reset_index(drop=True) test_df = add_extra_features(test_df, structures_df) test_df = test_df.set_index('id') test_pseudo_df = test_pseudo_df.set_index('id') test_df[['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']] = test_pseudo_df[['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']] test_df = test_df.reset_index() #test_df = normalize_target(test_df) test_df = normalize_cols(test_df, ['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']) #test_df = test_df.assign(fc=1e08, sd=1e08, pso=1e08, dso=1e08) train_df['weight'] = 1.0 valid_df['weight'] = 1.0 test_df['weight'] = 1.0 n_mols = test_df['molecule_name'].nunique() train_df = train_df.append(test_df).reset_index(drop=True) else: train_df['weight'] = 1.0 valid_df['weight'] = 1.0 if args.use_all: train_df = train_df.append(valid_df) print(f' n_train:{len(train_df)}, n_valid:{len(valid_df)}') config = BertConfig( 3, # not used hidden_size=CFG.hidden_size, num_hidden_layers=CFG.num_hidden_layers, num_attention_heads=CFG.num_attention_heads, intermediate_size=CFG.intermediate_size, hidden_dropout_prob=CFG.dropout, attention_probs_dropout_prob=CFG.dropout, ) model = cust_model.SelfAttn(config) if args.model != "": print("=> loading checkpoint '{}'".format(args.model)) checkpoint = torch.load(args.model) CFG.start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) print("=> loaded checkpoint '{}' (epoch {})" .format(args.model, checkpoint['epoch'])) model.cuda() def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) print('parameters: ', count_parameters(model)) n_gpu = torch.cuda.device_count() if n_gpu > 1: model = torch.nn.DataParallel(model) # to produce the submission.csv if args.eval: test_df = load_csv(args.test_file) structures_df = load_csv(args.structure_file) structures_df[['x', 'y', 'z']] -= structures_df.groupby('molecule_name')[['x', 'y', 'z']].transform('mean') test_df = add_extra_features(test_df, structures_df) test_df = test_df.assign(fc=1e08, sd=1e08, pso=1e08, dso=1e08) test_df['scalar_coupling_constant'] = 0 test_df['weight'] = 1.0 test_db = db.MolDB(test_df, CFG.max_seq_length) test_loader = DataLoader( test_db, batch_size=CFG.batch_size, shuffle=False, num_workers=CFG.num_workers) res_df = validate(test_loader, model, args.types) res_df = unnormalize_cols(res_df, cols=['fc', 'sd', 'pso', 'dso']) res_df = unnormalize_target(res_df, 'prediction1') if args.gen_pseudo: res_df['scalar_coupling_constant'] = res_df['prediction1'] res_df = res_df[res_df['id']>-1].sort_values('id') res_df[['id', 'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']].to_csv(f'pseudo_{CFG.seed}.csv', index=False) return res_df['prediction4']= res_df[['fc', 'sd', 'pso', 'dso']].sum(1) res_df['prediction']= res_df[['prediction1','prediction4']].mean(1) res_df['scalar_coupling_constant'] = res_df['prediction'] res_df = res_df[res_df['id']>-1].sort_values('id') os.makedirs('output', exist_ok=True) res_df[['id', 'scalar_coupling_constant']].to_csv(f'output/submission_{CFG.seed}.csv', index=False) return train_db = db.MolDB(train_df, CFG.max_seq_length) print('preloading dataset ...') train_db = db.MolDB_FromDB(train_db, 10) valid_db = 
db.MolDB(valid_df, CFG.max_seq_length) num_train_optimization_steps = int( len(train_db) / CFG.batch_size / CFG.gradient_accumulation_steps) * (CFG.num_train_epochs-CFG.start_epoch) print('num_train_optimization_steps', num_train_optimization_steps) train_loader = DataLoader( train_db, batch_size=CFG.batch_size, shuffle=True, num_workers=CFG.num_workers, pin_memory=True) val_loader = DataLoader( valid_db, batch_size=CFG.batch_size, shuffle=False, num_workers=CFG.num_workers) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=CFG.learning_rate, weight_decay=CFG.weight_decay, ) scheduler = WarmupLinearSchedule(optimizer, CFG.warmup_steps, t_total=num_train_optimization_steps ) def get_lr(): return scheduler.get_lr()[0] if args.model != "": if args.resume: optimizer.load_state_dict(checkpoint['optimizer']) scheduler.load_state_dict(checkpoint['scheduler']) #for param_group in optimizer.param_groups: # param_group['lr'] = CFG.learning_rate mae_log_df = checkpoint['mae_log'] del checkpoint else: mae_log_df = pd.DataFrame(columns=(['EPOCH']+['LR']+args.types + ['OVERALL']) ) os.makedirs('log', exist_ok=True) res_df = validate(val_loader, model, args.types) res_df = unnormalize_cols(res_df, cols=['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']) res_df = unnormalize_target(res_df, 'prediction1') res_df['prediction4']= res_df[['fc', 'sd', 'pso', 'dso']].sum(1) res_df['prediction']= res_df[['prediction1','prediction4']].mean(1) res_df.to_csv(f'log/valid_df_{"_".join(args.types)}.csv', index=False) overall_mae, maes = metric(res_df, args.types) print(overall_mae, maes) curr_lr = get_lr() print(f'initial learning rate:{curr_lr}') for epoch in range(CFG.start_epoch, CFG.num_train_epochs): # train for one epoch #print(adjust_learning_rate(optimizer, epoch)) train(train_loader, model, optimizer, epoch, args.types, scheduler) if epoch % CFG.test_freq == 0: res_df = validate(val_loader, model, args.types) res_df = unnormalize_cols(res_df, cols=['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']) res_df = unnormalize_target(res_df, 'prediction1') res_df['prediction4']= res_df[['fc', 'sd', 'pso', 'dso']].sum(1) res_df['prediction']= res_df[['prediction1','prediction4']].mean(1) res_df.to_csv(f'log/valid_df_{"_".join(args.types)}.csv', index=False) overall_mae, maes = metric(res_df, args.types) # write log file mae_row = dict([(typ, [mae]) for typ, mae in maes.items() if typ in args.types]) mae_row.update({'EPOCH':(epoch),'OVERALL':overall_mae, 'LR':curr_lr}) mae_log_df = mae_log_df.append(pd.DataFrame(mae_row), sort=False) print(mae_log_df.tail(20)) mae_log_df.to_csv(f'log/{"_".join(args.types)}.csv', index=False) #scheduler.step(overall_mae) curr_lr = get_lr() print(f'set the learning_rate: {curr_lr}') # evaluate on validation set batch_size = CFG.batch_size pseudo_path = '' if not args.pseudo else '_' + args.pseudo_path curr_model_name = (f'b{batch_size}_l{config.num_hidden_layers}_' f'mh{config.num_attention_heads}_h{config.hidden_size}_' f'd{CFG.dropout}_' f'ep{epoch}_{"_".join(args.types)}_s{CFG.seed}{pseudo_path}.pt') model_to_save = model.module if hasattr(model, 'module') else model # Only save the cust_model it-self 
save_checkpoint({ 'epoch': epoch + 1, 'arch': 'transformer', 'state_dict': model_to_save.state_dict(), 'mae_log': mae_log_df, 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), }, FINETUNED_MODEL_PATH, curr_model_name ) print('done')
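# `save_checkpoint` above is assumed to take (state, directory, filename); a
# minimal sketch consistent with that call site:
import os
import torch

def save_checkpoint(state, model_dir, filename):
    os.makedirs(model_dir, exist_ok=True)
    torch.save(state, os.path.join(model_dir, filename))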
def train(args, train_dataset, model_vae, encoder_tokenizer, decoder_tokenizer, table_name): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) # model_encoder, model_decoder, model_connector = model_vae.encoder, model_vae.decoder, model_vae.linear no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model_vae.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model_vae.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model_vae, optimizer = amp.initialize(model_vae, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model_vae = torch.nn.DataParallel(model_vae, device_ids=range(args.n_gpu)).to(args.device) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model_vae = torch.nn.parallel.DistributedDataParallel(model_vae, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) model_vae = model_vae.module if hasattr(model_vae, 'module') else model_vae # Take care of distributed/parallel training # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model_vae.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) n_iter = int(args.num_train_epochs) * len(train_dataloader) beta_t_list = frange_cycle_zero_linear(n_iter, start=0.0, stop=args.beta, n_cycle=1, ratio_increase=args.ratio_increase, ratio_zero=args.ratio_zero) tmp_list = [] set_seed(args) # Added here for reproducibility (even between python 2 and 3) for epoch in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): tokenized_text0, tokenized_text1, tokenized_text_lengths = batch # tokenized_text0 = tokenized_text0.to(args.device) # tokenized_text1 = tokenized_text1.to(args.device) # prepare input-output data for reconstruction # pdb.set_trace() max_len_values, _ = tokenized_text_lengths.max(0) tokenized_text0 = tokenized_text0[:,:max_len_values[0]] tokenized_text1 = tokenized_text1[:,:max_len_values[1]] inputs, labels = mask_tokens(tokenized_text0, encoder_tokenizer, args) if args.mlm else (tokenized_text0, tokenized_text1) labels = tokenized_text1 tokenized_text1 = tokenized_text1.to(args.device) inputs = inputs.to(args.device) labels = labels.to(args.device) model_vae.train() beta_t = beta_t_list[step + epoch*len(epoch_iterator)] model_vae.args.beta = beta_t if beta_t == 0.0: model_vae.args.fb_mode = 0 else: model_vae.args.fb_mode = 1 if args.use_deterministic_connect: model_vae.args.fb_mode = 2 loss_rec, loss_kl, loss = model_vae(inputs, labels) # pdb.set_trace() # Chunyuan: loss_rec size is [4], while latent_z size is [12] if args.n_gpu > 1: loss_rec = loss_rec.mean() # mean() to average on multi-gpu parallel training loss_kl = loss_kl.mean() loss = loss.mean() if args.use_philly: print("PROGRESS: {}%".format(round(100 * (step + epoch*len(epoch_iterator) ) /(int(args.num_train_epochs) * len(epoch_iterator)) , 4))) print("EVALERR: {}%".format(loss_rec)) epoch_iterator.set_description( ( f'iter: {step + epoch*len(epoch_iterator) }; loss: {loss.item():.3f}; ' f'loss_rec: {loss_rec.item():.3f}; loss_kl: {loss_kl.item():.3f}; ' f'beta: {model_vae.args.beta:.3f}' ) ) if global_step % 5 == 0: row = { 'PartitionKey': 'MILU_Rule_Rule_Template', 'RowKey': str(datetime.now()), 'ExpName' : args.ExpName, 'iter': str( step + epoch*len(epoch_iterator) ), 'loss': str( loss.item()), 'loss_rec': str(loss_rec.item()), 'loss_kl': str(loss_kl.item()), 'beta': str(model_vae.args.beta) } # pdb.set_trace() ts.insert_entity(table_name, row) # pdb.set_trace() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model_vae.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model_vae.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and 
global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model_vae, encoder_tokenizer, decoder_tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save encoder model checkpoint output_encoder_dir = os.path.join(args.output_dir, 'checkpoint-encoder-{}'.format(global_step)) if not os.path.exists(output_encoder_dir): os.makedirs(output_encoder_dir) model_encoder_to_save = model_vae.module.encoder if hasattr(model_vae, 'module') else model_vae.encoder # Take care of distributed/parallel training if args.use_philly: save_solid = False while not save_solid: try: model_encoder_to_save.save_pretrained(output_encoder_dir) torch.save(args, os.path.join(output_encoder_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_encoder_dir) save_solid = True except: pass else: model_encoder_to_save.save_pretrained(output_encoder_dir) torch.save(args, os.path.join(output_encoder_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_encoder_dir) # Save decoder model checkpoint output_decoder_dir = os.path.join(args.output_dir, 'checkpoint-decoder-{}'.format(global_step)) if not os.path.exists(output_decoder_dir): os.makedirs(output_decoder_dir) model_decoder_to_save = model_vae.module.decoder if hasattr(model_vae, 'module') else model_vae.decoder # Take care of distributed/parallel training if args.use_philly: save_solid = False while not save_solid: try: model_decoder_to_save.save_pretrained(output_decoder_dir) torch.save(args, os.path.join(output_decoder_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_decoder_dir) save_solid = True except: pass else: model_decoder_to_save.save_pretrained(output_decoder_dir) torch.save(args, os.path.join(output_decoder_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_decoder_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
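# `frange_cycle_zero_linear`, the cyclical KL-annealing schedule used above, is
# not defined in this snippet. A sketch consistent with its call signature: each
# cycle holds beta at `start` for a zero phase (ratio_zero), ramps linearly to
# `stop` over ratio_increase of the cycle, then stays at `stop`:
import numpy as np

def frange_cycle_zero_linear(n_iter, start=0.0, stop=1.0, n_cycle=4,
                             ratio_increase=0.5, ratio_zero=0.3):
    schedule = np.ones(n_iter) * stop
    period = n_iter / n_cycle
    step = (stop - start) / (period * ratio_increase)
    for c in range(n_cycle):
        v, i = start, 0
        while v <= stop and int(i + c * period) < n_iter:
            if i < period * ratio_zero:
                schedule[int(i + c * period)] = start
            else:
                schedule[int(i + c * period)] = v
                v += step
            i += 1
    return schedule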
def train(self): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) # Prepare model config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=3) model = BertForSequenceClassification.from_pretrained( self.model_name_or_path, self.args, config=config) model.to(self.device) data_splitList = DATABDCI.load_data(os.path.join( self.data_dir, 'train.csv'), n_splits=5) for split_index, each_data in enumerate(data_splitList): logger.info(f'Fold {split_index + 1}') train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader( each_data) num_train_optimization_steps = self.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': self.weight_decay }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", self.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 train_dataloader = cycle(train_dataloader) for step in range(num_train_optimization_steps): batch = next(train_dataloader) batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) tr_loss += loss.item() train_loss = round(tr_loss / (nb_tr_steps + 1), 4) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0: scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if self.do_eval and ( step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", self.eval_batch_size) # Run prediction for full data model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_ids = label_ids.to(self.device) with torch.no_grad(): tmp_eval_loss = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) 
inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracyBDCI(inference_logits, gold_labels) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join( self.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( self.output_dir, "pytorch_model_{}.bin".format(split_index)) torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) if self.do_test: del model gc.collect() self.do_train = False data = DATABDCI(debug=False, data_dir='/home/lsy2018/文本匹配/DATA/DATA_BDCI/', data_process_output= '/home/lsy2018/文本匹配/DATA/DATA_BDCI/data_1014/') model = BertForSequenceClassification.from_pretrained( os.path.join(self.output_dir, "pytorch_model.bin"), self.args, config=config) model.to(self.device) for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]: inference_labels = [] gold_labels = [] eval_examples = data.read_examples(os.path.join( self.data_dir, file), is_training=False) print('exa', len(eval_examples)) eval_features = data.convert_examples_to_features( eval_examples, self.tokenizer, self.max_seq_length) all_input_ids = torch.tensor(data.select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(data.select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(data.select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_ids = label_ids.to(self.device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) if flag == 'dev': print(flag, accuracyBDCI(logits, gold_labels)) if flag == 'test': df = pd.read_csv(os.path.join(self.data_dir, file), names=['id', 'content', 'title', 'label']) predict = np.argmax(logits, axis=1).tolist() print(df.shape[0]) print(len(predict)) df['labelpre'] = predict df[['id', 'labelpre' ]].to_csv(os.path.join(self.output_dir, "sub.csv"), index=False, header=False)
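# Note on the `cycle(train_dataloader)` pattern above: itertools.cycle turns the
# finite DataLoader into an endless stream so the loop can run for a fixed number
# of optimization steps rather than whole epochs. Beware that cycle caches the
# first pass, so a shuffling sampler is not re-shuffled on later passes.
from itertools import cycle

batches = cycle([1, 2, 3])                  # stand-in for a DataLoader
stream = [next(batches) for _ in range(7)]  # [1, 2, 3, 1, 2, 3, 1]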
def train(model, tra_data, dev_data, tra_word_vocab, config): optimizer = AdamW(model.parameters(), lr=config.bert_lr, correct_bias=config.correct_bias, weight_decay=config.weight_decay) tra_word_data_iter = create_batch(tra_data, tra_word_vocab, config.batch_size, config, shuffle=False) dev_word_data_iter = create_batch(dev_data, tra_word_vocab, config.dev_batch_size, config, shuffle=False) random_word_iter = data_split(tra_word_data_iter, config.n_fold) tra_word_data_iter, dev_database = database(random_word_iter, config.k, config) # Get start! global_step = 0 best_acc = 0 best_tra_acc = 0 for epoch in range(0, config.epoch): score = 0 print('\nThe epoch is starting.') epoch_start_time = time.time() batch_iter = 0 batch_num = int(len(tra_word_data_iter)) print('The epoch is :', str(epoch)) if config.use_lr_decay: optimizer = decay_learning_rate(config, optimizer, epoch) print("now word_ga lr is {}".format(optimizer.param_groups[0].get("lr")), '\n') for word_batch in tra_word_data_iter: start_time = time.time() model.train() batch_size = tra_word_data_iter[0][0].size(0) / 2 src_premise_matrix, src_hypothesis_matrix, p_mask, h_mask, tag_matrix = word_batch[0], \ word_batch[1], \ word_batch[2], \ word_batch[3], \ word_batch[4] logit_a, logit_b = model(src_premise_matrix, src_hypothesis_matrix, p_mask, h_mask) loss, correct = tri_loss(logit_a, logit_b, config) loss = loss / config.update_every loss.backward() loss_value = loss.item() accuracy = 100.0 * int(correct) / batch_size during_time = float(time.time() - start_time) print('Step:{}, Epoch:{}, batch_iter:{}, accuracy:{:.4f}({}/{}),' 'time:{:.2f}, loss:{:.6f}'.format(global_step, epoch, batch_iter, accuracy, correct, batch_size, during_time, loss_value)) batch_iter += 1 if batch_iter % config.update_every == 0 or batch_iter == batch_num: if config.clip_max_norm_use: nn.utils.clip_grad_norm_(model.parameters(), max_norm=10) optimizer.step() optimizer.zero_grad() global_step += 1 score += correct if batch_iter % config.test_interval == 0 or batch_iter == batch_num: dev_score = evaluate(model, dev_data, dev_word_data_iter, config) if best_acc < dev_score: print('The best dev is' + str(dev_score)) best_acc = dev_score if os.path.exists(config.save_model_path): torch.save(model.state_dict(), config.bert_model_pkl) else: os.makedirs(config.save_model_path) torch.save(model.state_dict(), config.bert_model_pkl) epoch_time = float(time.time() - epoch_start_time) tra_score = 100.0 * score / len(tra_data) if tra_score > best_tra_acc: best_tra_acc = tra_score print('the best_train score is:{}({}/{})'.format(tra_score, score, len(tra_data))) print("epoch_time is:", epoch_time)
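# `decay_learning_rate` above is assumed to rescale the optimizer's learning
# rate once per epoch; a minimal exponential-decay sketch matching that call
# signature (config.lr_rate is a hypothetical decay factor, not from the original):
def decay_learning_rate(config, optimizer, epoch):
    lr = config.bert_lr * (config.lr_rate ** epoch)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer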
def train(args, train_dataset, model, tokenizer): """ Train the model. """ tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError('Please install apex from https://www.github.com/nvidia/apex to use fp16 training.') model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Train! logger.info('***** Running training *****') logger.info(' Num examples = %d', len(train_dataset)) logger.info(' Num Epochs = %d', args.num_train_epochs) logger.info(' Instantaneous batch size per GPU = %d', args.per_gpu_train_batch_size) logger.info(' Total train batch size (w. 
parallel & accumulation) = %d', args.train_batch_size * args.gradient_accumulation_steps) logger.info(' Gradient Accumulation steps = %d', args.gradient_accumulation_steps) logger.info(' Total optimization steps = %d', t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc='Epoch') set_seed(args) # Added here for reproductibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc='Iteration') for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2], 'labels': batch[3], 'hand_features': batch[4]} outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in pytorch_transformers (see doc) if args.n_gpu > 1: loss = loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_los(loss.optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.evaluate_during_training: result = evaluate(args, model, tokenizer) for key, value in result.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss-logging_loss)/args.logging_steps, global_step) logging_loss = tr_loss if args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(output_dir) torch.save(args, 'training_args.bin') logger.info('Saving model checkpoint to %s', output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break tb_writer.close() return global_step, tr_loss / global_step
def train(self): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) data_splitList = DATACQA.load_data(os.path.join( self.data_dir, 'train.csv'), n_splits=5) for split_index, each_data in enumerate(data_splitList): # Prepare model config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=self.num_labels) model = BertForSequenceClassification.from_pretrained( self.model_name_or_path, self.args, config=config) model.to(self.device) logger.info(f'Fold {split_index + 1}') train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader( each_data) num_train_optimization_steps = self.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': self.weight_decay }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", self.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 train_dataloader = cycle(train_dataloader) for step in range(num_train_optimization_steps): batch = next(train_dataloader) batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) tr_loss += loss.item() train_loss = round(tr_loss / (nb_tr_steps + 1), 4) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0: scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if self.do_eval and ( step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] scores = [] questions = [x.text_a for x in eval_examples] logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", self.eval_batch_size) # Run prediction for full data model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_ids = label_ids.to(self.device) with torch.no_grad(): tmp_eval_loss = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() 
inference_labels.append(np.argmax(logits, axis=1)) scores.append(logits) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) scores = np.concatenate(scores, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracyCQA(inference_logits, gold_labels) eval_mrr = compute_MRR_CQA(scores, gold_labels, questions) eval_5R20 = compute_5R20(scores, gold_labels, questions) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'eval_MRR': eval_mrr, 'eval_5R20': eval_5R20, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join( self.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( self.output_dir, "pytorch_model_{}.bin".format(split_index)) torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) del model gc.collect()
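# `compute_MRR_CQA` is referenced above but not defined here. A plausible sketch
# (an assumption about its behavior): group candidates by question, rank them by
# the positive-class score, and average the reciprocal rank of the first
# relevant candidate.
from collections import defaultdict
import numpy as np

def mean_reciprocal_rank(scores, labels, questions):
    per_question = defaultdict(list)
    for score, label, q in zip(scores, labels, questions):
        per_question[q].append((score[1], label))  # score[1]: positive-class logit
    rrs = []
    for candidates in per_question.values():
        ranked = sorted(candidates, key=lambda x: -x[0])
        rr = 0.0
        for rank, (_, label) in enumerate(ranked, start=1):
            if label == 1:
                rr = 1.0 / rank
                break
        rrs.append(rr)
    return float(np.mean(rrs))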
param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] # This variable contains all of the hyperparameter information our training loop needs optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5) # Function to calculate the accuracy of our predictions vs labels def flat_accuracy(preds, labels): pred_flat = np.argmax(preds, axis=1).flatten() labels_flat = labels.flatten() return np.sum(pred_flat == labels_flat) / len(labels_flat) # Store our loss and accuracy for plotting train_loss_set = [] # Number of training epochs (authors recommend between 2 and 4) epochs = 4 # trange is a tqdm wrapper around the normal python range for ep in trange(epochs, desc="Epoch"):
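# Quick usage check for flat_accuracy above (illustrative values): argmax over
# the logits gives [0, 1, 0] against labels [0, 1, 1], i.e. 2/3 correct.
import numpy as np

logits = np.array([[2.0, 0.1], [0.3, 1.5], [0.9, 0.2]])
labels = np.array([0, 1, 1])
flat_accuracy(logits, labels)  # -> 0.666...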
def train(args, train_dataset, model, tokenizer, label_2test_array): """ Train the model """ num_labels = len(label_2test_array) print('num_labels {}'.format(num_labels)) if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) ## track best loss on eval set ?? eval_loss = np.inf last_best = 0 break_early = False set_seed( args) # Added here for reproducibility (even between python 2 and 3) for epoch_counter in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # inputs, labels, attention_mask = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) ## !!! 
WE ARE NOT GOING TO TRAIN MASKED-LM max_len_in_batch = int(torch.max(torch.sum( batch[3], 1))) ## only need max len of AA input_ids_aa = batch[1][:, 0:max_len_in_batch].to(args.device) input_ids_label = batch[2].to(args.device) ## also pass in SEP attention_mask = torch.cat( (batch[3][:, 0:max_len_in_batch], torch.ones(input_ids_label.shape, dtype=torch.long)), dim=1).to(args.device) labels = batch[0].to( args.device) ## already in batch_size x num_label ## must append 0 positions to the front, so that we mask out AA labels_mask = torch.cat( (torch.zeros( input_ids_aa.shape), torch.ones(input_ids_label.shape)), dim=1 ).to( args.device ) ## SEP is at last position on label size ??? should there be one ?? # labels_mask[:,-1] = 0 ## must mask SEP in the label side # labels_mask = labels_mask.to(args.device) ## test all labels ppi_vec = batch[4].unsqueeze(1).expand( labels.shape[0], max_len_in_batch + num_labels, 256).to(args.device) ## make 3D batchsize x 1 x dim if args.aa_type_emb: aa_type = batch[5][:, 0:max_len_in_batch].to(args.device) else: aa_type = None model.train() # call to the @model # def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, # position_ids=None, head_mask=None, attention_mask_label=None): outputs = model(0, input_ids_aa=input_ids_aa, input_ids_label=input_ids_label, token_type_ids=aa_type, attention_mask=attention_mask, labels=labels, position_ids=None, attention_mask_label=labels_mask, prot_vec=ppi_vec ) # if args.mlm else model(inputs, labels=labels) loss = outputs[ 0] # model outputs are always tuple in pytorch-transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # if (epoch_counter>0) and args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # # Save model checkpoint # output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) # if not os.path.exists(output_dir): # os.makedirs(output_dir) # model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training # model_to_save.save_pretrained(output_dir) # torch.save(args, os.path.join(output_dir, 'training_args.bin')) # logger.info("Saving model checkpoint to %s", output_dir) # if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # # Log metrics # if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well # results = evaluate(args, model, tokenizer,label_2test_array) # for key, value in results.items(): # tb_writer.add_scalar('eval_{}'.format(key), value, global_step) # tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) # tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) # logging_loss = tr_loss if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break ## end 1 epoch print('\n\neval end epoch 
{}'.format(epoch_counter)) ## to save some time, let's just save at end of epoch output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr( model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) results = evaluate(args, model, tokenizer, label_2test_array, prefix=str(global_step)) # for key, value in results.items(): # tb_writer.add_scalar('eval_{}'.format(key), value, global_step) # tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) # tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) # logging_loss = tr_loss if results['eval_loss'] < eval_loss: eval_loss = results['eval_loss'] last_best = epoch_counter break_early = False print( '\nupdate lowest loss on epoch {}, {}\nreset break_early to False, see break_early variable {}' .format(epoch_counter, eval_loss, break_early)) else: if epoch_counter - last_best > 5: ## break counter after 5 epoch # break ## break early break_early = True print( 'epoch {} set break_early to True, see break_early variable {}' .format(epoch_counter, break_early)) if break_early: train_iterator.close() print("**** break early ****") break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
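# The epoch-level early stopping above, distilled into a small helper (a sketch,
# not the original code): stop once the eval loss has not improved for
# `patience` consecutive epochs.
class EarlyStopper:
    def __init__(self, patience=5):
        self.patience = patience
        self.best_loss = float('inf')
        self.best_epoch = 0

    def should_stop(self, eval_loss, epoch):
        if eval_loss < self.best_loss:
            self.best_loss = eval_loss
            self.best_epoch = epoch
            return False
        return epoch - self.best_epoch > self.patience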
def train(args, train_dataloader, model_vae, encoder_tokenizer, decoder_tokenizer, table_name): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() n_gpu = torch.cuda.device_count() args.train_batch_size = args.per_gpu_train_batch_size * max(1, n_gpu) # train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) # train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) # model_encoder, model_decoder, model_connector = model_vae.encoder, model_vae.decoder, model_vae.linear no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model_vae.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model_vae.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model_vae, optimizer = amp.initialize(model_vae, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model_vae = torch.nn.DataParallel(model_vae, device_ids=range(args.n_gpu)).to( args.device) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model_vae = torch.nn.parallel.DistributedDataParallel( model_vae, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) files = Path(args.train_data_file) num_files = len(list(files.glob('*seq64*.json'))) # Train! logger.info("***** Running training *****") logger.info(" Num files = %d", num_files) n_gpu = torch.cuda.device_count() logger.info(" Num examples of first file = %d", train_dataloader.num_examples) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Num GPUs = %d", n_gpu) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.per_gpu_train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model_vae.zero_grad() num_train_epochs_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) n_iter_per_file = len(train_dataloader) / n_gpu n_iter = int(args.num_train_epochs * n_iter_per_file * num_files) beta_t_list = frange_cycle_zero_linear(n_iter, start=0.0, stop=args.beta, n_cycle=10, ratio_increase=args.ratio_increase, ratio_zero=args.ratio_zero) beta_t = 0.0 tmp_list = [] # dict_token_length = defaultdict(int) set_seed( args) # Added here for reproducibility (even between python 2 and 3) for epoch in num_train_epochs_iterator: train_dataloader.reset() for idx_file in range(num_files - 1): logger.info(f"Epoch {epoch}, File idx {train_dataloader.file_idx}") epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): tokenized_text0, tokenized_text1, tokenized_text_lengths = batch # dict_token_length[ tokenized_text_lengths[0,0].item() ] += 1 if (tokenized_text0 > len(encoder_tokenizer) - 1).sum().item( ) > 0.0 or (tokenized_text0 < 0).sum().item() > 0.0 or ( tokenized_text1 > len(decoder_tokenizer) - 1).sum().item() > 0.0 or ( tokenized_text1 < 0).sum().item() > 0.0: # pdb.set_trace() logger.info(f"BERT tokens: {tokenized_text0}") logger.info(f"GPT2 tokens: {tokenized_text1}") continue # continue # prepare input-output data for reconstruction inputs, labels = tokenized_text0.to( args.device), tokenized_text1.to(args.device) model_vae.train() if args.use_beta_schedule: try: beta_t = beta_t_list[step + idx_file * n_iter_per_file] except: beta_t = 0.0 model_vae.module.args.beta = beta_t if beta_t == 0.0: model_vae.module.args.fb_mode = 0 else: model_vae.module.args.fb_mode = 1 # save the mini-batch with bugs if not os.path.exists(args.output_dir) and args.local_rank in [ -1, 0 ]: os.makedirs(args.output_dir) torch.save( batch, os.path.join(args.output_dir, f'batch_debug_{step}.pt')) loss_rec, loss_kl, loss = model_vae(inputs, labels) loss_rec = loss_rec.mean( ) # mean() to average on multi-gpu parallel training loss_kl = loss_kl.mean() loss = loss.mean() if args.use_philly: if args.local_rank in [-1, 0]: print("PROGRESS: {}%".format( round( 100 * (step + idx_file * n_iter_per_file) / n_iter, 4))) print("EVALERR: {}%".format(loss_rec)) epoch_iterator.set_description(( f'iter: {step + epoch*len(epoch_iterator) }; file:{idx_file}; loss: {loss.item():.3f}; ' f'loss_rec: {loss_rec.item():.3f}; loss_kl: {loss_kl.item():.3f}; ' f'beta: {model_vae.module.args.beta:.3f}')) # if global_step % 5 == 0: # row = { # 'PartitionKey': 'MILU_Rule_Rule_Template', # 'RowKey': str(datetime.now()), # 'ExpName' : args.ExpName, # 'iter': str( step + epoch*len(epoch_iterator) ), # 'loss': str( loss.item()), # 'loss_rec': str(loss_rec.item()), # 'loss_kl': str(loss_kl.item()), # 'beta': str(model_vae.args.beta) # } # # pdb.set_trace() # ts.insert_entity(table_name, row) # pdb.set_trace() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % 
args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model_vae.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model_vae.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model_vae, encoder_tokenizer, decoder_tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save encoder model checkpoint output_encoder_dir = os.path.join( args.output_dir, 'checkpoint-encoder-{}'.format(global_step)) if not os.path.exists(output_encoder_dir): os.makedirs(output_encoder_dir) model_encoder_to_save = model_vae.module.encoder if hasattr( model_vae, 'module' ) else model_vae.encoder # Take care of distributed/parallel training if args.use_philly: save_solid = False while not save_solid: try: model_encoder_to_save.save_pretrained( output_encoder_dir) torch.save( args, os.path.join(output_encoder_dir, 'training_args.bin')) logger.info( "Saving model checkpoint to %s", output_encoder_dir) save_solid = True except: pass else: model_encoder_to_save.save_pretrained( output_encoder_dir) torch.save( args, os.path.join(output_encoder_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_encoder_dir) # Save decoder model checkpoint output_decoder_dir = os.path.join( args.output_dir, 'checkpoint-decoder-{}'.format(global_step)) if not os.path.exists(output_decoder_dir): os.makedirs(output_decoder_dir) model_decoder_to_save = model_vae.module.decoder if hasattr( model_vae, 'module' ) else model_vae.decoder # Take care of distributed/parallel training if args.use_philly: save_solid = False while not save_solid: try: model_decoder_to_save.save_pretrained( output_decoder_dir) torch.save( args, os.path.join(output_decoder_dir, 'training_args.bin')) logger.info( "Saving model checkpoint to %s", output_decoder_dir) save_solid = True except: pass else: model_decoder_to_save.save_pretrained( output_decoder_dir) torch.save( args, os.path.join(output_decoder_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_decoder_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break # print(dict_token_length) # with open('wikipedia_stats.json', 'w') as fp: # json.dump(dict_token_length, fp) if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
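# --- Illustration (not part of the original script) ----------------------
# frange_cycle_zero_linear is defined elsewhere; this is a sketch consistent
# with how it is called above (cyclical beta annealing with a zero phase and
# a linear ramp inside each cycle, in the style of the Optimus VAE code).
# Treat the exact implementation as an assumption.
import numpy as np

def frange_cycle_zero_linear(n_iter, start=0.0, stop=1.0, n_cycle=4,
                             ratio_increase=0.5, ratio_zero=0.3):
    schedule = np.ones(n_iter) * stop
    period = n_iter / n_cycle
    step = (stop - start) / (period * ratio_increase)  # linear ramp slope
    for c in range(n_cycle):
        v, i = start, 0
        while v <= stop and (int(i + c * period) < n_iter):
            if i < period * ratio_zero:
                schedule[int(i + c * period)] = start  # zero (warm-up) phase
            else:
                schedule[int(i + c * period)] = v      # linear increase phase
                v += step
            i += 1
    return schedule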
def main(): args = parse_arguments() # ====== Set random seed ========= random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) # ======= Prepare ========== logging.basicConfig(level=logging.INFO) USE_CUDA = torch.cuda.is_available() FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor model, tokenizer = load_model(args) # =============== Load & process data ============== split_size = {'train': 0.85, 'test': 0.1, 'val': 0.05} data_loader, val_loader = get_data(args, split_size=split_size, tokenizer=tokenizer) # ========== Prepare optimizer ============= # the gpt2 model from library has unnamed LM head. LM head's weights are tied to input embedding num_train_optimization_steps = len( data_loader) * args.num_train_epochs // args.train_batch_size param_optimizer = list(model.named_parameters()) optimizer_grouped_parameters = construct_grouped_parameters( param_optimizer, args.learning_rate, use_discr=args.use_disc_lr) lm_funcs = get_unfreezing_funcs(optimizer_grouped_parameters, warmup_portion=args.warmup_proportion, total_steps=num_train_optimization_steps, use_unfreezing=args.use_unfreezing) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False) scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lm_funcs) # Training print("Start training.") model.train() exp_average_loss = None progress_bar = trange(int(args.num_train_epochs), desc="Epoch", leave=True) min_eval_loss = 100 # large enough number early_terminate_counter = 0 for _ in progress_bar: # for _ in range(int(args.num_train_epochs)): for sample in tqdm(data_loader): # for sample in data_loader: if args.keyword: x, type_x, pos_x, lm_x, x_len, _, keyword_x = sample else: x, type_x, pos_x, lm_x, x_len, _ = sample keyword_x = None input_len = x_len[0] lm_x[:, x_len[0] + 1 + args.first_K_tokens:-1] = -1 loss = model(x, position_ids=pos_x, token_type_ids=type_x, labels=lm_x, key_word=keyword_x, use_keyword=args.keyword)[0] loss.backward() optimizer.step() scheduler.step() optimizer.zero_grad() exp_average_loss = loss.item( ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item( ) progress_bar.set_description( "Training loss: {}".format(exp_average_loss)) eval_loss = evaluate(model, val_loader, use_keyword=args.keyword) print("Eval loss: {}".format(eval_loss)) # if eval_loss < min_eval_loss: # save the model only when the loss is the smallest if True: early_terminate_counter = 0 min_eval_loss = eval_loss # ==== Save the model ==== # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_dir = '../models/' output_model_file = os.path.join(output_dir + args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(output_dir + args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(output_dir + args.output_dir) else: print("eval loss increasing!") early_terminate_counter += 1 if early_terminate_counter > 5: # if the eval loss does not decrease for 5 epochs, terminate early. return
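# --- Illustration (not part of the original script) ----------------------
# construct_grouped_parameters is imported from elsewhere; this is a purely
# hypothetical sketch of what such a helper might do, combining the usual
# no-decay split with ULMFiT-style discriminative learning rates when
# use_discr is set. The decay_factor and the '.h.<idx>' block naming
# (GPT-2-style) are assumptions.
def construct_grouped_parameters(named_params, lr, use_discr=False,
                                 decay_factor=2.6, n_layers=12):
    no_decay = ['bias', 'ln_1.weight', 'ln_2.weight', 'ln_f.weight']
    groups = []
    for name, param in named_params:
        group = {
            'params': [param],
            'weight_decay': 0.0 if any(nd in name for nd in no_decay) else 0.01,
            'lr': lr,
        }
        if use_discr and '.h.' in name:
            layer = int(name.split('.h.')[1].split('.')[0])
            # lower layers get geometrically smaller learning rates
            group['lr'] = lr / (decay_factor ** (n_layers - 1 - layer))
        groups.append(group)
    return groups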
# Define the optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [p for n, p in param_optimizer
               if not any(nd in n for nd in no_decay)],
    'weight_decay': 0.01
}, {
    'params': [p for n, p in param_optimizer
               if any(nd in n for nd in no_decay)],
    'weight_decay': 0.0
}]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)

model.train()
best_f1 = 0.
valid_best = np.zeros((valid_label.size(0), 2))
early_stop = 0
for epoch in range(num_epochs):
    train_loss = 0.
    for batch in tqdm(train_loader):
        batch = tuple(t.cuda() for t in batch)
        x_ids, x_mask, x_sids, y_truth = batch
        y_pred = model(x_ids, x_mask, x_sids)
        loss = loss_fn(y_pred, y_truth)
        optimizer.zero_grad()
        loss.backward()
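# --- Note (assumption, not original code) ---------------------------------
# The snippet above is cut off after loss.backward(); the conventional
# continuation of such a loop would be:
#     optimizer.step()
#     train_loss += loss.item()
# followed, after each epoch, by an F1 evaluation on valid_loader that
# updates best_f1 / valid_best and increments early_stop when no improvement
# is seen.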
def Train(inputIds, attention_masks, labels, batch_size=24, epochs=10):
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
        inputIds, labels, random_state=2020, test_size=0.2)
    train_masks, validation_masks, _, _ = train_test_split(attention_masks,
                                                           inputIds,
                                                           random_state=2020,
                                                           test_size=0.2)

    # Turn data into torch tensors
    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)
    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)
    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)

    # Create iterators over the datasets
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=batch_size)
    validation_data = TensorDataset(validation_inputs, validation_masks,
                                    validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data,
                                       sampler=validation_sampler,
                                       batch_size=batch_size)

    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased',
                                                           num_labels=2)
    # Load the model into GPU memory
    model.cuda()

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    # The param-group key must be 'weight_decay': the original
    # 'weight_decay_rate' is silently ignored by AdamW, so no decay
    # was actually being applied.
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
    # train_loss_set = []

    # Find GPU or CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    trainLoss = []
    valAcc = []
    for _ in trange(epochs, desc='Epoch'):
        # Train
        model.train()
        trainLoss.append(0)
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            optimizer.zero_grad()
            # Forward pass and loss calculation
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs[0]
            logits = outputs[1]
            # Calculate gradients
            loss.backward()
            # Update weights using gradients
            optimizer.step()
            trainLoss[-1] += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
        print('\nTrain loss: {}'.format(trainLoss[-1] / nb_tr_steps))

        # Validation
        model.eval()
        nb_eval_steps = 0
        valAcc.append(0)
        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # Don't calculate gradients since we are evaluating the model
            with torch.no_grad():
                output = model(b_input_ids,
                               token_type_ids=None,
                               attention_mask=b_input_mask)
                logits = output[0]
            # Move logits and labels off the GPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            valAcc[-1] += tmp_eval_accuracy
            nb_eval_steps += 1
        print('\nValidation Accuracy: {}\n'.format(valAcc[-1] / nb_eval_steps))
    return model, trainLoss, valAcc
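# --- Illustration (not part of the original script) ----------------------
# flat_accuracy is not defined in this snippet; a common definition that is
# consistent with how it is called above (an assumption):
import numpy as np

def flat_accuracy(preds, labels):
    """Fraction of argmax predictions that match the flattened label array."""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)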
from dataloader import bAbi_Dataset
import torch
import torch.nn as nn
from model import model
from pytorch_transformers import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("GPU:" + str(torch.cuda.get_device_name(0)))

my_model = model()
my_model.to(device)
optimizer = AdamW(my_model.parameters())
criterion = nn.NLLLoss()

EPOCHS = 10
for epoch in range(1, EPOCHS + 1):
    my_model.train()
    train_loss = 0
    length = 0
    for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader:
        output = my_model(tokens_tensor.to(device), segments_tensors.to(device),
                          att_mask.to(device), pos_id.to(device))
        loss = criterion(output, trg.to(device))
        optimizer.zero_grad()
        loss.backward()
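# --- Note (assumption, not original code) ---------------------------------
# data_loader is assumed to be constructed from bAbi_Dataset elsewhere. The
# loop above is cut off after loss.backward(); a typical continuation is:
#     optimizer.step()
#     train_loss += loss.item()
#     length += 1
# with train_loss / length reported at the end of each epoch.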
def train(self): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) # logger.info(f'Fold {split_index + 1}') train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader() num_train_optimization_steps = self.train_steps # Prepare model config = BertConfig.from_pretrained(self.model_name_or_path) model = BertForTokenClassification.from_pretrained(self.model_name_or_path,self.args, config=config) model.to(self.device) model.train() # Prepare optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': self.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", self.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 best_MRR = 0 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 train_dataloader = cycle(train_dataloader) for step in range(num_train_optimization_steps): batch = next(train_dataloader) batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, label_domain,label_dependcy = batch loss_domain,loss_dependcy = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, label_domain=label_domain, label_dependcy = label_dependcy ) loss = loss_domain+loss_dependcy tr_loss += loss.item() train_loss = round(tr_loss / (nb_tr_steps + 1), 4) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() scheduler.step() global_step += 1 if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels_domain = [] gold_labels_dependcy = [] inference_logits = [] scores_domain = [] scores_dependcy = [] ID = [x.guid for x in eval_examples] logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", self.eval_batch_size) model.eval() eval_loss_domain,eval_loss_dependcy, eval_accuracy_domain,eval_accuracy_dependcy = 0,0,0,0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids,label_domain,label_dependcy in eval_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_domain = label_domain.to(self.device) label_dependcy = label_dependcy.to(self.device) with torch.no_grad(): batch_eval_loss_domain,batch_eval_loss_dependcy = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, label_domain=label_domain, label_dependcy=label_dependcy ) logits_domain,logits_dependcy = model( 
input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask ) logits_domain = logits_domain.view(-1, self.num_labels_domain).detach().cpu().numpy() logits_dependcy = logits_dependcy.view(-1, self.num_labels_dependcy).detach().cpu().numpy() label_domain = label_domain.view(-1).to('cpu').numpy() label_dependcy = label_dependcy.view(-1).to('cpu').numpy() scores_domain.append(logits_domain) scores_dependcy.append(logits_dependcy) gold_labels_domain.append(label_domain) gold_labels_dependcy.append(label_dependcy) eval_loss_domain += batch_eval_loss_domain.mean().item() eval_loss_dependcy += batch_eval_loss_dependcy.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels_domain = np.concatenate(gold_labels_domain, 0) gold_labels_dependcy = np.concatenate(gold_labels_dependcy, 0) scores_domain = np.concatenate(scores_domain, 0) scores_dependcy = np.concatenate(scores_dependcy, 0) model.train() eval_loss_domain = eval_loss_domain / nb_eval_steps eval_loss_dependcy = eval_loss_dependcy / nb_eval_steps eval_accuracy_domain = accuracyF1(scores_domain, gold_labels_domain,mode='domain') eval_accuracy_dependcy = accuracyF1(scores_dependcy, gold_labels_dependcy ,mode= 'dependcy') print( 'eval_F1_domain',eval_accuracy_domain, 'eval_F1_dependcy', eval_accuracy_dependcy, 'global_step',global_step, 'loss',train_loss ) result = {'eval_loss_domain': eval_loss_domain, 'eval_loss_dependcy':eval_loss_dependcy, 'eval_F1_domain': eval_accuracy_domain, 'eval_F1_dependcy': eval_accuracy_dependcy, 'global_step': global_step, 'loss': train_loss} output_eval_file = os.path.join(self.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy_domain > best_acc : print("=" * 80) print("Best F1", eval_accuracy_domain) print("Saving Model......") # best_acc = eval_accuracy best_acc = eval_accuracy_domain # Save a trained model model_to_save = model.module if hasattr(model,'module') else model output_model_file = os.path.join(self.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80)
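# --- Illustration (not part of the original script) ----------------------
# accuracyF1 is defined elsewhere; a plausible sketch (an assumption) that
# matches its call signature above: macro F1 over argmax predictions. The
# mode argument is kept only for interface compatibility.
import numpy as np
from sklearn.metrics import f1_score

def accuracyF1(scores, gold_labels, mode='domain'):
    preds = np.argmax(scores, axis=1)
    return f1_score(gold_labels, preds, average='macro')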
def main(): parser = argparse.ArgumentParser() parser = add_xlmr_args(parser) parser.add_argument('--self_training', action='store_true', default=False) parser.add_argument('--unlabeled_data_dir', type=str, default='data/unlabeled_data') parser.add_argument('--self_training_confidence', type=float, default=0.9) parser.add_argument('--K', type=float, default=50) parser.add_argument('--patience', type=float, default=10) args = parser.parse_args() if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) data_processor = SequenceLabelingProcessor(task=args.task_name) label_list = data_processor.get_labels() num_labels = len(label_list) + 1 # add one for IGNORE label train_examples = None num_train_optimization_steps = 0 if args.do_train: train_examples = data_processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs # preparing model configs hidden_size = 768 if 'base' in args.pretrained_path else 1024 # TODO: move this inside model.__init__ device = 'cuda' if (torch.cuda.is_available() and not args.no_cuda) else 'cpu' if args.use_crf: model_cls = XLMRForTokenClassificationWithCRF else: model_cls = XLMRForTokenClassification # creating model model = model_cls(pretrained_path=args.pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=args.dropout, device=device) model.to(device) if args.load_model is not None: logging.info("Loading saved model {}".format(args.load_model)) state_dict = torch.load(args.load_model) model.load_state_dict(state_dict, strict=True) no_decay = ['bias', 'final_layer_norm.weight'] params = list(model.named_parameters()) optimizer_grouped_parameters = [{ 'params': [p for n, p in params if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] warmup_steps = int(args.warmup_proportion * num_train_optimization_steps) # freeze model if necessary if args.freeze_model: logger.info("Freezing XLM-R model...") for n, p in model.named_parameters(): if 'xlmr' in n and p.requires_grad: p.requires_grad = False if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) label_map = {i: label for i, label in enumerate(label_list, 1)} if args.do_train: train_features = data_processor.convert_examples_to_features( train_examples, label_list, args.max_seq_length, model.encode_word) if args.self_training: self_training_examples = data_processor.get_unlabeled_examples( args.unlabeled_data_dir) self_training_features = data_processor.convert_examples_to_features( self_training_examples, label_list, args.max_seq_length, model.encode_word) logging.info("Loaded {} Unlabeled examples".format( len(self_training_examples))) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) train_data = create_ner_dataset(train_features) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) val_examples = data_processor.get_dev_examples(args.data_dir) val_features = data_processor.convert_examples_to_features( val_examples, label_list, args.max_seq_length, model.encode_word) val_data = create_ner_dataset(val_features) best_val_f1 = 0.0 ############################# Self Training Loop ###################### n_iter = 0 optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps) patience = 0 while 1: ############################ Inner Training Loop ##################### #if n_iter >= 50: # break # reset lr n_iter += 1 print(len(train_dataloader)) loss_fct = nn.BCELoss() for epoch_ in tqdm(range(args.num_train_epochs), desc="Epoch", disable=args.no_pbar): tr_loss = 0 tbar = tqdm(train_dataloader, desc="Iteration", disable=args.no_pbar) model.train() for step, batch in enumerate(tbar): batch = tuple(t.to(device) for t in batch) input_ids, label_ids, l_mask, valid_ids, = batch loss, _ = model(input_ids, label_ids, l_mask, valid_ids, get_sent_repr=True) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() tbar.set_description('Loss = %.4f' % (tr_loss / (step + 1))) logger.info("Evaluating on validation set...\n") #torch.save(model.state_dict(), open(os.path.join(args.output_dir, 'model.pt'), 'wb')) f1, report = evaluate_model_seq_labeling( model, val_data, label_list, args.eval_batch_size, args.use_crf, device) if f1 > best_val_f1: best_val_f1 = f1 logger.info( "\nFound better f1=%.4f on validation set. Saving model\n" % (f1)) logger.info("\n%s\n" % (report)) torch.save( model.state_dict(), open(os.path.join(args.output_dir, 'model.pt'), 'wb')) patience = 0 else: logger.info("\nNo better F1 score: {}\n".format(f1)) patience += 1 ###################################################################### if not args.self_training: break if patience >= args.patience: logger.info("No more patience. 
Existing") break ## get confidence and update train_data, train_dataloader # convert unlabeled examples to features if len(self_training_features) <= 0: # no more self-training data break confident_features, self_training_features = get_top_confidence_samples_seq_labeling( model, self_training_features, batch_size=args.eval_batch_size, K=args.K) for f in confident_features: l_ids = f.label_id l_s = [label_map[i] for i in l_ids] logging.info("Got %d confident samples" % (len(confident_features))) # append new features #train_features = data_processor.convert_examples_to_features( # train_examples, label_list, args.max_seq_length, model.encode_word) train_features.extend(confident_features) print("now we have %d total examples" % len(train_features)) train_data = create_ner_dataset(train_features) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) for g in optimizer.param_groups: g['lr'] = args.learning_rate scheduler.step(0) #print("Loading best last model...") #model.load_state_dict(torch.load(open(os.path.join(args.output_dir, 'model.pt'), 'rb'))) # load best/ saved model state_dict = torch.load( open(os.path.join(args.output_dir, 'model.pt'), 'rb')) model.load_state_dict(state_dict) logger.info("Loaded saved model") model.to(device) if args.do_eval: if args.eval_on == "dev": eval_examples = data_processor.get_dev_examples(args.data_dir) elif args.eval_on == "test": eval_examples = data_processor.get_test_examples(args.data_dir) else: raise ValueError("eval on dev or test set only") eval_features = data_processor.convert_examples_to_features( eval_examples, label_list, args.max_seq_length, model.encode_word) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) eval_data = create_ner_dataset(eval_features) f1_score, report = evaluate_model_seq_labeling(model, eval_data, label_list, args.eval_batch_size, args.use_crf, device) logger.info("\n%s", report) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") logger.info("dataset = {}".format(args.data_dir)) logger.info("model = {}".format(args.output_dir)) with open(output_eval_file, "w") as writer: logger.info("***** Writing results to file *****") writer.write(report) logger.info("Done.")
train_loader = torch.utils.data.DataLoader(train,
                                           batch_size=batch_size,
                                           shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid,
                                           batch_size=batch_size,
                                           shuffle=False)
test_loader = torch.utils.data.DataLoader(test,
                                          batch_size=batch_size,
                                          shuffle=False)

model = NeuralNet(model_name_or_path)
model.cuda()
loss_fn = torch.nn.CrossEntropyLoss()

# Define the optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)

model.train()
best_f1 = 0.
valid_best = np.zeros((valid_label.size(0), 2))
early_stop = 0
# for epoch in range(num_epochs):
#     train_loss = 0.
#     for batch in tqdm(train_loader):
#         batch = tuple(t.cuda() for t in batch)
#         x_ids, x_mask, x_sids, y_truth = batch
#         y_pred = model(x_ids, x_mask, x_sids)
#         loss = loss_fn(y_pred, y_truth)
#         optimizer.zero_grad()
#         loss.backward()
def train(self, dataloader: DataLoader, train_config: TrainConfig): """ Train the model with the given data and config :param dataloader: the data for the training :param train_config: the configuration for the training """ if train_config.output_path is not None: os.makedirs(train_config.output_path, exist_ok=True) if os.listdir(train_config.output_path): raise ValueError("Output directory ({}) already exists and is not empty.".format( train_config.output_path)) self.save(train_config.output_path, save_config=True, save_model=False) self.best_score = -9999 num_train_steps = int(len(dataloader) / train_config.gradient_accumulation_steps * train_config.epochs) # Prepare optimizer param_optimizer = list(self.model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': train_config.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = num_train_steps if train_config.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() optimizer = AdamW(optimizer_grouped_parameters, lr=train_config.learning_rate, eps=train_config.adam_epsilon, correct_bias=train_config.correct_bias) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_config.warmup_steps, t_total=t_total) if train_config.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(self.model, optimizer, opt_level=train_config.fp16_opt_level) global_step = 0 for epoch in trange(train_config.epochs, desc="Epoch"): training_steps = 0 self.model.train() for step, batch in enumerate(tqdm(dataloader, desc="Iteration")): batch = batch_to_device(batch, self.device) input_ids, segment_ids, input_masks, label_ids = batch loss = self.model(input_ids, segment_ids, input_masks, label_ids) if train_config.gradient_accumulation_steps > 1: loss = loss / train_config.gradient_accumulation_steps if train_config.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), train_config.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), train_config.max_grad_norm) training_steps += 1 if (step + 1) % train_config.gradient_accumulation_steps == 0: scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if train_config.evaluation_steps > 0 and training_steps % train_config.evaluation_steps == 0: self._eval_during_training(train_config, epoch, training_steps) self.model.train() self._eval_during_training(train_config, epoch, -1)
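# --- Illustration (not part of the original script) ----------------------
# batch_to_device is imported from a utility module; a minimal equivalent
# (an assumption) for tuple batches like the one unpacked above:
def batch_to_device(batch, target_device):
    """Move every tensor in a tuple batch onto the target device."""
    return tuple(t.to(target_device) for t in batch)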
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name') parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument('--train_dataset', type=str, default='') parser.add_argument('--eval_dataset', type=str, default='') parser.add_argument('--seed', type=int, default=42) parser.add_argument('--num_train_epochs', type=int, default=3) parser.add_argument('--train_batch_size', type=int, default=8) parser.add_argument('--eval_batch_size', type=int, default=16) parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument('--max_grad_norm', type=int, default=1) parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training \ steps to perform. Override num_train_epochs.") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before\ performing a backward/update pass.") parser.add_argument('--learning_rate', type=float, default=6.25e-5) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument('--lr_schedule', type=str, default='warmup_linear') parser.add_argument('--weight_decay', type=float, default=0.01) parser.add_argument('--lm_coef', type=float, default=0.9) parser.add_argument('--n_valid', type=int, default=374) parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}".format(device, n_gpu)) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Load tokenizer and model # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned on the RocStories dataset special_tokens = ['_start_', '_delimiter_', '_classify_'] tokenizer = OpenAIGPTTokenizer.from_pretrained( args.model_name, special_tokens=special_tokens) special_tokens_ids = list( tokenizer.convert_tokens_to_ids(token) for token in special_tokens) model = OpenAIGPTDoubleHeadsModel.from_pretrained( args.model_name, num_special_tokens=len(special_tokens)) model.to(device) # Load and encode the datasets if not args.train_dataset and not args.eval_dataset: roc_stories = cached_path(ROCSTORIES_URL) def tokenize_and_encode(obj): """ Tokenize and encode a nested object """ if isinstance(obj, str): return 
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj)) elif isinstance(obj, int): return obj return list(tokenize_and_encode(o) for o in obj) logger.info("Encoding dataset...") train_dataset = load_rocstories_dataset(args.train_dataset) eval_dataset = load_rocstories_dataset(args.eval_dataset) datasets = (train_dataset, eval_dataset) encoded_datasets = tokenize_and_encode(datasets) # Compute the max input length for the Transformer max_length = model.config.n_positions // 2 - 2 input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 \ for dataset in encoded_datasets for story, cont1, cont2, _ in dataset) input_length = min(input_length, model.config.n_positions ) # Max size of input for the pre-trained model # Prepare inputs tensors and dataloaders tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids) train_tensor_dataset, eval_tensor_dataset = tensor_datasets[ 0], tensor_datasets[1] train_data = TensorDataset(*train_tensor_dataset) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) eval_data = TensorDataset(*eval_tensor_dataset) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Prepare optimizer if args.do_train: if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps //\ (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader)\ // args.gradient_accumulation_steps * args.num_train_epochs param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.do_train: nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels = batch losses = model(input_ids, mc_token_ids, lm_labels, mc_labels) loss = args.lm_coef * losses[0] + losses[1] loss.backward() scheduler.step() optimizer.step() optimizer.zero_grad() tr_loss += loss.item() exp_average_loss = loss.item( ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item( ) nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format( exp_average_loss, scheduler.get_lr()[0]) # Save a trained model if args.do_train: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a 
trained model and vocabulary that you have fine-tuned model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir) tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir) model.to(device) if args.do_eval: model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels = batch with torch.no_grad(): _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids, lm_labels, mc_labels) mc_logits = mc_logits.detach().cpu().numpy() mc_labels = mc_labels.to('cpu').numpy() tmp_eval_accuracy = accuracy(mc_logits, mc_labels) eval_loss += mc_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples train_loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'train_loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
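# --- Illustration (not part of the original script) ----------------------
# accuracy is a small helper used in the eval loop above; the standard
# definition from the pytorch-transformers ROCStories example returns a
# count of correct predictions (hence the division by nb_eval_examples).
# Reproduced from memory, so treat it as an assumption:
import numpy as np

def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)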