def main():
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_dir", default='out/', type=str, required=True, help="Dir of the trained NER model.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    # Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=8, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # n_gpu = torch.cuda.device_count()
        n_gpu = 1  # force a single GPU so tensors do not end up on different devices
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = RelationProcessor()

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
        print("num_train_optimization_steps", num_train_optimization_steps)

    n_classes = len(relation_list)  # num of relations
    transitional_size = 100  # size of transitional layer
    model = relation_extracter(args.model_dir, transitional_size, n_classes)  # was hard-coded to 'out/'; use --model_dir
    tokenizer = model.tokeniser
    max_seq_length = model.max_seq_length
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in ['bert'])],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                         warmup=args.warmup_proportion, t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list, relation_list,
                                                      max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        all_relation_matrices = torch.tensor([f.relation_matrix for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,
                                   all_relation_matrices)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, relation_matrices = batch
                relation_matrices = relation_matrices.float()
                loss = model(input_ids, segment_ids, input_mask, label_ids, relation_matrices)["loss"]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with the special warmup BERT uses
                        # if args.fp16 is False, BertAdam is used and handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        # model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, 'ner_re.bin')
        torch.save(model.state_dict(), output_model_file)
    else:
        # Load a trained model and config that you have fine-tuned
        output_model_file = os.path.join(args.output_dir, 'ner_re.bin')
        model.load_state_dict(torch.load(output_model_file))
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list, relation_list,
                                                     max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        all_relation_matrices = torch.tensor([f.relation_matrix for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,
                                  all_relation_matrices)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids, input_mask, segment_ids, label_ids, relation_matrices in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            relation_matrices = relation_matrices.float()
            relation_matrices = relation_matrices.to(device)

            with torch.no_grad():
                output_dict = model(input_ids, segment_ids, input_mask, label_ids, relation_matrices)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", model.get_metrics())
            writer.write(str(model.get_metrics()))
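# For reference: the `warmup_linear(x, warmup)` schedule used in the fp16 branch above is imported
# from pytorch_pretrained_bert elsewhere in this repo. Older releases implement it roughly as the
# sketch below (a reference sketch, not the actual import used here): linear warmup over the first
# `warmup` fraction of training, then linear decay as x = global_step / t_total approaches 1.
def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup
    return 1.0 - x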
class BertTrainer:
    def __init__(self, hypers: Hypers, model_name, checkpoint, **extra_model_args):
        """
        Initialize the BertOptimizer, with common logic for setting weight_decay_rate,
        doing gradient accumulation and tracking loss.
        :param hypers: the core hyperparameters for the bert model
        :param model_name: the fully qualified name of the bert model we will train,
            like pytorch_pretrained_bert.modeling.BertForQuestionAnswering
        :param checkpoint: if resuming training, this is the checkpoint that contains
            the optimizer state as checkpoint['optimizer']
        """
        self.init_time = time.time()
        self.model = self.get_model(hypers, model_name, checkpoint, **extra_model_args)
        self.step = 0
        self.hypers = hypers
        self.train_stats = TrainStats(hypers)
        self.model.train()
        logger.info('configured model for training')
        # show parameter names
        # logger.info(str([n for (n, p) in self.model.named_parameters()]))

        # Prepare optimizer
        if hasattr(hypers, 'exclude_pooler') and hypers.exclude_pooler:
            # module.bert.pooler.dense.weight, module.bert.pooler.dense.bias
            # see https://github.com/NVIDIA/apex/issues/131
            self.param_optimizer = [(n, p) for (n, p) in self.model.named_parameters() if '.pooler.' not in n]
        else:
            self.param_optimizer = list(self.model.named_parameters())

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in self.param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        self.t_total = hypers.num_train_steps
        self.global_step = hypers.global_step

        if hypers.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                                  "to use distributed and fp16 training.")
            optimizer = FusedAdam(optimizer_grouped_parameters, lr=hypers.learning_rate,
                                  bias_correction=False, max_grad_norm=1.0)
            if hypers.loss_scale == 0:
                self.optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True,
                                                verbose=(hypers.global_rank == 0))
            else:
                self.optimizer = FP16_Optimizer(optimizer, static_loss_scale=hypers.loss_scale,
                                                verbose=(hypers.global_rank == 0))
        else:
            self.optimizer = BertAdam(optimizer_grouped_parameters, lr=hypers.learning_rate,
                                      warmup=hypers.warmup_proportion, t_total=self.t_total)
        logger.info('created optimizer')

        if checkpoint and type(checkpoint) is dict and 'optimizer' in checkpoint:
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            if hypers.fp16:
                pass
            else:
                # if we load this state, we need to set the t_total to what we passed, not what was saved
                self.optimizer.set_t_total(self.t_total)
                # show state of optimizer
                lrs = self.optimizer.get_lr()
                logger.info('Min and max learn rate: %s', str([min(lrs), max(lrs)]))
                logger.info('Min and max step in state: %s', str(self.optimizer.get_steps()))
            instances_per_step = hypers.train_batch_size * hypers.gradient_accumulation_steps * hypers.world_size
            if 'seen_instances' in checkpoint:
                self.global_step = int(checkpoint['seen_instances'] / instances_per_step)
                self.train_stats.previous_instances = checkpoint['seen_instances']
                logger.info('got global step from checkpoint = %i', self.global_step)
            logger.info('Loaded optimizer state:')
            logger.info(repr(self.optimizer))

    def reset(self):
        """
        reset any gradient accumulation
        :return:
        """
        self.model.zero_grad()
        self.step = 0

    def should_continue(self):
        """
        :return: True if training should continue
        """
        if self.global_step >= self.t_total:
            logger.info('stopping due to train step %i >= target train steps %i', self.global_step, self.t_total)
            return False
        if 0 < self.hypers.time_limit <= (time.time() - self.init_time):
            logger.info('stopping due to time out %i seconds', self.hypers.time_limit)
            return False
        return True

    def save_simple(self, filename):
        if self.hypers.global_rank != 0:
            logger.info('skipping save in %i', torch.distributed.get_rank())
            return
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model  # Only save the model itself
        torch.save(model_to_save.state_dict(), filename)
        logger.info(f'saved model only to {filename}')

    def save(self, filename, **extra_checkpoint_info):
        """
        save a checkpoint with the model parameters, the optimizer state and any additional checkpoint info
        :param filename:
        :param extra_checkpoint_info:
        :return:
        """
        # only local_rank 0, in fact only global rank 0
        if self.hypers.global_rank != 0:
            logger.info('skipping save in %i', torch.distributed.get_rank())
            return
        start_time = time.time()
        checkpoint = extra_checkpoint_info
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model  # Only save the model itself
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        # also save the optimizer state, since we will likely resume from partial pre-training
        checkpoint['state_dict'] = model_to_save.state_dict()
        checkpoint['optimizer'] = self.optimizer.state_dict()
        # include world size in instances_per_step calculation
        instances_per_step = self.hypers.train_batch_size * \
                             self.hypers.gradient_accumulation_steps * \
                             self.hypers.world_size
        checkpoint['seen_instances'] = self.global_step * instances_per_step
        checkpoint['num_instances'] = self.t_total * instances_per_step
        # CONSIDER: also save hypers?
        torch.save(checkpoint, filename)
        logger.info(f'saved model to {filename} in {time.time()-start_time} seconds')

    def get_instance_count(self):
        instances_per_step = self.hypers.train_batch_size * \
                             self.hypers.gradient_accumulation_steps * \
                             self.hypers.world_size
        return self.global_step * instances_per_step

    def step_loss(self, loss):
        """
        accumulates the gradient, tracks the loss and applies the gradient to the model
        :param loss: the loss from evaluating the model
        """
        if self.global_step == 0:
            logger.info('first step_loss')
        if self.hypers.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu.
        self.train_stats.note_loss(loss.item())

        if self.hypers.gradient_accumulation_steps > 1:
            loss = loss / self.hypers.gradient_accumulation_steps

        if self.hypers.fp16:
            self.optimizer.backward(loss)
        else:
            loss.backward()

        if (self.step + 1) % self.hypers.gradient_accumulation_steps == 0:
            lr_this_step = self.hypers.learning_rate * warmup_linear(
                self.global_step / self.t_total, self.hypers.warmup_proportion)
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = lr_this_step
            self.optimizer.step()
            self.model.zero_grad()
            self.global_step += 1
        self.step += 1

    @classmethod
    def get_files(cls, train_file, completed_files):
        logger.info('completed files = %s, count = %i',
                    str(completed_files[:min(5, len(completed_files))]), len(completed_files))
        # multiple train files
        if not os.path.isdir(train_file):
            train_files = [train_file]
        else:
            if not train_file.endswith('/'):
                train_file = train_file + '/'
            train_files = glob.glob(train_file + '**', recursive=True)
            train_files = [f for f in train_files if not os.path.isdir(f)]
        # exclude completed files
        if not set(train_files) == set(completed_files):
            train_files = [f for f in train_files if f not in completed_files]
        else:
            completed_files = []  # new epoch
        logger.info('train files = %s, count = %i',
                    str(train_files[:min(5, len(train_files))]), len(train_files))
        return train_files, completed_files

    @classmethod
    def get_model(cls, hypers, model_name, checkpoint, **extra_model_args):
        override_state_dict = None
        if checkpoint:
            if type(checkpoint) is dict and 'state_dict' in checkpoint:
                logger.info('loading from multi-part checkpoint')
                override_state_dict = checkpoint['state_dict']
            else:
                logger.info('loading from saved model parameters')
                override_state_dict = checkpoint
        # create the model object by name
        # https://stackoverflow.com/questions/4821104/python-dynamic-instantiation-from-string-name-of-a-class-in-dynamically-imported
        import importlib
        clsdot = model_name.rfind('.')
        class_ = getattr(importlib.import_module(model_name[0:clsdot]), model_name[clsdot + 1:])
        model_args = {'state_dict': override_state_dict, 'cache_dir': PYTORCH_PRETRAINED_BERT_CACHE}
        model_args.update(extra_model_args)
        # logger.info(pprint.pformat(extra_model_args, indent=4))
        model = class_.from_pretrained(hypers.bert_model, **model_args)
        logger.info('built model')

        # configure model for fp16, multi-gpu and/or distributed training
        if hypers.fp16:
            model.half()
            logger.info('model halved')
        logger.info('sending model to %s', str(hypers.device))
        model.to(hypers.device)
        logger.info('sent model to %s', str(hypers.device))

        if hypers.local_rank != -1:
            if not hypers.no_apex:
                try:
                    from apex.parallel import DistributedDataParallel as DDP
                    model = DDP(model)
                except ImportError:
                    raise ImportError("Please install apex")
            else:
                model = torch.nn.parallel.DistributedDataParallel(
                    model, device_ids=[hypers.local_rank], output_device=hypers.local_rank)
            logger.info('using DistributedDataParallel for world size %i', hypers.world_size)
        elif hypers.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        return model

    @classmethod
    def get_base_parser(cls):
        parser = argparse.ArgumentParser()
        # Required parameters
        parser.add_argument("--bert_model", default=None, type=str, required=True,
                            help="Bert pre-trained model selected in the list: bert-base-uncased, "
                                 "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
        # Other parameters
        parser.add_argument("--num_instances", default=-1, type=int,
                            help="Total number of training instances to train over.")
        parser.add_argument("--seen_instances", default=-1, type=int,
                            help="When resuming training, the number of instances we have already trained over.")
        parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
        parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
        parser.add_argument("--warmup_proportion", default=0.1, type=float,
                            help="Proportion of training to perform linear learning rate warmup for. "
                                 "E.g., 0.1 = 10% of training.")
        parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available")
        parser.add_argument("--no_apex", default=False, action='store_true', help="Whether not to use apex when available")
        parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
        parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                            help="Number of update steps to accumulate before performing a backward/update pass.")
        parser.add_argument('--optimize_on_cpu', default=False, action='store_true',
                            help="Whether to perform optimization and keep the optimizer averages on CPU")
        parser.add_argument('--fp16', default=False, action='store_true',
                            help="Whether to use 16-bit float precision instead of 32-bit")
        parser.add_argument('--loss_scale', type=float, default=0,
                            help='Loss scaling, positive power of 2 values can improve fp16 convergence. '
                                 'Leave at zero to use dynamic loss scaling')
        return parser
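# Hedged usage sketch for BertTrainer: how a driver script might use the class above.
# `Hypers(...)` construction, `batch_iterator`, and the checkpoint path are assumptions
# (they are defined elsewhere in the repo, not in this file).
def _example_bert_trainer_loop(batch_iterator):
    hypers = Hypers(BertTrainer.get_base_parser().parse_args())  # assumed: Hypers wraps the parsed args
    trainer = BertTrainer(hypers, 'pytorch_pretrained_bert.modeling.BertForPreTraining', checkpoint=None)
    trainer.reset()
    for batch in batch_iterator:            # placeholder data source
        if not trainer.should_continue():   # stops at num_train_steps or at the time limit
            break
        loss = trainer.model(*batch)        # model is assumed to return the loss in training mode
        trainer.step_loss(loss)             # handles accumulation, fp16 backward and the LR schedule
    trainer.save('checkpoints/last.ckpt', completed_files=[])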
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--task", default=None, type=str, required=True,
                        help="Sentiment analysis or natural language inference? (SA or NLI)")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--trained_model_dir", default="", type=str,
                        help="Where is the fine-tuned (with the cloze-style LM objective) BERT model?")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test", action='store_true', help="Whether to run eval on the test set.")
    parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--freeze_bert', action='store_true', help="Whether to freeze BERT")
    parser.add_argument('--full_bert', action='store_true', help="Whether to use full BERT")
    parser.add_argument('--num_train_samples', type=int, default=-1,
                        help="-1 for full train set, otherwise please specify")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval and not args.do_test:
        raise ValueError("At least one of `do_train` or `do_eval` or `do_test` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        logger.info("WARNING: Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Prepare data processors
    mnli_processor = MnliProcessor()
    hans_processor = HansProcessor()
    sst_processor = Sst2Processor()

    if args.task == "SA":
        label_list = sst_processor.get_labels()
    elif args.task == "NLI":
        label_list = mnli_processor.get_labels()
    else:
        raise ValueError("--task must be either SA or NLI.")
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    # Prepare training data
    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        if args.task == "SA":
            train_examples = sst_processor.get_train_examples(args.data_dir, args.num_train_samples)
        elif args.task == "NLI":
            train_examples = mnli_processor.get_train_examples(args.data_dir, args.num_train_samples)
        num_train_optimization_steps = int(len(train_examples) / args.train_batch_size) * args.num_train_epochs

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE,
                                                                   'distributed_{}'.format(-1))
    if args.trained_model_dir:  # load in fine-tuned (with cloze-style LM objective) model
        if os.path.exists(os.path.join(args.output_dir, WEIGHTS_NAME)):
            previous_state_dict = torch.load(os.path.join(args.output_dir, WEIGHTS_NAME))
        else:
            from collections import OrderedDict
            previous_state_dict = OrderedDict()
        distant_state_dict = torch.load(os.path.join(args.trained_model_dir, WEIGHTS_NAME))
        # note that the final layers of the previous model and the distant model must have different attribute names!
        previous_state_dict.update(distant_state_dict)
        model = MyBertForSequenceClassification.from_pretrained(args.trained_model_dir,
                                                                state_dict=previous_state_dict,
                                                                num_labels=num_labels)
    else:
        model = MyBertForSequenceClassification.from_pretrained(args.bert_model, cache_dir=cache_dir,
                                                                num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    if args.freeze_bert:  # freeze BERT if needed
        frozen = ['bert']
    elif args.full_bert:
        frozen = []
    else:
        # *** change here to filter out params we don't want to track ***
        frozen = ['bert.embeddings.',
                  'bert.encoder.layer.0.',
                  'bert.encoder.layer.1.',
                  'bert.encoder.layer.2.',
                  'bert.encoder.layer.3.',
                  'bert.encoder.layer.4.',
                  'bert.encoder.layer.5.',
                  'bert.encoder.layer.6.',
                  'bert.encoder.layer.7.']
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if (not any(fr in n for fr in frozen)) and (not any(nd in n for nd in no_decay))],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if (not any(fr in n for fr in frozen)) and (any(nd in n for nd in no_decay))],
         'weight_decay': 0.0}
    ]
    if args.fp16:
        raise ValueError("Not sure if FP16 precision works yet.")
    else:
        optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                             warmup=args.warmup_proportion, t_total=num_train_optimization_steps)

    if args.do_train:
        global_step = 0
        train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_id = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_id)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        # model.eval()  # train in eval mode to avoid dropout
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            epoch_loss = []
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if args.fp16:
                    # modify learning rate with the special warmup BERT uses
                    # if args.fp16 is False, BertAdam is used and handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                epoch_loss.append(loss.item())
            logger.info("  epoch loss = %f", np.mean(epoch_loss))

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

    if args.do_test:
        if args.task == "SA":
            test_examples = sst_processor.get_dev_examples(args.data_dir)
        elif args.task == "NLI":
            test_examples = mnli_processor.get_dev_examples(args.data_dir)
        test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running final test *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        all_label_id = torch.tensor([f.label_id for f in test_features], dtype=torch.long)
        all_guid = torch.tensor([f.guid for f in test_features], dtype=torch.long)
        test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_id, all_guid)
        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)

        model.eval()
        test_loss, test_accuracy = 0, 0
        nb_test_steps, nb_test_examples = 0, 0
        wrong_list = []
        for input_ids, input_mask, segment_ids, label_ids, guids in tqdm(test_dataloader, desc="Testing"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_test_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_test_correct, tmp_test_total = accuracy(logits, label_ids)
            assert tmp_test_total == 1
            if tmp_test_correct == 0:
                wrong_list.append(guids[0].item())

            test_loss += tmp_test_loss.mean().item()
            test_accuracy += tmp_test_correct
            nb_test_examples += tmp_test_total
            nb_test_steps += 1

        test_loss = test_loss / nb_test_steps
        test_accuracy = test_accuracy / nb_test_examples
        result = {'test_loss': test_loss, 'test_accuracy': test_accuracy}
        output_test_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_file, "w") as writer:
            logger.info("***** Test results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
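# The `accuracy` helper used in the test loop above is not defined in this excerpt. Given that it is
# unpacked as (correct, total) and that the loop asserts total == 1 (i.e. the script is expected to
# run with eval_batch_size=1), a minimal sketch consistent with that usage could be:
def accuracy(logits, label_ids):
    # logits: numpy array [batch, num_labels]; label_ids: numpy array [batch]
    preds = np.argmax(logits, axis=1)
    return int(np.sum(preds == label_ids)), label_ids.shape[0]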
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--flag', type=str, default='', help="Subdirectory name used for storing the fine-tuned BERT.")
    parser.add_argument("--load_own_model", action='store_true', help="load_own_model.")
    args = parser.parse_args()

    # if args.server_ip and args.server_port:
    #     # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
    #     import ptvsd
    #     print("Waiting for debugger attach")
    #     ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
    #     ptvsd.wait_for_attach()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mnli-mm": MnliMismatchedProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
    }

    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()  # [0, 1]
    num_labels = len(label_list)

    eval_examples = processor.get_test_examples_wenpeng('/home/wyin3/Datasets/RTE/test_RTE_1235.txt')
    # inter_task_names = ['SICK', 'SciTail', 'FEVER', 'MNLI', 'GLUE-RTE']
    inter_task_names = ['SciTail', 'GLUE-RTE']
    train_examples_sequence = processor.get_sequence_train_examples_wenpeng(
        '/home/wyin3/Datasets/MNLI-SNLI-SciTail-RTE-SICK/all.6.train.txt', inter_task_names)

    max_test_acc = 0.0
    for name_id, train_examples in enumerate(train_examples_sequence):
        print('starting training ........', inter_task_names[name_id], '.....size:', len(train_examples))

        # Prepare model
        cache_dir = args.cache_dir if args.cache_dir else os.path.join(
            str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
        if name_id > 0:
            output_dir_new = args.output_dir + '/' + args.flag
            model = BertForSequenceClassification.from_pretrained(output_dir_new, num_labels=num_labels)
            tokenizer = BertTokenizer.from_pretrained(output_dir_new, do_lower_case=args.do_lower_case)
            print('\t\t\tload fine-tuned model succeed.........')
        else:
            model = BertForSequenceClassification.from_pretrained(args.bert_model, cache_dir=cache_dir,
                                                                  num_labels=num_labels)
            tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
            print('\t\t\tload pretrained model succeed.........')
        if args.fp16:
            model.half()
        model.to(device)

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        num_train_optimization_steps = None
        if args.do_train:
            num_train_optimization_steps = int(
                len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
            if args.local_rank != -1:
                num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

        optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                             warmup=args.warmup_proportion, t_total=num_train_optimization_steps)

        global_step = 0
        nb_tr_steps = 0
        tr_loss = 0
        if args.do_train:
            train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length,
                                                          tokenizer, output_mode)
            logger.info("***** Running training *****")
            logger.info("  Num examples = %d", len(train_examples))
            logger.info("  Batch size = %d", args.train_batch_size)
            logger.info("  Num steps = %d", num_train_optimization_steps)
            all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
            if output_mode == "classification":
                all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
            elif output_mode == "regression":
                all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
            train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
            else:
                train_sampler = DistributedSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

            iter_co = 0
            for _ in trange(int(args.num_train_epochs), desc="Epoch"):
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                    model.train()
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids = batch

                    # define a new function to compute loss values for both output_modes
                    logits = model(input_ids, segment_ids, input_mask, labels=None)
                    if output_mode == "classification":
                        loss_fct = CrossEntropyLoss()
                        loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                    elif output_mode == "regression":
                        loss_fct = MSELoss()
                        loss = loss_fct(logits.view(-1), label_ids.view(-1))

                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()

                    tr_loss += loss.item()
                    nb_tr_examples += input_ids.size(0)
                    nb_tr_steps += 1
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        if args.fp16:
                            # modify learning rate with the special warmup BERT uses
                            # if args.fp16 is False, BertAdam is used and handles this automatically
                            lr_this_step = args.learning_rate * warmup_linear(
                                global_step / num_train_optimization_steps, args.warmup_proportion)
                            for param_group in optimizer.param_groups:
                                param_group['lr'] = lr_this_step
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1
                    iter_co += 1

                '''
                evaluate after each epoch
                '''
                model.eval()
                eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length,
                                                             tokenizer, output_mode)
                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)
                all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
                all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
                all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
                if output_mode == "classification":
                    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
                elif output_mode == "regression":
                    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
                eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
                # Run prediction for full data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

                eval_loss = 0
                nb_eval_steps = 0
                preds = []
                for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        logits = model(input_ids, segment_ids, input_mask, labels=None)

                    # compute eval loss and the other metrics required by the task
                    if output_mode == "classification":
                        loss_fct = CrossEntropyLoss()
                        tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                    elif output_mode == "regression":
                        loss_fct = MSELoss()
                        tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_steps += 1
                    if len(preds) == 0:
                        preds.append(logits.detach().cpu().numpy())
                    else:
                        preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

                eval_loss = eval_loss / nb_eval_steps
                preds = preds[0]
                if output_mode == "classification":
                    preds = np.argmax(preds, axis=1)
                elif output_mode == "regression":
                    preds = np.squeeze(preds)
                result = compute_metrics(task_name, preds, all_label_ids.numpy())
                loss = tr_loss / nb_tr_steps if args.do_train else None

                test_acc = result.get("acc")
                if test_acc > max_test_acc:
                    max_test_acc = test_acc
                    '''
                    store the model
                    '''
                    store_bert_model(model, tokenizer.vocab, args.output_dir, args.flag)
                print('test acc:', test_acc, ' max_test_acc:', max_test_acc)
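# `store_bert_model` is not shown in this excerpt. A hypothetical sketch, inferred only from how the
# checkpoint is loaded back above (BertForSequenceClassification.from_pretrained /
# BertTokenizer.from_pretrained on `output_dir + '/' + flag`): it must write the weights, config and
# vocab under that subdirectory. File-name constants are the library's WEIGHTS_NAME / CONFIG_NAME.
def store_bert_model(model, vocab, output_dir, flag):
    save_dir = os.path.join(output_dir, flag)
    os.makedirs(save_dir, exist_ok=True)
    model_to_save = model.module if hasattr(model, 'module') else model
    torch.save(model_to_save.state_dict(), os.path.join(save_dir, WEIGHTS_NAME))
    with open(os.path.join(save_dir, CONFIG_NAME), 'w') as f:
        f.write(model_to_save.config.to_json_string())
    with open(os.path.join(save_dir, 'vocab.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(vocab.keys()))  # vocab is assumed to be an OrderedDict token -> id, in id order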
def train(processor, parameters):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO)
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, False, parameters['FP16']))

    TRAIN_BATCH_SIZE_ = parameters['TRAIN_BATCH_SIZE'] // parameters['GRADIENT_ACCUMULATION_STEPS']

    random.seed(parameters['SEED'])
    np.random.seed(parameters['SEED'])
    torch.manual_seed(parameters['SEED'])
    if n_gpu > 0:
        torch.cuda.manual_seed_all(parameters['SEED'])

    if os.path.exists(parameters['OUTPUT_DIR']) and os.listdir(parameters['OUTPUT_DIR']) and parameters['DO_TRAIN']:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(parameters['OUTPUT_DIR']))
    if not os.path.exists(parameters['OUTPUT_DIR']):
        os.makedirs(parameters['OUTPUT_DIR'])

    task_name = parameters['TASK_NAME'].lower()
    output_mode = 'classification'
    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(parameters['BERT_MODEL'], do_lower_case=parameters['DO_LOWER_CASE'])

    train_examples = None
    num_train_optimization_steps = None
    if parameters['DO_TRAIN']:
        train_examples = processor.get_train_examples()
        num_train_optimization_steps = int(
            len(train_examples) / TRAIN_BATCH_SIZE_ / parameters['GRADIENT_ACCUMULATION_STEPS']
        ) * parameters['NUM_TRAIN_EPOCHS']

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(parameters['BERT_MODEL'],
                                                          cache_dir=parameters['CACHE_DIR'],
                                                          num_labels=num_labels)
    model.to(device)
    model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if parameters['DO_TRAIN']:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = BertAdam(optimizer_grouped_parameters, lr=parameters['LEARNING_RATE'],
                             warmup=parameters['WARMUP_PROPORTION'], t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if parameters['DO_TRAIN']:
        train_features = convert_examples_to_features(train_examples, label_list, parameters['MAX_SEQ_LENGTH'],
                                                      tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", TRAIN_BATCH_SIZE_)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if parameters['LOCAL_RANK'] == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE_)

        model.train()
        for _ in trange(int(parameters['NUM_TRAIN_EPOCHS']), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model(input_ids, segment_ids, input_mask, labels=None)
                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if parameters['GRADIENT_ACCUMULATION_STEPS'] > 1:
                    loss = loss / parameters['GRADIENT_ACCUMULATION_STEPS']

                if parameters['FP16']:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % parameters['GRADIENT_ACCUMULATION_STEPS'] == 0:
                    if parameters['FP16']:
                        # modify learning rate with the special warmup BERT uses
                        # if FP16 is False, BertAdam is used and handles this automatically
                        lr_this_step = parameters['LEARNING_RATE'] * warmup_linear.get_lr(
                            global_step, parameters['WARMUP_PROPORTION'])
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(parameters['OUTPUT_DIR'], parameters['WEIGHTS_NAME'])
        output_config_file = os.path.join(parameters['OUTPUT_DIR'], parameters['CONFIG_NAME'])
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(parameters['OUTPUT_DIR'])
        logging.info(tr_loss)
    else:
        # Load a trained model and vocabulary that you have fine-tuned
        logging.info("Getting from pretrained")
        model = BertForSequenceClassification.from_pretrained(parameters['OUTPUT_DIR'], num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(parameters['OUTPUT_DIR'],
                                                  do_lower_case=parameters['DO_LOWER_CASE'])
        model.to(device)

    return model, tokenizer
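# Usage sketch for train(): the function reads its configuration from a plain dict. The keys below are
# exactly the ones the function accesses; the values are illustrative only, not taken from the repo.
def _example_train_call(processor):
    parameters = {
        'TASK_NAME': 'my-task', 'BERT_MODEL': 'bert-base-uncased', 'DO_LOWER_CASE': True,
        'DO_TRAIN': True, 'OUTPUT_DIR': 'out/', 'CACHE_DIR': None,
        'MAX_SEQ_LENGTH': 128, 'TRAIN_BATCH_SIZE': 32, 'GRADIENT_ACCUMULATION_STEPS': 1,
        'NUM_TRAIN_EPOCHS': 3.0, 'LEARNING_RATE': 5e-5, 'WARMUP_PROPORTION': 0.1,
        'SEED': 42, 'FP16': False, 'LOCAL_RANK': -1,
        'WEIGHTS_NAME': 'pytorch_model.bin', 'CONFIG_NAME': 'config.json',
    }
    # processor: a data processor exposing get_labels() and get_train_examples()
    model, tokenizer = train(processor, parameters)
    return model, tokenizer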
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model_src", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--bert_model_mt", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--fc_model", default=None, type=str, required=True,
                        help="Path to the trained fully-connected regression head.")
    parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.")
    parser.add_argument("--output_dir_src", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--output_dir_mt", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--output_dir_fc", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", default=False, action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--log_path', type=str, default="./log",
                        help="The path for saving tensorboard logs. Default is ./log")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir_src) and os.listdir(args.output_dir_src):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir_src))
    os.makedirs(args.output_dir_src, exist_ok=True)
    if os.path.exists(args.output_dir_mt) and os.listdir(args.output_dir_mt):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir_mt))
    os.makedirs(args.output_dir_mt, exist_ok=True)
    if os.path.exists(args.output_dir_fc) and os.listdir(args.output_dir_fc):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir_fc))
    os.makedirs(args.output_dir_fc, exist_ok=True)

    processors = {"qe": MyProcessor}

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()

    model_collections = Collections()

    # Prepare model
    tokenizer_src = BertTokenizer.from_pretrained(args.bert_model_src, do_lower_case=args.do_lower_case)
    tokenizer_mt = BertTokenizer.from_pretrained(args.bert_model_mt, do_lower_case=args.do_lower_case)

    # (an alternative initialization kept from the original, commented out: load both BERTs with
    #  BertModel.from_pretrained and create a fresh xavier-initialized Linear(2 * hidden_size, 1) head)

    # fine-tune an already fine-tuned model
    # Load a trained model and config that you have fine-tuned
    output_config_file_src = os.path.join(args.bert_model_src, CONFIG_NAME)
    config_src = BertConfig(output_config_file_src)
    model_src = BertModel(config_src)
    output_model_file_src = os.path.join(args.bert_model_src, WEIGHTS_NAME)
    model_state_dict_src = torch.load(output_model_file_src)
    model_src.load_state_dict(model_state_dict_src)

    # Load a trained model and config that you have fine-tuned
    output_config_file_mt = os.path.join(args.bert_model_mt, CONFIG_NAME)
    config_mt = BertConfig(output_config_file_mt)
    model_mt = BertModel(config_mt)
    output_model_file_mt = os.path.join(args.bert_model_mt, WEIGHTS_NAME)
    model_state_dict_mt = torch.load(output_model_file_mt)
    model_mt.load_state_dict(model_state_dict_mt)

    model_src.to(device)
    model_mt.to(device)

    full_connect = torch.nn.Linear(2 * config_src.hidden_size, 1)
    model_state_dict_fc = torch.load(args.fc_model)
    full_connect.load_state_dict(model_state_dict_fc)
    full_connect.to(device)
    # ---------------------------------------------

    # dropout
    dropout = torch.nn.Dropout(config_src.hidden_dropout_prob)
    # sigmoid
    sigmoid = torch.nn.Sigmoid()
    # loss
    loss_fct = torch.nn.MSELoss()
    # ----------------------------------------------------------------------------------------------#

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(len(train_examples) / args.train_batch_size /
                              args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare optimizer
    param_optimizer = list(model_src.named_parameters(prefix='src')) + \
                      list(model_mt.named_parameters(prefix='mt')) + \
                      list(full_connect.named_parameters())
    # param_optimizer = list(full_connect.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                         warmup=args.warmup_proportion, t_total=t_total)
    # optimizer.zero_grad()

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, args.max_seq_length,
                                                      tokenizer_src, tokenizer_mt)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids_src = torch.tensor([f.input_ids_src for f in train_features], dtype=torch.long)
        all_input_mask_src = torch.tensor([f.input_mask_src for f in train_features], dtype=torch.long)
        all_segment_ids_src = torch.tensor([f.segment_ids_src for f in train_features], dtype=torch.long)
        all_input_ids_mt = torch.tensor([f.input_ids_mt for f in train_features], dtype=torch.long)
        all_input_mask_mt = torch.tensor([f.input_mask_mt for f in train_features], dtype=torch.long)
        all_segment_ids_mt = torch.tensor([f.segment_ids_mt for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
        train_data = TensorDataset(all_input_ids_src, all_input_mask_src, all_segment_ids_src,
                                   all_input_ids_mt, all_input_mask_mt, all_segment_ids_mt, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        # Timer for computing speed
        timer_for_speed = Timer()
        timer_for_speed.tic()
        summary_writer = SummaryWriter(log_dir=args.log_path)
        is_early_stop = False
        disp_freq = 100
        loss_valid_freq = 100
        early_stop_patience = 10
        bad_count = 0

        nb_tr_examples, nb_tr_steps = 0, 0
        for eidx in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                # optimizer.zero_grad()
                try:
                    model_src.train()
                    model_mt.train()
                    full_connect.train()
                    # model_src.eval()
                    # model_mt.eval()
                    # full_connect.train()
                    batch = tuple(t.to(device) for t in batch)
                    input_ids_src, input_mask_src, segment_ids_src, \
                        input_ids_mt, input_mask_mt, segment_ids_mt, label_ids = batch

                    with torch.enable_grad():
                        _, pooled_output_src = model_src(input_ids_src, segment_ids_src, input_mask_src,
                                                         output_all_encoded_layers=False)
                        pooled_output_src = dropout(pooled_output_src)
                        _, pooled_output_mt = model_mt(input_ids_mt, segment_ids_mt, input_mask_mt,
                                                       output_all_encoded_layers=False)
                        pooled_output_mt = dropout(pooled_output_mt)
                        # pooled_output: [batch_size, 2 * hidden_size]
                        pooled_output = torch.cat((pooled_output_src, pooled_output_mt), 1)
                        logits = sigmoid(full_connect(pooled_output))
                        loss = loss_fct(logits.view(-1), label_ids.view(-1))

                    # (an alternative frozen-encoder variant kept from the original, commented out:
                    #  run both BERTs under torch.no_grad() and train only the regression head
                    #  on the detached, concatenated pooled outputs)

                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()

                    # tr_loss += loss.item()
                    nb_tr_examples += input_ids_src.size(0)
                    nb_tr_steps += 1
                    # optimizer.step()
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1

                    # display some information
                    if (nb_tr_steps % disp_freq == 0):
                        model_collections.add_to_collection("train_losses", loss.item())
                        summary_writer.add_scalar("train_losses", loss.item(), global_step=nb_tr_steps)
                        lrate = args.learning_rate * warmup_linear(nb_tr_steps / t_total, args.warmup_proportion)
                        result = {'train_loss': loss.item(), 'lrate': lrate}
                        logger.info("***** train results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                except RuntimeError as e:
                    if 'out of memory' in str(e):
                        print('| WARNING: ran out of memory, skipping batch')
                        # optimizer.zero_grad()
                        if hasattr(torch.cuda, 'empty_cache'):
                            torch.cuda.empty_cache()
                    else:
                        raise e

                # calculate dev loss
                if (nb_tr_steps % loss_valid_freq == 0):
                    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
                        eval_examples = processor.get_dev_examples(args.data_dir)
                        eval_features = convert_examples_to_features(eval_examples, args.max_seq_length,
                                                                     tokenizer_src, tokenizer_mt)
                        logger.info("***** Running evaluation *****")
                        logger.info("  Num examples = %d", len(eval_examples))
                        logger.info("  Batch size = %d", args.eval_batch_size)
                        all_input_ids_src = torch.tensor([f.input_ids_src for f in eval_features], dtype=torch.long)
                        all_input_mask_src = torch.tensor([f.input_mask_src for f in eval_features], dtype=torch.long)
                        all_segment_ids_src = torch.tensor([f.segment_ids_src for f in eval_features], dtype=torch.long)
                        all_input_ids_mt = torch.tensor([f.input_ids_mt for f in eval_features], dtype=torch.long)
                        all_input_mask_mt = torch.tensor([f.input_mask_mt for f in eval_features], dtype=torch.long)
dtype=torch.long) all_segment_ids_mt = torch.tensor([f.segment_ids_mt for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids_src, all_input_mask_src, all_segment_ids_src, all_input_ids_mt, all_input_mask_mt, all_segment_ids_mt, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model_src.eval() model_mt.eval() full_connect.eval() eval_loss = 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch_eval in eval_dataloader: batch_eval = tuple(t.to(device) for t in batch_eval) input_ids_src, input_mask_src, segment_ids_src, input_ids_mt, \ input_mask_mt, segment_ids_mt, label_ids=batch_eval with torch.no_grad(): _, pooled_output_src = model_src(input_ids_src, segment_ids_src, input_mask_src, output_all_encoded_layers=False) _, pooled_output_mt = model_mt(input_ids_mt, segment_ids_mt, input_mask_mt, output_all_encoded_layers=False) # pooled_output [batch_size,2*hidden_size] pooled_output = torch.cat((pooled_output_src, pooled_output_mt), 1) logits = sigmoid(full_connect(pooled_output.detach())) tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids_src.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps model_collections.add_to_collection("history_losses", eval_loss) min_history_loss = np.array(model_collections.get_collection("history_losses")).min() summary_writer.add_scalar("loss", eval_loss, global_step=nb_tr_steps) summary_writer.add_scalar("best_loss", min_history_loss, global_step=nb_tr_steps) lrate = args.learning_rate * warmup_linear( nb_tr_steps / t_total, args.warmup_proportion) summary_writer.add_scalar("lrate", scalar_value=lrate, global_step=nb_tr_steps) best_eval_loss = min_history_loss # If model get new best valid loss # save model & early stop if eval_loss <= best_eval_loss: bad_count = 0 if is_early_stop is False: # Save a trained model # Only save the model it-self # # Save a trained model and the associated configuration model_to_save_src = model_src.module if hasattr(model_src, 'module') else model_src output_model_file_src = os.path.join(args.output_dir_src, WEIGHTS_NAME) torch.save(model_to_save_src.state_dict(), output_model_file_src) output_config_file_src = os.path.join(args.output_dir_src, CONFIG_NAME) with open(output_config_file_src, 'w') as f: f.write(model_to_save_src.config.to_json_string()) model_to_save_mt = model_mt.module if hasattr(model_mt, 'module') else model_mt output_model_file_mt = os.path.join(args.output_dir_mt, WEIGHTS_NAME) torch.save(model_to_save_mt.state_dict(), output_model_file_mt) output_config_file_mt = os.path.join(args.output_dir_mt, CONFIG_NAME) with open(output_config_file_mt, 'w') as f: f.write(model_to_save_mt.config.to_json_string()) output_model_file_fc = os.path.join(args.output_dir_fc, "fnn.best." + str(nb_tr_steps)) torch.save(full_connect.state_dict(), output_model_file_fc) else: bad_count += 1 # At least one epoch should be traversed if bad_count >= early_stop_patience and eidx > 0: is_early_stop = True logger.info("Early Stop!") summary_writer.add_scalar("bad_count", bad_count, nb_tr_steps) logger.info("{0} Loss: {1:.4f} patience: {2}".format( nb_tr_steps, eval_loss, bad_count)) if is_early_stop == True: break
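# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): the forward pass of
# the dual-encoder QE scorer that the training loop above wires together by
# hand -- two BERT encoders, concatenated pooled outputs, dropout, a linear
# layer to a single score, sigmoid, MSE loss. The name `QEScorer` is
# hypothetical; the real code keeps model_src, model_mt and full_connect as
# separate modules. Shown only to clarify the data flow under those assumptions.
import torch


class QEScorer(torch.nn.Module):
    def __init__(self, model_src, model_mt, hidden_size, dropout_prob):
        super().__init__()
        self.model_src = model_src          # BERT encoder for the source sentence
        self.model_mt = model_mt            # BERT encoder for the MT output
        self.dropout = torch.nn.Dropout(dropout_prob)
        self.full_connect = torch.nn.Linear(2 * hidden_size, 1)
        self.sigmoid = torch.nn.Sigmoid()
        self.loss_fct = torch.nn.MSELoss()

    def forward(self, ids_src, seg_src, mask_src, ids_mt, seg_mt, mask_mt, labels=None):
        # pooled_*: [batch_size, hidden_size] ([CLS] representation of each encoder)
        _, pooled_src = self.model_src(ids_src, seg_src, mask_src,
                                       output_all_encoded_layers=False)
        _, pooled_mt = self.model_mt(ids_mt, seg_mt, mask_mt,
                                     output_all_encoded_layers=False)
        pooled = torch.cat((self.dropout(pooled_src), self.dropout(pooled_mt)), dim=1)
        scores = self.sigmoid(self.full_connect(pooled)).view(-1)  # [batch_size]
        if labels is None:
            return scores
        return self.loss_fct(scores, labels.view(-1))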
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_dir = os.path.join(args.data_dir, 'train') train_examples = read_race_examples( [train_dir + '/high', train_dir + '/middle']) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForMultipleChoice.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_choices=4) if args.fp16: model.half() model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for ep in range(int(args.num_train_epochs)): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("Trianing Epoch: {}/{}".format( ep + 1, int(args.num_train_epochs))) for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if global_step % 100 == 0: logger.info("Training loss: {}, global step: {}".format( tr_loss / nb_tr_steps, global_step)) ## evaluate on dev set if global_step % 1000 == 0: dev_dir = os.path.join(args.data_dir, 'dev') dev_set = [dev_dir + '/high', dev_dir + '/middle'] eval_examples = read_race_examples(dev_set) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: Dev *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'dev_eval_loss': eval_loss, 'dev_eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Dev results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_dir = os.path.join(args.data_dir, 'test') test_high = [test_dir + '/high'] test_middle = [test_dir + '/middle'] ## test high eval_examples = read_race_examples(test_high) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: test high *****") 
logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() high_eval_loss, high_eval_accuracy = 0, 0 high_nb_eval_steps, high_nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) high_eval_loss += tmp_eval_loss.mean().item() high_eval_accuracy += tmp_eval_accuracy high_nb_eval_examples += input_ids.size(0) high_nb_eval_steps += 1 eval_loss = high_eval_loss / high_nb_eval_steps eval_accuracy = high_eval_accuracy / high_nb_eval_examples result = { 'high_eval_loss': eval_loss, 'high_eval_accuracy': eval_accuracy } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ## test middle eval_examples = read_race_examples(test_middle) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: test middle *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() middle_eval_loss, middle_eval_accuracy = 0, 0 middle_nb_eval_steps, middle_nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) middle_eval_loss += tmp_eval_loss.mean().item() middle_eval_accuracy += tmp_eval_accuracy middle_nb_eval_examples += input_ids.size(0) middle_nb_eval_steps += 1 eval_loss = middle_eval_loss / middle_nb_eval_steps eval_accuracy = 
middle_eval_accuracy / middle_nb_eval_examples result = { 'middle_eval_loss': eval_loss, 'middle_eval_accuracy': eval_accuracy } with open(output_eval_file, "a+") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ## all test eval_loss = (middle_eval_loss + high_eval_loss) / ( middle_nb_eval_steps + high_nb_eval_steps) eval_accuracy = (middle_eval_accuracy + high_eval_accuracy) / ( middle_nb_eval_examples + high_nb_eval_examples) result = { 'overall_eval_loss': eval_loss, 'overall_eval_accuracy': eval_accuracy } with open(output_eval_file, "a+") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
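# --------------------------------------------------------------------------
# Illustrative sketches (assumptions, not the original helpers): the RACE
# script above calls `accuracy(out, labels)` and `select_field(features, field)`
# without defining them here. These minimal versions are consistent with how
# they are used (numpy logits vs. integer gold-choice labels, and per-choice
# feature lists), but the actual implementations may differ.
import numpy as np


def accuracy(out, labels):
    # Count how many argmax predictions match the gold choice index;
    # the caller divides the running sum by the number of examples.
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)


def select_field(features, field):
    # Collect one field (e.g. 'input_ids') from every answer choice of every
    # example, giving a [num_examples, num_choices, seq_len] nested list.
    return [[choice[field] for choice in f.choices_features] for f in features]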
class Trainer(object): def __init__(self, args): self.args = args def save_model(self, model, tokenizer, output_dir): if not os.path.exists(output_dir): os.makedirs(output_dir) # Save a trained model, configuration and tokenizer model_to_save = self.model.module if hasattr( self.model, 'module') else self.model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(output_dir, WEIGHTS_NAME) output_config_file = os.path.join(output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) self.tokenizer.save_vocabulary(output_dir) def evaluate(self, print_preds): self.logger.info("***** Running evaluation *****") self.logger.info(" Num examples = %d", len(self.eval_examples)) self.logger.info(" Batch size = %d", self.args.eval_batch_size) self.model.eval() eval_loss = 0 nb_eval_steps = 0 nb_eval_examples = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( self.eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_ids = label_ids.to(self.device) with torch.no_grad(): logits = self.model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if self.output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, self.num_labels), label_ids.view(-1)) elif self.output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 nb_eval_examples += input_ids.size(0) logits = logits.detach().cpu().numpy() if len(preds) == 0: preds.append(logits) else: preds[0] = np.append(preds[0], logits, axis=0) eval_loss = eval_loss / nb_eval_steps eval_loss_examples = eval_loss / nb_eval_examples preds = preds[0] if self.output_mode == "classification": preds = np.argmax(preds, axis=1) elif self.output_mode == "regression": preds = np.squeeze(preds) eval_all_label_ids_numpy = self.eval_all_label_ids.numpy() result = compute_metrics("many_metrics", preds, eval_all_label_ids_numpy) if print_preds: for i, pred in enumerate(preds): print("i = {}\t\tPredicted = {}\t\tActual = {}".format( i, pred, eval_all_label_ids_numpy[i])) result['eval_loss'] = eval_loss result['eval_loss_examples'] = eval_loss_examples return result, preds def run(self): self.logger.info("***** Running *****") self.logger.info(" Num examples = %d", len(self.run_examples)) self.logger.info(" Batch size = %d", self.args.eval_batch_size) self.model.eval() preds = [] for input_ids, input_mask, segment_ids in tqdm(self.run_dataloader, desc="Evaluating"): input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) with torch.no_grad(): logits = self.model(input_ids, segment_ids, input_mask, labels=None) logits = logits.detach().cpu().numpy() if len(preds) == 0: preds.append(logits) else: preds[0] = np.append(preds[0], logits, axis=0) preds = preds[0] if self.output_mode == "classification": preds = np.argmax(preds[:, :self.args.included_labels], axis=1) elif self.output_mode == "regression": preds = np.squeeze(preds) return preds def save_result(self, result, output_eval_dir): output_eval_file = os.path.join(output_eval_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: self.logger.info("***** Eval 
results *****") for key in sorted(result.keys()): self.logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) def preprare_distant_debugging(self): if self.args.server_ip and self.args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(self.args.server_ip, self.args.server_port), redirect_output=True) ptvsd.wait_for_attach() def prepare_device(self): if self.args.local_rank == -1 or self.args.no_cuda: self.device = torch.device("cuda" if torch.cuda.is_available() and not self.args.no_cuda else "cpu") self.n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(self.args.local_rank) self.device = torch.device("cuda", self.args.local_rank) self.n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') def prepare_logging(self): logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if self.args.local_rank in [-1, 0] else logging.WARN) self.logger = logging.getLogger(__name__) self.logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}" .format(self.device, self.n_gpu, bool(self.args.local_rank != -1), self.args.fp16)) def seed(self): random.seed(self.args.seed) np.random.seed(self.args.seed) torch.manual_seed(self.args.seed) if self.n_gpu > 0: torch.cuda.manual_seed_all(self.args.seed) def prepare_model(self): model_dir = self.args.resume_dir if self.args.resume_dir else self.args.bert_model cache_dir = self.args.cache_dir if self.args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( self.args.local_rank)) self.model = BertForSequenceClassification.from_pretrained( model_dir, cache_dir=cache_dir, num_labels=self.num_labels) self.tokenizer = BertTokenizer.from_pretrained( model_dir, do_lower_case=self.args.do_lower_case) if self.args.fp16: self.model.half() print(self.device) self.model.to(self.device) if self.args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) self.model = DDP(self.model) elif self.n_gpu > 1: self.model = torch.nn.DataParallel(self.model) def prepare_optimizer(self): param_optimizer = list(self.model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if self.args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) self.optimizer = FusedAdam(optimizer_grouped_parameters, lr=self.args.learning_rate, bias_correction=False, max_grad_norm=1.0) self.warmup_schedule = WarmupLinearSchedule( warmup=self.args.warmup_proportion, t_total=self.num_train_optimization_steps) if self.args.loss_scale == 0: self.optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: self.optimizer = FP16_Optimizer( optimizer, static_loss_scale=self.args.loss_scale) else: self.optimizer = BertAdam( optimizer_grouped_parameters, lr=self.args.learning_rate, warmup=self.args.warmup_proportion, t_total=self.num_train_optimization_steps, weight_decay=0.01) def prepare_train_examples(self): self.train_examples = self.processor.get_train_examples( self.args.data_dir, self.args.train_file) self.num_train_optimization_steps = int( len(self.train_examples) / self.args.train_batch_size / self.args.gradient_accumulation_steps) * self.args.num_train_epochs if self.args.local_rank != -1: self.num_train_optimization_steps = self.num_train_optimization_steps // torch.distributed.get_world_size( ) weights, augmented_weights = self.processor.get_train_weights() self.label_weights = augmented_weights print("label_weights = {}".format(self.label_weights)) input_length_arr = [] if self.processor.is_pair(): truncate_seq_pair = lambda tokens_a, tokens_b, max_length: self.processor.truncate_seq_pair( tokens_a, tokens_b, max_length) train_features = convert_examples_to_features( self.train_examples, self.label_list, self.args.max_seq_length, self.tokenizer, self.output_mode, self.logger, input_length_arr, truncate_seq_pair=truncate_seq_pair) else: train_features = convert_examples_to_features( self.train_examples, self.label_list, self.args.max_seq_length, self.tokenizer, self.output_mode, self.logger, input_length_arr) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) input_length_arr = np.array(input_length_arr) print("Train input_length_arr: max={}, min={}, avg={}".format( np.max(input_length_arr), np.min(input_length_arr), np.mean(input_length_arr))) if self.output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif self.output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if self.args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) self.train_dataloader = DataLoader( train_data, sampler=train_sampler, batch_size=self.args.train_batch_size) def preprare_eval_examples(self): self.eval_examples = self.processor.get_dev_examples( self.args.data_dir, self.args.dev_file) input_length_arr = [] if self.processor.is_pair(): truncate_seq_pair = lambda tokens_a, tokens_b, max_length: self.processor.truncate_seq_pair( tokens_a, tokens_b, max_length) self.eval_features = convert_examples_to_features( self.eval_examples, self.label_list, self.args.max_seq_length, self.tokenizer, self.output_mode, self.logger, input_length_arr, truncate_seq_pair=truncate_seq_pair) else: self.eval_features = convert_examples_to_features( self.eval_examples, self.label_list, self.args.max_seq_length, self.tokenizer, self.output_mode, self.logger, input_length_arr) all_input_ids = 
torch.tensor([f.input_ids for f in self.eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in self.eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in self.eval_features], dtype=torch.long) input_length_arr = np.array(input_length_arr) print("Eval input_length_arr: max={}, min={}, avg={}".format( np.max(input_length_arr), np.min(input_length_arr), np.mean(input_length_arr))) if self.output_mode == "classification": self.eval_all_label_ids = torch.tensor( [f.label_id for f in self.eval_features], dtype=torch.long) elif self.output_mode == "regression": self.eval_all_label_ids = torch.tensor( [f.label_id for f in self.eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, self.eval_all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) self.eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.args.eval_batch_size) def prepare_run_examples(self): self.run_examples = self.processor.get_dev_examples( self.args.data_dir, self.args.dev_file) input_length_arr = [] if self.processor.is_pair(): truncate_seq_pair = lambda tokens_a, tokens_b, max_length: self.processor.truncate_seq_pair( tokens_a, tokens_b, max_length) self.run_features = convert_examples_to_features( self.run_examples, self.label_list, self.args.max_seq_length, self.tokenizer, self.output_mode, self.logger, input_length_arr, truncate_seq_pair=truncate_seq_pair) else: self.run_features = convert_examples_to_features( self.run_examples, self.label_list, self.args.max_seq_length, self.tokenizer, self.output_mode, self.logger, input_length_arr) all_input_ids = torch.tensor([f.input_ids for f in self.run_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in self.run_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in self.run_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) self.run_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.args.eval_batch_size) def train(self): self.logger.info("***** Running training *****") self.logger.info(" Num examples = %d", len(self.train_examples)) self.logger.info(" Batch size = %d", self.args.train_batch_size) self.logger.info(" Num steps = %d", self.num_train_optimization_steps) if self.output_mode == "classification": print("label_weights = {}".format(self.label_weights)) label_weights = torch.tensor(self.label_weights).float().to( self.device) loss_fct = CrossEntropyLoss(weight=label_weights) elif self.output_mode == "regression": loss_fct = MSELoss() self.model.train() global_step = 0 nb_tr_steps = 0 tr_loss = 0 for epoch in trange(int(self.args.num_train_epochs), desc="Epoch"): self.model.train() if epoch == self.args.resume_epochs - 1: time.sleep(1) tqdm.write("\nEpoch {} previously done\n".format(epoch)) if epoch < self.args.resume_epochs: continue elif epoch == self.args.resume_epochs: tqdm.write("\nResuming Epoch {}.\n".format(epoch)) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 # start_time = time.time() for step, batch in enumerate( tqdm(self.train_dataloader, desc="Iteration")): # print("Reading Batch in {}".format(time.time() - start_time)) if epoch == self.args.resume_epochs and step == self.args.resume_steps - 1: time.sleep(1) tqdm.write( "\nStep {} of epoch {} previously done\n".format( step, 
epoch)) if epoch == self.args.resume_epochs: if step < self.args.resume_steps: continue elif step == self.args.resume_steps: tqdm.write("\nResuming step {} from epoch {}.".format( step, epoch)) # start_time = time.time() batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # print("Preparing Batch in {}".format(time.time() - start_time)) # define a new function to compute loss values for both output_modes # start_time = time.time() logits = self.model(input_ids, segment_ids, input_mask, labels=None) # print("Execute model in {}".format(time.time() - start_time)) # start_time = time.time() if self.output_mode == "classification": loss = loss_fct(logits.view(-1, self.num_labels), label_ids.view(-1)) elif self.output_mode == "regression": loss = loss_fct(logits.view(-1), label_ids.view(-1)) if self.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if self.args.gradient_accumulation_steps > 1: loss = loss / self.args.gradient_accumulation_steps # print("Calculate loss in {}".format(time.time() - start_time)) # start_time = time.time() if self.args.fp16: self.optimizer.backward(loss) else: loss.backward() # print("Backword loss in {}".format(time.time() - start_time)) # start_time = time.time() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % self.args.gradient_accumulation_steps == 0: if self.args.fp16: # modify learning rate with special warm up BERT uses # if self.args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = self.args.learning_rate * warmup_schedule.get_lr( global_step / self.num_train_optimization_steps) # lr_this_step = self.args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, self.args.warmup_proportion) for param_group in self.optimizer.param_groups: param_group['lr'] = lr_this_step self.optimizer.step() self.optimizer.zero_grad() global_step += 1 # print("Optimize in {}".format(time.time() - start_time)) if (step + 1) % self.args.save_model_steps == 0: # Save model # ... step_output_dir = os.path.join( self.args.output_dir, "epoch_{}_step_{}".format(epoch, step)) self.save_model(self.model, self.tokenizer, step_output_dir) # start_time = time.time() # Save model at the end of Epoch # ... epoch_output_dir = os.path.join(self.args.output_dir, "epoch_{}".format(epoch)) self.save_model(self.model, self.tokenizer, epoch_output_dir) # Evaluate Epoch result, _ = self.evaluate(False) result['tr_loss'] = tr_loss / nb_tr_steps result['tr_loss_examples'] = tr_loss / nb_tr_examples self.save_result(result, epoch_output_dir) def execute(self): self.preprare_distant_debugging() self.prepare_device() self.prepare_logging() if self.args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(self.args.gradient_accumulation_steps)) self.args.train_batch_size = self.args.train_batch_size // self.args.gradient_accumulation_steps self.seed() if not self.args.do_train and not self.args.do_eval and not self.args.do_run: raise ValueError( "At least one of `do_train`, `do_eval` or `do_run` must be True." ) if self.args.do_train or self.args.do_eval: if not self.args.output_dir: raise ValueError("You must specify output directory") elif os.path.exists(self.args.output_dir) and os.listdir( self.args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.". 
format(self.args.output_dir)) if not os.path.exists(self.args.output_dir): os.makedirs(self.args.output_dir) self.processor = self.args.processor() self.output_mode = self.args.output_mode self.label_list = self.processor.get_labels() self.num_labels = len(self.label_list) self.prepare_model() if self.args.do_train: self.prepare_train_examples() self.prepare_optimizer() if self.args.do_train or self.args.do_eval: self.preprare_eval_examples() if self.args.do_train: self.train() if self.args.do_eval and (self.args.local_rank == -1 or torch.distributed.get_rank() == 0): result, _ = self.evaluate(False) self.save_result(result, self.args.output_dir) if self.args.do_run: self.prepare_run_examples() preds = self.run() self.processor.save_dev(self.args.data_dir, self.args.dev_file, self.args.result_file, self.run_examples, preds)
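# --------------------------------------------------------------------------
# Illustrative sketch (assumption): the `warmup_linear(x, warmup)` schedule
# that several scripts in this file use to rescale the learning rate manually
# (only needed when BertAdam's built-in warmup is bypassed, e.g. with
# fp16/FusedAdam). It ramps the multiplier linearly from 0 to 1 over the
# warmup fraction and then decays it linearly toward 0 at the end of training;
# the packaged version in pytorch_pretrained_bert may differ in detail.
def warmup_linear(x, warmup=0.002):
    # x: fraction of total optimization steps completed (global_step / t_total)
    if x < warmup:
        return x / warmup
    return max(0.0, 1.0 - x)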
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") parser.add_argument("--model_recover_path", default=None, type=str, help="The file of fine-tuned pretraining model.") parser.add_argument("--optim_recover_path", default=None, type=str, help="The file of pretraining optimizer.") # Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() args.output_dir = args.output_dir.replace( '[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) if args.local_rank == -1 or args.no_cuda: device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int( args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") os.makedirs(args.output_dir, exist_ok=True) json.dump(args.__dict__, open(os.path.join( args.output_dir, 'opt.json'), 'w'), sort_keys=True, indent=2) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) train_examples = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) amp_handle = None if args.fp16: from apex import amp amp_handle = amp.init(enable_caching=True) # Prepare model if (args.model_recover_path is None) or len(args.model_recover_path) == 0: model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) else: if not Path(args.model_recover_path).exists(): logger.info("Path does not exist: {0}".format( args.model_recover_path)) sys.exit(0) logger.info( "***** Recover model: {0} *****".format(args.model_recover_path)) model = BertForSequenceClassification.from_pretrained( args.bert_model, state_dict=torch.load(args.model_recover_path), num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any( nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any( nd in n for nd in no_decay)], 'weight_decay': 0.0} ] # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps) if args.do_train: t_total = int(len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) else: t_total = 1 optimizer = BertAdam(optimizer_grouped_parameters, 
lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.fp16_utils.fp16_optimizer import FP16_Optimizer except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer( optimizer, static_loss_scale=args.loss_scale) logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.task_name == 'sts-b': if args.fp16: lbl_type = torch.half else: lbl_type = torch.float else: lbl_type = torch.long # if all epoch checkpoints exist, skip the whole training process all_exist = True for i_epoch in range(1, int(args.num_train_epochs)+1): output_model_file = os.path.join( args.output_dir, "model.{0}.bin".format(i_epoch)) if not Path(output_model_file).exists(): all_exist = False break global_step = 0 if args.do_train and (not all_exist): train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", t_total) all_input_ids = torch.tensor( [f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor( [f.label_id for f in train_features], dtype=lbl_type) train_data = TensorDataset( all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader( train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for i_epoch in trange(1, int(args.num_train_epochs)+1, desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)') for step, batch in enumerate(iter_bar): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() tr_loss += loss.item() iter_bar.set_description('Iter (loss=%5.3f)' % loss.item()) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "model.{0}.bin".format(i_epoch)) torch.save(model_to_save.state_dict(), output_model_file) # delete unused variables del optimizer #del model del param_optimizer del optimizer_grouped_parameters # Load a trained model that you have fine-tuned if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): seg_result_dict = {} for i_epoch in trange(1, int(args.num_train_epochs)+1, desc="Epoch"): logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() del model output_model_file = os.path.join( args.output_dir, "model.{0}.bin".format(i_epoch)) model_state_dict = torch.load(output_model_file) model = BertForSequenceClassification.from_pretrained( args.bert_model, state_dict=model_state_dict, num_labels=num_labels) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) eval_set_list = [] for eval_segment in processor.get_dev_segments(): eval_examples = processor.get_dev_examples( args.data_dir, segment=eval_segment) eval_set_list.append((eval_segment, eval_examples)) break for eval_segment, eval_examples in eval_set_list: eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation: %s *****", eval_segment) logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor( [f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=lbl_type) eval_data = TensorDataset( all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_result = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 all_logits, all_label_ids = [], [] for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model( input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) if amp_handle: amp_handle._clear_cache() logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() all_logits.append(logits) all_label_ids.append(label_ids) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps # compute evaluation metric all_logits = np.concatenate(all_logits, axis=0) all_label_ids = np.concatenate(all_label_ids, axis=0) metric_func = processor.get_metric_func() eval_result = 
metric_func(all_logits, all_label_ids) result = {'eval_loss': eval_loss, 'eval_result': eval_result, 'model': output_model_file, 'model_recover_path': args.model_recover_path, 'task_name': args.task_name, 'epoch': i_epoch, 'eval_segment': eval_segment} if eval_segment not in seg_result_dict: seg_result_dict[eval_segment] = [] seg_result_dict[eval_segment].append(result) # logging the results logger.info( "***** Eval results ({0}: {1}) *****".format(eval_segment, i_epoch)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) # dump predictions with open(os.path.join(args.output_dir, "{0}.{1}.pred".format(eval_segment, i_epoch)), "w") as f_out: for pred_it in processor.get_pred(all_logits): f_out.write(str(pred_it)) f_out.write('\n') for eval_segment, result_list in seg_result_dict.items(): with open(os.path.join(args.output_dir, eval_segment+".txt"), "w") as f_out: f_out.write(json.dumps(result_list, indent=2, sort_keys=True)) f_out.write('\n')
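# --------------------------------------------------------------------------
# Illustrative sketch (not from the original file): the gradient-accumulation
# pattern every training loop above follows. The loss is divided by the number
# of accumulation steps, gradients are summed over several mini-batches, and
# the optimizer only steps (and zeroes gradients) every
# `gradient_accumulation_steps` batches, so the effective batch size is
# train_batch_size * gradient_accumulation_steps. Names here are generic.
def train_with_accumulation(model, optimizer, dataloader, device,
                            gradient_accumulation_steps=1):
    model.train()
    for step, batch in enumerate(dataloader):
        batch = tuple(t.to(device) for t in batch)
        loss = model(*batch)                       # model returns a scalar loss
        loss = loss / gradient_accumulation_steps  # keep gradient scale comparable
        loss.backward()                            # gradients accumulate in .grad
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()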
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument( '--log_path', type=str, default="./log", help="The path for saving tensorboard logs. Default is ./log") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) processors = { "qe": MyProcessor, } task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() model_collections = Collections() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) model.to(device) # fine-tuning from a previously fine-tuned model (optional, commented out) # output_config_file = os.path.join(args.bert_model, CONFIG_NAME) # config = BertConfig(output_config_file) # model = BertForSequenceClassification(config) # # output_model_file = os.path.join(args.bert_model, WEIGHTS_NAME) # model_state_dict = torch.load(output_model_file) # model.load_state_dict(model_state_dict) # model.to(device) #----------------------------- if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) # ignores_names=['classifier.weight','classifier.bias'] # # base_params = [p for n, p in model.named_parameters() if not any(nd in n for nd in ignores_names)] # ignores_params=[p for n, p in model.named_parameters() if any(nd in n for nd in ignores_names)] # # optimizer = torch.optim.Adam([{'params': base_params}, # {'params': ignores_params, 'lr': args.learning_rate * 10}], # lr=args.learning_rate) global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) # Timer for computing speed timer_for_speed = Timer() timer_for_speed.tic() summary_writer = SummaryWriter(log_dir=args.log_path) is_early_stop = False disp_freq = 100 loss_valid_freq = 100 early_stop_patience = 10 bad_count = 0 nb_tr_examples, nb_tr_steps = 0, 0 for eidx in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): model.train() try: batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.enable_grad(): loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 # model_collections.add_to_collection("train_losses", loss.item()) # summary_writer.add_scalar("train_losses", loss.item(), global_step=nb_tr_steps) # display some information if (nb_tr_steps % disp_freq == 0): lrate = list(optimizer.get_lr())[0] result = {'train_loss': loss.item(), "lrate": lrate} logger.info("***** train results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) except RuntimeError as e: if 'out of memory' in str(e): print('| WARNING: ran out of memory, skipping batch') # optimizer.zero_grad() if hasattr(torch.cuda, 'empty_cache'): torch.cuda.empty_cache() else: raise e # calculate dev loss if (nb_tr_steps % loss_valid_freq == 0): if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples( args.data_dir) eval_features = convert_examples_to_features( eval_examples, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor( [f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch_eval in eval_dataloader: batch_eval = tuple( t.to(device) for t in batch_eval) input_ids, input_mask, segment_ids, label_ids = batch_eval with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps model_collections.add_to_collection( "history_losses", eval_loss) min_history_loss = np.array( model_collections.get_collection( "history_losses")).min() summary_writer.add_scalar("loss", eval_loss, global_step=nb_tr_steps) summary_writer.add_scalar("best_loss", min_history_loss, global_step=nb_tr_steps) lrate = list(optimizer.get_lr())[0] summary_writer.add_scalar("lrate", scalar_value=lrate, global_step=nb_tr_steps) best_eval_loss = min_history_loss # If the model gets a new best valid loss # save model & early stop if eval_loss <= best_eval_loss: bad_count = 0 if is_early_stop is False: # Save a trained model # Only save the model itself model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join( args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write( model_to_save.config.to_json_string()) else: bad_count += 1 # At least one epoch should be traversed if bad_count >= early_stop_patience and eidx > 0:
is_early_stop = True logger.info("Early Stop!") summary_writer.add_scalar("bad_count", bad_count, nb_tr_steps) logger.info("{0} Loss: {1:.4f} patience: {2}".format( nb_tr_steps, eval_loss, bad_count)) if is_early_stop: break
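# --- Illustrative helper (not part of the original script) ---
# A minimal sketch of the patience-based early-stopping bookkeeping used in the
# training loop above, factored into a standalone class for clarity. The class and
# method names (EarlyStopping, step) are hypothetical; the script itself tracks the
# same state inline via model_collections, bad_count and early_stop_patience.
class EarlyStopping(object):
    def __init__(self, patience=10):
        self.patience = patience
        self.best_loss = float("inf")
        self.bad_count = 0

    def step(self, eval_loss):
        """Record one validation loss; return True when training should stop."""
        if eval_loss <= self.best_loss:
            self.best_loss = eval_loss
            self.bad_count = 0          # new best: reset the patience counter
        else:
            self.bad_count += 1         # no improvement on this validation round
        return self.bad_count >= self.patience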
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--model_dir", default=None, type=str, required=True, help="The model directory where the ner model pretrained.") # Other parameters parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") # n_gpu = torch.cuda.device_count() n_gpu = 1 else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) processor = PosProcessor() label_list = processor.get_labels() num_labels = len(label_list) + 1 train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = ner2pos(args.model_dir) if args.fp16: model.half() model.to(device) model.ner_module.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # param_optimizer = list(model.named_parameters()) param_optimizer = list(model.named_parameters()) optim_params = ['classifier.bias', 'classifier.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if any(nd in n for nd in optim_params)], 'weight_decay': 0.0 }] # optimizer = optim.SGD(optimizer_grouped_parameters['params'], lr=args.learning_rate) optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if n_gpu > 1: max_length = model.module.ner_module.model_config["max_seq_length"] tokenizer = model.module.ner_module.tokenizer else: max_length = model.ner_module.model_config["max_seq_length"] tokenizer = model.ner_module.tokenizer if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, max_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): 
tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, 'ner_pos.bin') torch.save(model_to_save.state_dict(), output_model_file) # output_config_file = os.path.join(args.output_dir, CONFIG_NAME) # with open(output_config_file, 'w') as f: # f.write(model_to_save.config.to_json_string()) # label_map = {i: label for i, label in enumerate(label_list, 1)} # model_config = { # "bert_model": args.bert_model, # "do_lower": args.do_lower_case, # "max_seq_length": args.max_seq_length, # "num_labels": len(label_list) + 1, # "label_map": label_map # } # json.dump( # model_config, # open(os.path.join(args.output_dir, "model_config.json"), "w")) # Load a trained model and config that you have fine-tuned else: # output_config_file = os.path.join(args.output_dir, CONFIG_NAME) output_model_file = os.path.join(args.output_dir, 'ner_pos.bin') # config = BertConfig(output_config_file) # model = BertForTokenClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, max_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true = [] y_pred = [] label_map = {i: label for i, label in enumerate(label_list, 1)} for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) 
label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask) logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() input_mask = input_mask.to('cpu').numpy() for i, mask in enumerate(input_mask): temp_1 = [] temp_2 = [] for j, m in enumerate(mask): if j == 0: continue if m: if label_map[label_ids[i][j]] != "X": temp_1.append(label_map[label_ids[i][j]]) temp_2.append(label_map[logits[i][j]]) else: temp_1.pop() temp_2.pop() break y_true.append(temp_1) y_pred.append(temp_2) report = classification_report(y_true, y_pred, digits=4) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") logger.info("\n%s", report) writer.write(report)
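# --- Illustrative helper (not part of the original script) ---
# A minimal sketch of the label-alignment step from the evaluation loop above,
# factored into a function. It mirrors the inline logic: position 0 ([CLS]) is
# skipped and collection stops at the first "X" label, which this label scheme
# appears to use as the end-of-sequence marker; the final pop() drops the entry
# collected just before it, and assumes at least one real label precedes the "X".
def align_labels(label_row, logit_row, mask_row, label_map):
    """Return (gold, pred) label lists for one example, aligned as in the loop above."""
    gold, pred = [], []
    for j, m in enumerate(mask_row):
        if j == 0:
            continue                      # skip the [CLS] position
        if not m:
            continue                      # skip padding positions
        if label_map[label_row[j]] != "X":
            gold.append(label_map[label_row[j]])
            pred.append(label_map[logit_row[j]])
        else:
            gold.pop()
            pred.pop()
            break
    return gold, pred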
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=32, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--tag_space", default=128, type=int, help="dimension of linear transformation.") parser.add_argument("--rnn_hidden_size", default=None, type=int, help="dimension of document level rnn layer.") parser.add_argument( "--dropout", default=0.1, type=float, help="dropout for outputs other than the original bert model") parser.add_argument("--use_crf", action='store_true', help="Whether to use crf layer.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_resume", action='store_true', help="Whether to run eval on the resumed pretrained model.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=16, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--adv_reg_coeff', default='0.0', type=float, help='Regularization coefficient of adversarial loss') parser.add_argument( '--va_reg_coeff', default='0.0', type=float, help='Regularization coefficient of virtual adversarial loss') parser.add_argument('--adv_perturb_norm_length', default='8.0', type=float, help='Norm length of adversarial perturbation to be') parser.add_argument( '--va_perturb_norm_length', default='4.0', type=float, help='Norm length of virtual adversarial perturbation to be') args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = {"pico": PICOProcessor, "nicta": NICTAProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() label_map = {label: i for i, label in enumerate(label_list)} num_labels = len(label_map) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs num_train_optimization_steps_epoch = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( 
str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForSequentialClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels, tag_space=args.tag_space, use_crf=args.use_crf, rnn_hidden_size=args.rnn_hidden_size, dropout=args.dropout) # print(count_parameters(model)) # if args.fp16: # model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] # if args.fp16: # try: # from apex.optimizers import FP16_Optimizer # from apex.optimizers import FusedAdam # except ImportError: # raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") # # optimizer = FusedAdam(optimizer_grouped_parameters, # lr=args.learning_rate, # bias_correction=False, # max_grad_norm=1.0) # if args.loss_scale == 0: # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) # else: # optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) # # else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(minibatches(train_examples, label_map, tokenizer, args.train_batch_size, args.max_seq_length, shuffle=True), desc="Iteration", total=num_train_optimization_steps_epoch)): input_ids = torch.tensor(batch.input_ids, dtype=torch.long).to(device) segment_ids = torch.tensor(batch.segment_ids, dtype=torch.long).to(device) input_mask = torch.tensor(batch.input_mask, dtype=torch.long).to(device) label_ids = torch.tensor(batch.label_ids, dtype=torch.long).to(device) document_mask = torch.tensor(batch.document_mask, dtype=torch.float).to(device) loss, logits, embeddings = model(input_ids, segment_ids, input_mask, document_mask, label_ids) if args.adv_reg_coeff: adv_loss = adversarial_loss( embeddings, segment_ids, input_mask, document_mask, label_ids, loss, model, args.adv_perturb_norm_length)[0] loss += args.adv_reg_coeff * adv_loss if args.va_reg_coeff: va_loss = virtual_adversarial_loss( logits, embeddings, segment_ids, input_mask, document_mask, num_labels, model, args.va_perturb_norm_length) loss += args.va_reg_coeff * va_loss # if n_gpu > 1: # loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # if args.fp16: # # modify learning rate with special warm up BERT uses # # if args.fp16 is False, BertAdam is used that handles this automatically # lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, # args.warmup_proportion) # for param_group in optimizer.param_groups: # param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForSequentialClassification( config, num_labels=num_labels, tag_space=args.tag_space, use_crf=args.use_crf, rnn_hidden_size=args.rnn_hidden_size, dropout=args.dropout) model.load_state_dict(torch.load(output_model_file)) elif args.do_resume: output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForSequentialClassification( config, num_labels=num_labels, tag_space=args.tag_space, use_crf=args.use_crf, rnn_hidden_size=args.rnn_hidden_size, dropout=args.dropout) model.load_state_dict(torch.load(output_model_file)) else: model = BertForSequentialClassification.from_pretrained( args.bert_model, num_labels=num_labels, tag_space=args.tag_space, use_crf=args.use_crf, rnn_hidden_size=args.rnn_hidden_size, dropout=args.dropout) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_test_examples( args.data_dir) ## eval on dev/test sets logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) model.eval() preds_all = [] labels_all = [] for step, batch in enumerate( tqdm(minibatches(eval_examples, label_map, tokenizer, args.eval_batch_size, args.max_seq_length, shuffle=False), desc="Evaluating")): input_ids = torch.tensor(batch.input_ids, dtype=torch.long).to(device) segment_ids = torch.tensor(batch.segment_ids, dtype=torch.long).to(device) input_mask = torch.tensor(batch.input_mask, dtype=torch.long).to(device) document_mask = torch.tensor(batch.document_mask, dtype=torch.float).to(device) with torch.no_grad(): preds, _, _ = model(input_ids, segment_ids, input_mask, document_mask) preds = preds.cpu().tolist() document_lens = np.sum(batch.document_mask, axis=1) for pred, label, document_len in zip(preds, batch.label_ids, document_lens): preds_all += pred[:document_len] labels_all += label[:document_len] eval_acc, eval_prec, eval_recall, eval_f1 = accuracy( preds_all, labels_all) print(confusion_matrix(labels_all, preds_all)) eval_sents = [ sent for example in eval_examples for sent in example.document ] with 
open(os.path.join(args.output_dir, 'eval_text'), 'w') as ofile: for sent, pred, label in zip(eval_sents, preds_all, labels_all): ofile.write('{}\t{}\t{}\n'.format(label_list[label], label_list[pred], sent)) loss = tr_loss / nb_tr_steps if args.do_train else None result = {'global_step': global_step, 'loss': loss} for tag in processor.get_labels(): result.update({ tag: { "precision": eval_prec[label_map[tag]], "recall": eval_recall[label_map[tag]], "f1": eval_f1[label_map[tag]] } }) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") fold_num = args.output_dir.split('/')[-1] params_log = ', '.join(['{}: {}'.format(attr, getattr(args, attr)) for attr in dir(args) \ if not callable(getattr(args, attr)) and not attr.startswith("__")]) with open(output_eval_file, "a") as writer: logger.info("***** Eval results for Fold {} *****".format(fold_num)) writer.write( "\n***** Eval results for Fold {} *****\n".format(fold_num)) writer.write(params_log + '\n') for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
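# --- Illustrative helper (not part of the original scripts) ---
# A minimal sketch of the linear warm-up schedule that --warmup_proportion configures.
# BertAdam applies this schedule internally, and the fp16 branches in these scripts
# rescale the learning rate with a warmup_linear helper; older pytorch-pretrained-bert
# releases defined it roughly as below (the exact implementation in the version used
# here is an assumption).
def warmup_linear(x, warmup=0.002):
    """x is the fraction of total optimization steps completed so far."""
    if x < warmup:
        return x / warmup      # ramp the multiplier up linearly from 0 to 1
    return 1.0 - x             # then decay it linearly back toward 0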