def model_train(bert_model, max_seq_length, do_lower_case, num_train_epochs, train_batch_size,
                gradient_accumulation_steps, learning_rate, weight_decay, loss_scale, warmup_proportion,
                processor, device, n_gpu, fp16, cache_dir, local_rank, dry_run, no_cuda, output_dir=None):
    label_map = processor.get_labels()
    if gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
                         .format(gradient_accumulation_steps))
    train_batch_size = train_batch_size // gradient_accumulation_steps
    train_dataloader = processor.get_train_examples(train_batch_size, local_rank)  # Batch sampler divides by batch_size!
    num_train_optimization_steps = int(
        len(train_dataloader) * num_train_epochs / gradient_accumulation_steps)
    if local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = cache_dir if cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(local_rank))
    model = BertForTokenClassification.from_pretrained(
        bert_model, cache_dir=cache_dir, num_labels=len(label_map))
    if fp16:
        model.half()
    model.to(device)
    if local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: no weight decay for biases and LayerNorm parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=learning_rate, bias_correction=False, max_grad_norm=1.0)
        if loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
        warmup_linear = WarmupLinearSchedule(warmup=warmup_proportion,
                                             t_total=num_train_optimization_steps)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate,
                             warmup=warmup_proportion, t_total=num_train_optimization_steps)
        warmup_linear = None

    global_step = 0
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataloader))
    logger.info(" Batch size = %d", train_batch_size)
    logger.info(" Num steps = %d", num_train_optimization_steps)
    logger.info(" Num epochs = %d", num_train_epochs)

    model_config = {"bert_model": bert_model, "do_lower": do_lower_case,
                    "max_seq_length": max_seq_length, "label_map": label_map}

    def save_model(lh):
        # `ep` is taken from the enclosing training loop below.
        if output_dir is None:
            return
        output_model_file = os.path.join(output_dir, "pytorch_model_ep{}.bin".format(ep))
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
        json.dump(model_config, open(os.path.join(output_dir, "model_config.json"), "w"))
        lh = pd.DataFrame(lh, columns=['global_step', 'loss'])
        loss_history_file = os.path.join(output_dir, "loss_ep{}.pkl".format(ep))
        lh.to_pickle(loss_history_file)

    def load_model(epoch):
        if output_dir is None:
            return False
        output_model_file = os.path.join(output_dir, "pytorch_model_ep{}.bin".format(epoch))
        if not os.path.exists(output_model_file):
            return False
        logger.info("Loading epoch {} from disk...".format(epoch))
        model.load_state_dict(torch.load(output_model_file,
                                         map_location=lambda storage, loc: storage if no_cuda else None))
        return True

    model.train()
    for ep in trange(1, int(num_train_epochs) + 1, desc="Epoch"):
        if dry_run and ep > 1:
            logger.info("Dry run. Stop.")
            break
        if load_model(ep):
            global_step += len(train_dataloader) // gradient_accumulation_steps
            continue
        loss_history = list()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader), desc=f"Epoch {ep}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                if fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                loss_history.append((global_step, loss.item()))
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                if dry_run and len(loss_history) > 2:
                    logger.info("Dry run. Stop.")
                    break
                if (step + 1) % gradient_accumulation_steps == 0:
                    if fp16:
                        # modify learning rate with the special warmup BERT uses;
                        # when fp16 is False, BertAdam handles this automatically
                        lr_this_step = learning_rate * warmup_linear.get_lr(global_step, warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
        save_model(loss_history)
    return model, model_config
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--train_size", default=99999999999, type=int, help="The size of the training dataset.") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, } output_modes = { "cola": "classification", "mnli": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: if (task_name == "sst-2"): train_examples = processor.get_train_examples( args.data_dir, args.train_size) else: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) warmup_linear_schedule = WarmupLinearSchedule( args.warmup_proportion, num_train_optimization_steps) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() print("tr_loss = {}".format(tr_loss)) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear_schedule.get_lr( global_step) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 
1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # hack for MNLI-MM if task_name == "mnli": task_name = "mnli-mm" processor = processors[task_name]() if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.". format(args.output_dir)) if not os.path.exists(args.output_dir + '-MM'): os.makedirs(args.output_dir + '-MM') eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
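# For reference: the fp16 branch in the training loop above rescales the learning
# rate by hand because FusedAdam/FP16_Optimizer do not apply BERT's warmup schedule,
# whereas BertAdam does. A minimal sketch of a linear warmup-then-decay factor; this
# is an assumption that matches the usual pytorch-pretrained-bert behaviour, not a
# verbatim copy of WarmupLinearSchedule:
def _linear_warmup_factor(step, total_steps, warmup_proportion):
    progress = step / max(1, total_steps)
    if progress < warmup_proportion:
        return progress / max(warmup_proportion, 1e-8)          # linear ramp-up
    return max(0.0, (1.0 - progress) / max(1.0 - warmup_proportion, 1e-8))  # linear decay to 0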
def pretrain_on_domain(args):
    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"
    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"{BERT_PRETRAINED_MODEL}_epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"{BERT_PRETRAINED_MODEL}_epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]
    num_train_optimization_steps = int(
        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = BertForMLMPreTraining.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                              bias_correction=False, max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=num_train_optimization_steps)

    loss_dict = defaultdict(list)
    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f" Num examples = {total_train_examples}")
    logging.info(" Batch size = %d", args.train_batch_size)
    logging.info(" Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data,
                                            tokenizer=tokenizer, num_data_epochs=num_data_epochs,
                                            reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler,
                                      batch_size=args.train_batch_size, num_workers=NUM_CPU)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, lm_label_ids = batch
                outputs = model(input_ids=input_ids, attention_mask=input_mask,
                                masked_lm_labels=lm_label_ids)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1
                loss_dict["epoch"].append(epoch)
                loss_dict["batch_id"].append(step)
                loss_dict["mlm_loss"].append(loss.item())
        # Save a trained model after each epoch
        if epoch < num_data_epochs and (n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <= 1):
            logging.info("** ** * Saving fine-tuned model ** ** * ")
            epoch_output_dir = args.output_dir / f"epoch_{epoch}"
            epoch_output_dir.mkdir(parents=True, exist_ok=True)
            model.save_pretrained(epoch_output_dir)
            tokenizer.save_pretrained(epoch_output_dir)

    # Save the final trained model
    if n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <= 1:
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
    df = pd.DataFrame.from_dict(loss_dict)
    df.to_csv(args.output_dir / "losses.csv")
                                           shuffle=False, sampler=train_sampler)
val_loader = torch.utils.data.DataLoader(dataset, batch_size=1024,
                                         shuffle=False, sampler=validation_sampler)

stackedgru = StackedGRU()
stackedgru = stackedgru.to(device)
#stackedgru = torch.nn.DataParallel(stackedgru,
#    device_ids=['cuda:2', 'cuda:3', 'cuda:4', 'cuda:5'])

if enable_amp:
    #amp.register_float_function(torch, 'sigmoid')
    from apex.optimizers import FusedAdam
    optimizer = FusedAdam(stackedgru.parameters())
    stackedgru, optimizer = amp.initialize(stackedgru, optimizer, opt_level='O2')
else:
    optimizer = torch.optim.Adam(stackedgru.parameters())

model = Model(stackedgru, optimizer, model_path='./phaselink_model')

print("Begin training process.")
model.train(train_loader, val_loader, n_epochs, enable_amp=enable_amp)

# And select and assign best model:
find_best_model(model_path="phaselink_model")
print("Finished.")
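# The snippet above only calls amp.initialize(); the scaled backward pass happens
# inside Model.train(), which is not shown here. For context, a minimal sketch of
# the apex.amp step such a training loop would typically use (an assumption, not
# the project's actual code):
def _amp_step(model, optimizer, loss, enable_amp):
    if enable_amp:
        from apex import amp
        with amp.scale_loss(loss, optimizer) as scaled_loss:  # fp16 loss scaling
            scaled_loss.backward()
    else:
        loss.backward()
    optimizer.step()
    optimizer.zero_grad()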
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument( "--bert_model", type=str, default="bert-base-uncased", help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." ) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning( f"Output directory ({args.output_dir}) already exists and is not empty!" ) args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForMaskedLM.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) global_step = 0 logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) model.train() for epoch in range(args.epochs): epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids = batch outputs = model(input_ids, segment_ids, input_mask, lm_label_ids) loss = outputs[0] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule optimizer.zero_grad() global_step += 1 # Save a trained model if n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <= 1: logging.info("** ** * Saving fine-tuned model ** ** * ") model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir)
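# Sanity-check sketch (not in the original script): after the in-place division,
# args.train_batch_size is the micro-batch fed to each forward pass, and
# num_train_optimization_steps counts optimizer updates rather than forward passes.
# The arithmetic the script performs is roughly:
def _planned_updates(total_examples, train_batch_size, accumulation_steps):
    per_forward = train_batch_size // accumulation_steps   # micro-batch per forward pass
    forwards = total_examples // per_forward               # forward/backward passes
    return forwards // accumulation_steps                  # optimizer.step() calls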
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument("--bert_model", type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Store training data as on-disc memmaps to massively reduce memory usage")
    parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_data.py!"
    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}_train.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_train_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # initialize random seeds so that training results can be reproduced (Jiajun Bao)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]
    num_train_optimization_steps = int(
        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                              bias_correction=False, max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)

    args.n_gpu = n_gpu
    args.device = device
    args.num_data_epochs = num_data_epochs
    args.num_train_optimization_steps = num_train_optimization_steps
    args.total_train_examples = total_train_examples
    train(args, model, optimizer, scheduler)
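# train() is defined elsewhere in the repository and is not shown in this excerpt.
# A compressed, purely illustrative sketch of what such a loop typically does with
# the objects handed over above (the real function may differ substantially):
def _train_sketch(args, model, optimizer, scheduler, train_dataloader):
    model.train()
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(args.device) for t in batch)
        loss = model(*batch)[0]                             # assumes the model returns loss first
        loss = loss / args.gradient_accumulation_steps
        loss.backward()
        if (step + 1) % args.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()                                # update learning rate schedule
            optimizer.zero_grad()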
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .csv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--init_ckpt", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_dir = os.path.join(args.data_dir, 'train') train_examples = read_race_examples([train_dir+'/high', train_dir+'/middle']) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForMultipleChoice.from_pretrained(args.init_ckpt, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_choices=4) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = 
BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for ep in range(int(args.num_train_epochs)): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("Trianing Epoch: {}/{}".format(ep+1, int(args.num_train_epochs))) for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if global_step%100 == 0: logger.info("Training loss: {}, global step: {}".format(tr_loss/nb_tr_steps, global_step)) ## evaluate on dev set if global_step % 1000 == 0: dev_dir = os.path.join(args.data_dir, 'dev') dev_set = [dev_dir+'/high', dev_dir+'/middle'] eval_examples = read_race_examples(dev_set) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: Dev *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples 
= 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = {'dev_eval_loss': eval_loss, 'dev_eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss/nb_tr_steps} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Dev results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) ## Load a trained model that you have fine-tuned ## use this part if you want to load the trained model # model_state_dict = torch.load(output_model_file) # model = BertForMultipleChoice.from_pretrained(args.bert_model, # state_dict=model_state_dict, # num_choices=4) # model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_dir = os.path.join(args.data_dir, 'test') test_high = [test_dir + '/high'] test_middle = [test_dir + '/middle'] ## test high eval_examples = read_race_examples(test_high) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: test high *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() high_eval_loss, high_eval_accuracy = 0, 0 high_nb_eval_steps, high_nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) high_eval_loss += tmp_eval_loss.mean().item() high_eval_accuracy += tmp_eval_accuracy high_nb_eval_examples += input_ids.size(0) high_nb_eval_steps += 1 eval_loss = high_eval_loss / high_nb_eval_steps eval_accuracy = high_eval_accuracy / high_nb_eval_examples result = {'high_eval_loss': eval_loss, 
'high_eval_accuracy': eval_accuracy} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ## test middle eval_examples = read_race_examples(test_middle) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: test middle *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() middle_eval_loss, middle_eval_accuracy = 0, 0 middle_nb_eval_steps, middle_nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) middle_eval_loss += tmp_eval_loss.mean().item() middle_eval_accuracy += tmp_eval_accuracy middle_nb_eval_examples += input_ids.size(0) middle_nb_eval_steps += 1 eval_loss = middle_eval_loss / middle_nb_eval_steps eval_accuracy = middle_eval_accuracy / middle_nb_eval_examples result = {'middle_eval_loss': eval_loss, 'middle_eval_accuracy': eval_accuracy} with open(output_eval_file, "a+") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ## all test eval_loss = (middle_eval_loss + high_eval_loss) / (middle_nb_eval_steps + high_nb_eval_steps) eval_accuracy = (middle_eval_accuracy + high_eval_accuracy) / (middle_nb_eval_examples + high_nb_eval_examples) result = {'overall_eval_loss': eval_loss, 'overall_eval_accuracy': eval_accuracy} with open(output_eval_file, "a+") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
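# The fine-tuning loop above leans on a few helpers that are not defined in
# this excerpt: select_field, accuracy, and warmup_linear. The following is a
# minimal sketch of what they usually look like in the pytorch-pretrained-bert
# example scripts; the exact definitions used by this code may differ, and the
# choices_features attribute is an assumption about the feature objects.

import numpy as np


def select_field(features, field):
    # Gather one field (e.g. 'input_ids') for every answer option of every
    # multiple-choice example: shape [num_examples, num_choices, seq_len].
    return [[choice[field] for choice in feature.choices_features]
            for feature in features]


def accuracy(out, labels):
    # Number of correct argmax predictions in a batch; the caller divides by
    # the number of evaluated examples.
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)


def warmup_linear(x, warmup=0.002):
    # Linear warmup followed by linear decay; x is the fraction of training
    # completed (global_step / t_total).
    if x < warmup:
        return x / warmup
    return 1.0 - x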
def set_model(self, inference=False, with_head=False, from_path=None, output_attention=False):
    self.verbose('Initializing Mockingjay model.')

    # Build the Mockingjay model with speech prediction head
    self.model_config = MockingjayConfig(self.config)
    self.dr = self.model_config.downsample_rate
    self.hidden_size = self.model_config.hidden_size
    self.output_attention = output_attention

    if not inference or with_head:
        self.model = MockingjayForMaskedAcousticModel(
            self.model_config, self.input_dim, self.output_dim,
            self.output_attention).to(self.device)
        self.verbose('Number of parameters: ' + str(
            sum(p.numel() for p in self.model.parameters() if p.requires_grad)))
        self.mockingjay = self.model.Mockingjay

    if inference and not with_head:
        self.mockingjay = MockingjayModel(
            self.model_config, self.input_dim,
            self.output_attention).to(self.device)
        self.verbose('Number of parameters: ' + str(
            sum(p.numel() for p in self.mockingjay.parameters() if p.requires_grad)))
        self.mockingjay.eval()
    elif inference and with_head:
        self.model.eval()
    elif not inference:
        self.model.train()

        # Setup optimizer
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        num_train_optimization_steps = self.total_steps // self.gradient_accumulation_steps

        if self.apex:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=self.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if self.config['optimizer']['loss_scale'] == 0:
                self.optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                self.optimizer = FP16_Optimizer(
                    optimizer,
                    static_loss_scale=self.config['optimizer']['loss_scale'])
            self.warmup_linear = WarmupLinearSchedule(
                warmup=self.warmup_proportion,
                t_total=num_train_optimization_steps)
        else:
            self.optimizer = BertAdam(optimizer_grouped_parameters,
                                      lr=self.learning_rate,
                                      warmup=self.warmup_proportion,
                                      t_total=num_train_optimization_steps)
    else:
        raise NotImplementedError('Invalid Arguments!')

    if self.load:  # This will be set to True by default when Tester is running set_model()
        self.load_model(inference=inference, with_head=with_head, from_path=from_path)
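# The bias/LayerNorm weight-decay split above is the same pattern every
# optimizer setup in these scripts uses. Below is a small, self-contained
# sketch of that grouping on a toy module (not the Mockingjay model), just to
# make the matching rule explicit; AdamW stands in for BertAdam/FusedAdam here.

import torch


def group_parameters(model, weight_decay=0.01):
    # Parameters whose names contain any of these substrings are excluded
    # from weight decay, matching the convention used throughout this file.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    named = list(model.named_parameters())
    return [
        {'params': [p for n, p in named if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in named if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]


class ToyBlock(torch.nn.Module):
    # Mirrors BERT's naming: a dense layer plus an attribute called LayerNorm.
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(8, 8)
        self.LayerNorm = torch.nn.LayerNorm(8)


groups = group_parameters(ToyBlock())
print([len(g['params']) for g in groups])   # [1, 3]: only dense.weight is decayed
optimizer = torch.optim.AdamW(groups, lr=1e-4)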
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir.") parser.add_argument("--train_file", default=None, type=str) parser.add_argument("--dev_file", default=None, type=str) parser.add_argument("--test_file", default=None, type=str) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() processors = { "ukp_aspects": UKPAspectsProcessor, "misra": MisraProcessor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) with open(os.path.join(args.output_dir, 'config.txt'), 'w') as fOut: fOut.write(str(args)) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = 1 tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_examples(args.data_dir, args.train_file, args.dev_file, args.test_file, 'train') num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = SigmoidBERT.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: with open(os.path.join(args.output_dir, "train_sentences.csv"), "w") as writer: for idx, example in enumerate(train_examples): writer.write("%s\t%s\t%s\n" % (example.label, example.text_a, example.text_b)) train_features = convert_examples_to_features(train_examples, args.max_seq_length, tokenizer) logger.info("\n\n***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for epoch in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 #Dev set if args.dev_file is not None: eval_set = 'dev' eval_results_filename = "%s_results_epoch_%d.txt" % (eval_set, epoch + 1) eval_prediction_filename = "%s_predictions_epoch_%d.tsv" % ( eval_set, epoch + 1) do_evaluation(processor, args, tokenizer, model, device, global_step, eval_set, eval_results_filename, eval_prediction_filename) # Test set if args.test_file is not None: eval_set = 'test' eval_results_filename = "%s_results_epoch_%d.txt" % (eval_set, epoch + 1) eval_prediction_filename = "%s_predictions_epoch_%d.tsv" % ( eval_set, epoch + 1) do_evaluation(processor, args, tokenizer, model, device, global_step, eval_set, eval_results_filename, eval_prediction_filename) # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file)
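# The training loop above divides the loss by gradient_accumulation_steps and
# only calls optimizer.step() every gradient_accumulation_steps mini-batches,
# so the accumulated gradient approximates a single step on a batch that is
# gradient_accumulation_steps times larger. A minimal, self-contained sketch
# of that pattern with a dummy model (illustration only, not the SigmoidBERT
# model used here):

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accumulation_steps = 4

optimizer.zero_grad()
for step in range(16):
    x, y = torch.randn(8, 4), torch.randn(8, 1)
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss = loss / accumulation_steps      # average over the virtual batch
    loss.backward()                       # gradients accumulate in .grad
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()                  # one update per virtual batch
        optimizer.zero_grad()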
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--ernie_model", default=None, type=str, required=True, help="Ernie pre-trained model") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.0, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--threshold', type=float, default=.3) parser.add_argument("--vec_file", default=None, type=str, required=True, help="File with embeddings") parser.add_argument("--qid_file", default=None, type=str, required=True, help="File with qid mapping") parser.add_argument("--use_lim_ents", default=None, type=str, required=True, help="Whether to use limited entities") args = parser.parse_args() processors = TacredProcessor num_labels_task = 80 if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) processor = processors() label_list = None tokenizer = BertTokenizer.from_pretrained(args.ernie_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None train_examples, label_list = processor.get_train_examples(args.data_dir) num_labels = len(label_list) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model, _ = BertForSequenceClassification.from_pretrained( args.ernie_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_labels=num_labels) # if args.fp16: # model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_grad = [ 'bert.encoder.layer.11.output.dense_ent', 'bert.encoder.layer.11.output.LayerNorm_ent' ] param_optimizer = [(n, p) for n, p in param_optimizer if not any(nd in n for nd in no_grad)] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex import amp from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=True) if args.loss_scale == 0: model, optimizer = amp.initialize(model, optimizer, opt_level="O2") # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, args.threshold, args.qid_file) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) # check for limited ents lim_ents = [] lim_check = (args.use_lim_ents == "y") if lim_check: lim_ents = lim_ent_map(0, "kg_embeddings/dbp_eid_2_wd_eid.txt") logger.info( "Limited entities flag is on. Count of unique entities considered: " + str(len(lim_ents))) vecs = [] vecs.append([0] * 100) # CLS lineindex = 1 uid_map = {} logger.info("Reading embeddings file.") with open(args.vec_file, 'r') as fin: for line in fin: vec = line.strip().split('\t') # first element is unique id uniqid = int(vec[0]) # map line index to unique id uid_map[uniqid] = lineindex # increment line index lineindex = lineindex + 1 if (lim_check and (uniqid in lim_ents)) or not lim_check: vec = [float(x) for x in vec[1:101]] else: vec = vecs[0] vecs.append(vec) embed = torch.FloatTensor(vecs) embed = torch.nn.Embedding.from_pretrained(embed) #embed = torch.nn.Embedding(5041175, 100) logger.info("Shape of entity embedding: " + str(embed.weight.size())) del vecs # zeros = [0 for _ in range(args.max_seq_length)] # zeros_ent = [0 for _ in range(100)] # zeros_ent = [zeros_ent for _ in range(args.max_seq_length)] all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_ent = torch.tensor([f.input_ent for f in train_features], dtype=torch.long) all_ent_masks = torch.tensor([f.ent_mask for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_ent, all_ent_masks, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) output_loss_file = os.path.join(args.output_dir, "loss") loss_fout = open(output_loss_file, 'w') model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple( t.to(device) if i != 3 else t for i, t in enumerate(batch)) input_ids, input_mask, segment_ids, input_ent, ent_mask, label_ids = batch input_ent = embed(input_ent + 1).to(device) # -1 -> 0 loss = model(input_ids, segment_ids, input_mask, input_ent.half(), ent_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() loss_fout.write("{}\n".format(loss.item())) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 model_to_save = model.module if hasattr(model, 'module') else model output_model_file = os.path.join( args.output_dir, "pytorch_model.bin_{}".format(global_step)) torch.save(model_to_save.state_dict(), output_model_file) # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file)
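# In the loop above, input_ent uses -1 for "no entity" and the embedding
# matrix built from the vec file reserves row 0 as an all-zero vector, so
# embed(input_ent + 1) sends padded positions to the zero row and real entity
# indices to their own rows (the "# -1 -> 0" comment). A tiny illustration of
# that index shift with toy vectors (not the real KG embeddings):

import torch

vecs = [[0.0, 0.0, 0.0],      # row 0: reserved zero vector for "no entity"
        [0.1, 0.2, 0.3],      # row 1: first entity read from the vec file
        [0.4, 0.5, 0.6]]      # row 2: second entity
embed = torch.nn.Embedding.from_pretrained(torch.FloatTensor(vecs))

input_ent = torch.tensor([[-1, 0, 1]])   # -1 = padding, 0/1 = entity indices
ent_vectors = embed(input_ent + 1)       # +1 shift: -1 lands on the zero row
print(ent_vectors.shape)                 # torch.Size([1, 3, 3])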
def set_model(self):
    print('[Runner] - Initializing Transformer model...')

    # Build the Transformer model with speech prediction head
    model_config = TransformerConfig(self.config)
    self.dr = model_config.downsample_rate
    self.hidden_size = model_config.hidden_size

    self.model = TransformerForMaskedAcousticModel(
        model_config, self.input_dim, self.output_dim).to(self.device)
    self.model.train()

    if self.args.multi_gpu:
        self.model = torch.nn.DataParallel(self.model)
        print('[Runner] - Multi-GPU training Enabled: ' + str(torch.cuda.device_count()))
    print('[Runner] - Number of parameters: ' + str(
        sum(p.numel() for p in self.model.parameters() if p.requires_grad)))

    # Setup optimizer
    param_optimizer = list(self.model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    if self.apex:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=self.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if self.config['optimizer']['loss_scale'] == 0:
            self.optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            self.optimizer = FP16_Optimizer(
                optimizer, static_loss_scale=self.config['optimizer']['loss_scale'])
        self.warmup_linear = WarmupLinearSchedule(
            warmup=self.warmup_proportion, t_total=self.total_steps)
    else:
        self.optimizer = BertAdam(optimizer_grouped_parameters,
                                  lr=self.learning_rate,
                                  warmup=self.warmup_proportion,
                                  t_total=self.total_steps)
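# In the apex branch above, warmup is not applied inside the optimizer (as
# BertAdam does in the else branch), which is why self.warmup_linear is kept
# around: the learning rate has to be rescaled by hand at each update, and the
# backward pass goes through self.optimizer.backward(loss) so FP16_Optimizer
# can apply loss scaling. A minimal sketch of that per-update rescaling, using
# the same linear warmup/decay rule as the fine-tuning scripts in this file
# (hypothetical helper name; the real runner's training step may differ):

def fp16_warmup_step(optimizer, learning_rate, warmup_proportion, global_step, total_steps):
    # Linear warmup to the peak learning rate, then linear decay to zero.
    progress = global_step / total_steps
    if progress < warmup_proportion:
        multiplier = progress / warmup_proportion
    else:
        multiplier = 1.0 - progress
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate * multiplier
    optimizer.step()
    optimizer.zero_grad()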
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default='/home/adzuser/user_achyuta/BERT_NER_Test/BERT-NER/NERdata/', type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default='NER', type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default='ner_output', type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", action='store_true', help="Whether to run test on the test set.") parser.add_argument("--do_pred", action='store_true', help="Whether to run pred on the pred set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument( "--num_train_epochs", default=4.0, #3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--clip', type=float, default=0.5, help="gradient clipping") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--text_a', type=str, default='', help="input text_a.") parser.add_argument('--text_b', type=str, default='', help="input text_b.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = {"ner": NerProcessor} num_labels_task = { "ner": 17 #6#12 } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_pred: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) #train_examples = train_examples[:1000] print("train_examples :: ", len(list(train_examples))) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format( args.local_rank)) #imodel = BertForSequenceClassification.from_pretrained(args.bert_model, # cache_dir=cache_dir, # num_labels = num_labels) model = BertForTokenClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() #model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 
training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) #all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) #all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) #all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) #all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_input_ids = [f.input_ids for f in train_features] all_input_mask = [f.input_mask for f in train_features] all_segment_ids = [f.segment_ids for f in train_features] all_label_ids = [f.label_id for f in train_features] # convert to cuda #all_input_ids = all_input_ids.to(device) #all_input_mask = all_input_mask.to(device) #all_segment_ids = all_segment_ids.to(device) #all_label_ids = all_label_ids.to(device) #train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) #if args.local_rank == -1: # train_sampler = RandomSampler(train_data) #else: # train_sampler = DistributedSampler(train_data) #train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) # create model #model.train() model = KerasClassifier(build_fn=create_model(model), verbose=0) # define the grid search parameters batch_size = [10, 20, 40, 60, 80, 100] epochs = [10, 50, 100] param_grid = dict(batch_size=batch_size, epochs=epochs) grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1) #keras_input = KerasInputFormatter([ # ('input_one', all_input_ids), # ('input_two', all_segment_ids), # ('input_three', all_input_mask),]) #input_one = Input(name = 'input_one', shape = (128,)) #input_two = Input(name = 'input_two', shape = (128,)) #input_three = Input(name = 'input_three', shape = (128,)) #input_four = Input(name = 'input_four', shape = (128,)) #output = y # define model here #self.model = Model(inputs = [input_one, input_two, input_three], outputs = output) grid_result = grid.fit( [[all_input_ids, all_segment_ids, all_input_mask]], [all_label_ids]) # summarize results print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) means = grid_result.cv_results_['mean_test_score'] stds = grid_result.cv_results_['std_test_score'] params = grid_result.cv_results_['params'] for mean, stdev, param in 
zip(means, stds, params): print("%f (%f) with: %r" % (mean, stdev, param)) ''' model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch #print(input_ids.shape,input_mask.shape,segment_ids.shape,label_ids.shape) #print(input_ids[0]) #print(label_ids[0]) #logits = model(input_ids, segment_ids, input_mask) #import pdb;pdb.set_trace() #print(logits.view(-1, num_labels).shape, label_ids.view(-1).shape) loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() # added clip if args.clip is not None: _ = torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 ''' if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) #model = BertForSequenceClassification(config, num_labels=num_labels) model = BertForTokenClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: #model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) # Load a trained model and config that you have fine-tuned print('for eval only......................') output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) config = BertConfig(output_config_file) #model = BertForSequenceClassification(config, num_labels=num_labels) model = BertForTokenClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) #import pdb;pdb.set_trace() print("dev_eaxmples :: ", len(list(eval_examples))) eval_features = convert_examples_to_features_pred( eval_examples, label_list, args.max_seq_length, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = 
TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 predictions, true_labels = [], [] #predictions1 , true_labels1 = [], [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() # get index till '[SEP]' #print("label_list index SEP : ",label_list.index('[SEP]')) pred_xx = [list(p) for p in np.argmax(logits, axis=2)] pred_xx = [i[:i.index(label_list.index('[SEP]'))] for i in pred_xx] label_ids_xx = [ i[:i.index(label_list.index('[SEP]'))] for i in label_ids.tolist() ] #print(label_ids_xx) #print(pred_xx) # new add tmp_s = [ max(len(i), len(j)) for i, j in zip(label_ids_xx, pred_xx) ] tmp_u = [(i + [31] * (k - len(i)) if len(i) != k else i, j + [31] * (k - len(j)) if len(j) != k else j) for i, j, k in zip(label_ids_xx, pred_xx, tmp_s)] tmp_d1 = [h[0] for h in tmp_u] tmp_d2 = [h[1] for h in tmp_u] #print([list(p) for p in np.argmax(logits, axis=2)][:5]) #tmp_eval_accuracy = flat_accuracy(logits, label_ids) tmp_eval_accuracy = flat_accc(pred_xx, label_ids_xx) #tmp_eval_accuracy = flat_accc(tmp_d1, tmp_d2) predictions.extend(tmp_d2) true_labels.append(tmp_d1) #predictions1.extend(pred_xx) #true_labels1.append(label_ids_xx) #print("tmp accuracy : ",tmp_eval_accuracy) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_steps loss = tr_loss / nb_tr_steps if args.do_train else None pred_tags = [[label_list[p_i] if p_i != 31 else 'XXX' for p_i in p] for p in predictions] valid_tags = [[ label_list[l_ii] if l_ii != 31 else 'YYY' for l_ii in l_i ] for l in true_labels for l_i in l] print("valid_tags : ", valid_tags[:10]) print("pred_tags : ", pred_tags[:10]) print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags))) print("Validation accuracy_score : {}".format( accuracy_score(valid_tags, pred_tags))) print("Validation classification_report : {}".format( classification_report(valid_tags, pred_tags))) #print("X Validation F1-Score: {}".format(f1_score(true_labels1, predictions1))) #print("X Validation accuracy_score : {}".format(accuracy_score(true_labels1, predictions1))) #print("X Validation classification_report : {}".format(classification_report(true_labels1, predictions1))) result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss } print(result) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(result.keys()): writer.write("%s = %s\n" % (key, str(result[key]))) if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_test_examples(args.data_dir) print('test examples len : {}'.format(len(eval_examples))) #import pdb;pdb.set_trace() eval_features = 
convert_examples_to_features_pred( eval_examples, label_list, args.max_seq_length, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() test_loss, test_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 predictions, true_labels = [], [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() # get index till '[SEP]' #print("label_list index SEP : ",label_list.index('[SEP]')) pred_xx = [list(p) for p in np.argmax(logits, axis=2)] pred_xx = [i[:i.index(label_list.index('[SEP]'))] for i in pred_xx] label_ids_xx = [ i[:i.index(label_list.index('[SEP]'))] for i in label_ids.tolist() ] #print(label_ids_xx) #print(pred_xx) # new add tmp_s = [ max(len(i), len(j)) for i, j in zip(label_ids_xx, pred_xx) ] tmp_u = [(i + [31] * (k - len(i)) if len(i) != k else i, j + [31] * (k - len(j)) if len(j) != k else j) for i, j, k in zip(label_ids_xx, pred_xx, tmp_s)] tmp_d1 = [h[0] for h in tmp_u] tmp_d2 = [h[1] for h in tmp_u] #print([list(p) for p in np.argmax(logits, axis=2)][:5]) #tmp_eval_accuracy = flat_accuracy(logits, label_ids) tmp_eval_accuracy = flat_accc(pred_xx, label_ids_xx) #tmp_eval_accuracy = flat_accc(tmp_d1, tmp_d2) predictions.extend(tmp_d2) true_labels.append(tmp_d1) #print("tmp accuracy : ",tmp_eval_accuracy) test_loss += tmp_eval_loss.mean().item() test_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 test_loss = test_loss / nb_eval_steps test_accuracy = test_accuracy / nb_eval_steps loss = tr_loss / nb_tr_steps if args.do_train else None pred_tags = [[label_list[p_i] if p_i != 31 else 'XXX' for p_i in p] for p in predictions] valid_tags = [[ label_list[l_ii] if l_ii != 31 else 'YYY' for l_ii in l_i ] for l in true_labels for l_i in l] print("valid_tags : ", valid_tags[:10]) print("pred_tags : ", pred_tags[:10]) print("Test F1-Score: {}".format(f1_score(valid_tags, pred_tags))) print("Test accuracy_score : {}".format( accuracy_score(valid_tags, pred_tags))) print("Test classification_report : {}".format( classification_report(valid_tags, pred_tags))) #print("X Test F1-Score: {}".format(f1_score(true_labels, predictions))) #print("X Test accuracy_score : {}".format(accuracy_score(true_labels, predictions))) #print("X Test classification_report : {}".format(classification_report(true_labels, predictions))) result = { 'test_loss': test_loss, 'test_accuracy': test_accuracy, 'global_step': global_step, 'loss': loss } print(result) output_test_file = os.path.join(args.output_dir, "test_results.txt") with open(output_test_file, "w") as writer: for key in sorted(result.keys()): writer.write("%s = 
%s\n" % (key, str(result[key]))) if args.do_pred and (args.local_rank == -1 or torch.distributed.get_rank() == 0): #eval_examples = processor.get_dev_examples(args.data_dir) model.eval() while True: print( 'enter a text to get NER. otherwise press Ctrl+C to close session.' ) text_a = input('>>>') #"Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday . ." eval_examples = { 'text_a': text_a, 'text_b': "The foodservice pie business does not fit our long-term growth strategy .", 'label': '1', 'guid': '12345' } eval_features = convert_examples_to_features_test( eval_examples, label_list, args.max_seq_length, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) #model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 predictions, true_labels = [], [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() pred_xx = [list(p) for p in np.argmax(logits, axis=2)] pred_xx = [ i[:i.index(label_list.index('[SEP]'))] for i in pred_xx ] print(pred_xx) print([[label_list[p_i] if p_i != 31 else 'XXX' for p_i in p] for p in pred_xx])
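# The evaluation blocks above score predictions with flat_accc, which is not
# defined in this excerpt. Based on how it is called (two lists of
# per-sentence label-id lists, averaged over batches), a plausible minimal
# sketch is a flat token-level accuracy; the real helper may differ:

def flat_accc(preds, labels):
    # Fraction of token positions where prediction and gold label agree,
    # pooled over all sentences in the batch.
    correct, total = 0, 0
    for pred_seq, label_seq in zip(preds, labels):
        for p, l in zip(pred_seq, label_seq):
            correct += int(p == l)
            total += 1
    return correct / total if total else 0.0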
def main(parser=None): if not parser: parser = argparse.ArgumentParser() # General parser.add_argument( "--bert_model", default="bert-base-cased", type=str, help= "Bert pre-trained model selected in the list: bert-base-cased, bert-large-cased." ) parser.add_argument("--config_path", default=None, type=str, help="Bert config file path.") parser.add_argument( "--output_dir", default='tmp', type=str, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--log_file", default="training.log", type=str, help="The output directory where the log will be written.") parser.add_argument("--model_recover_path", default=None, type=str, help="The file of fine-tuned pretraining model.") parser.add_argument( "--do_train", action='store_true', help="Whether to run training. This should ALWAYS be set to True.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=64, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--label_smoothing", default=0, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.01, type=float, help="The weight decay rate for Adam.") parser.add_argument("--finetune_decay", action='store_true', help="Weight decay to the original weights.") parser.add_argument("--num_train_epochs", default=30, type=int, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--global_rank", type=int, default=-1, help="global_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp32_embedding', action='store_true', help= "Whether to use 32-bit float precision instead of 32-bit for embeddings" ) parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument( '--from_scratch', action='store_true', help= "Initialize parameters with random values (i.e., training from scratch)." ) parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--tokenized_input', action='store_true', help="Whether the input is tokenized.") parser.add_argument( '--max_len_a', type=int, default=0, help="Truncate_config: maximum length of segment A. 
0 means none.") parser.add_argument('--max_len_b', type=int, default=20, help="Truncate_config: maximum length of segment B.") parser.add_argument( '--trunc_seg', default='b', help="Truncate_config: first truncate segment A/B (option: a, b).") parser.add_argument( '--always_truncate_tail', action='store_true', help="Truncate_config: Whether we should always truncate tail.") parser.add_argument( "--mask_prob", default=0.15, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument('--max_pred', type=int, default=3, help="Max tokens of prediction.") parser.add_argument("--num_workers", default=4, type=int, help="Number of workers for the data loader.") parser.add_argument('--max_position_embeddings', type=int, default=None, help="max position embeddings") # Others for VLP parser.add_argument( "--src_file", default=['/mnt/dat/COCO/annotations/dataset_coco.json'], type=str, nargs='+', help="The input data file name.") parser.add_argument('--len_vis_input', type=int, default=100) parser.add_argument('--enable_visdom', action='store_true') parser.add_argument('--visdom_port', type=int, default=8888) # parser.add_argument('--resnet_model', type=str, default='imagenet_weights/resnet101.pth') parser.add_argument('--image_root', type=str, default='/mnt/dat/COCO/images') parser.add_argument('--dataset', default='coco', type=str, help='coco | flickr30k | cc') parser.add_argument('--split', type=str, nargs='+', default=['train', 'restval']) parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') parser.add_argument('--dist_url', default='file://[PT_OUTPUT_DIR]/nonexistent_file', type=str, help='url used to set up distributed training') parser.add_argument( '--file_valid_jpgs', default='/mnt/dat/COCO/annotations/coco_valid_jpgs.json', type=str) parser.add_argument('--sche_mode', default='warmup_linear', type=str, help="warmup_linear | warmup_constant | warmup_cosine") parser.add_argument('--drop_prob', default=0.1, type=float) parser.add_argument('--use_num_imgs', default=-1, type=int) parser.add_argument('--vis_mask_prob', default=0, type=float) parser.add_argument('--max_drop_worst_ratio', default=0, type=float) parser.add_argument('--drop_after', default=6, type=int) parser.add_argument( '--s2s_prob', default=1, type=float, help="Percentage of examples that are bi-uni-directional LM (seq2seq)." 
) parser.add_argument( '--bi_prob', default=0, type=float, help="Percentage of examples that are bidirectional LM.") parser.add_argument( '--l2r_prob', default=0, type=float, help= "Percentage of examples that are unidirectional (left-to-right) LM.") parser.add_argument('--enable_butd', action='store_true', help='set to take in region features') parser.add_argument( '--region_bbox_file', default= 'coco_detection_vg_thresh0.2_feat_gvd_checkpoint_trainvaltest.h5', type=str) parser.add_argument( '--region_det_file_prefix', default= 'feat_cls_1000/coco_detection_vg_100dets_gvd_checkpoint_trainval', type=str) parser.add_argument('--tasks', default='img2txt', help='img2txt | vqa2') parser.add_argument('--relax_projection', action='store_true', help="Use different projection layers for tasks.") parser.add_argument('--scst', action='store_true', help='Self-critical sequence training') args = parser.parse_args() print('global_rank: {}, local rank: {}'.format(args.global_rank, args.local_rank)) args.max_seq_length = args.max_len_b + args.len_vis_input + 3 # +3 for 2x[SEP] and [CLS] args.mask_image_regions = (args.vis_mask_prob > 0 ) # whether to mask out image regions args.dist_url = args.dist_url.replace('[PT_OUTPUT_DIR]', args.output_dir) # arguments inspection assert (args.tasks in ('img2txt', 'vqa2')) assert args.enable_butd == True, 'only support region attn! featmap attn deprecated' assert ( not args.scst) or args.dataset == 'coco', 'scst support on coco only!' if args.scst: assert args.dataset == 'coco', 'scst support on coco only!' assert args.max_pred == 0 and args.mask_prob == 0, 'no mask for scst!' rl_crit = RewardCriterion() if args.enable_butd: assert (args.len_vis_input == 100) args.region_bbox_file = os.path.join(args.image_root, args.region_bbox_file) args.region_det_file_prefix = os.path.join( args.image_root, args.region_det_file_prefix) if args.dataset in ( 'cc', 'coco') and args.region_det_file_prefix != '' else '' # output config os.makedirs(args.output_dir, exist_ok=True) json.dump(args.__dict__, open(os.path.join(args.output_dir, 'opt.json'), 'w'), sort_keys=True, indent=2) logging.basicConfig( filename=os.path.join(args.output_dir, args.log_file), filemode='w', format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger = logging.getLogger(__name__) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl', init_method=args.dist_url, world_size=args.world_size, rank=args.global_rank) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) # fix random seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # plotting loss, optional if args.enable_visdom: import visdom vis = visdom.Visdom(port=args.visdom_port, env=args.output_dir) vis_window = {'iter': None, 'score': None} tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case, cache_dir=args.output_dir + '/.pretrained_model_{}'.format(args.global_rank)) if args.max_position_embeddings: tokenizer.max_len = args.max_position_embeddings data_tokenizer = WhitespaceTokenizer( ) if args.tokenized_input else tokenizer if args.do_train: bi_uni_pipeline = [ seq2seq_loader.Preprocess4Seq2seq( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_image_regions=args.mask_image_regions, mode="s2s", len_vis_input=args.len_vis_input, vis_mask_prob=args.vis_mask_prob, enable_butd=args.enable_butd, region_bbox_file=args.region_bbox_file, region_det_file_prefix=args.region_det_file_prefix, local_rank=args.local_rank, load_vqa_ann=(args.tasks == 'vqa2')) ] bi_uni_pipeline.append( seq2seq_loader.Preprocess4Seq2seq( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_image_regions=args.mask_image_regions, mode="bi", len_vis_input=args.len_vis_input, vis_mask_prob=args.vis_mask_prob, enable_butd=args.enable_butd, region_bbox_file=args.region_bbox_file, region_det_file_prefix=args.region_det_file_prefix, local_rank=args.local_rank, load_vqa_ann=(args.tasks == 'vqa2'))) bi_uni_pipeline.append( seq2seq_loader.Preprocess4Seq2seq( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_image_regions=args.mask_image_regions, mode="l2r", len_vis_input=args.len_vis_input, vis_mask_prob=args.vis_mask_prob, enable_butd=args.enable_butd, region_bbox_file=args.region_bbox_file, region_det_file_prefix=args.region_det_file_prefix, local_rank=args.local_rank, load_vqa_ann=(args.tasks == 'vqa2'))) train_dataset = seq2seq_loader.Img2txtDataset( args.src_file, args.image_root, args.split, args.train_batch_size, data_tokenizer, args.max_seq_length, file_valid_jpgs=args.file_valid_jpgs, bi_uni_pipeline=bi_uni_pipeline, use_num_imgs=args.use_num_imgs, s2s_prob=args.s2s_prob, bi_prob=args.bi_prob, l2r_prob=args.l2r_prob, enable_butd=args.enable_butd, tasks=args.tasks) if args.world_size == 1: train_sampler = RandomSampler(train_dataset, replacement=False) else: train_sampler = DistributedSampler(train_dataset) 
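# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original script). Img2txtDataset above mixes the
# three preprocessing pipelines built in bi_uni_pipeline ("s2s", "bi", "l2r") per example
# according to --s2s_prob / --bi_prob / --l2r_prob. Assuming those weights sum to 1, the
# per-example choice can be expressed as a single weighted draw; the helper name below is
# hypothetical and only meant to clarify the mixing logic.
import random

def pick_pipeline(bi_uni_pipeline, s2s_prob, bi_prob, l2r_prob):
    """Sample one preprocessing callable according to the configured mixture weights."""
    assert abs(s2s_prob + bi_prob + l2r_prob - 1.0) < 1e-6, "mixture weights should sum to 1"
    # bi_uni_pipeline is assumed to be ordered [s2s, bi, l2r], matching the appends above.
    return random.choices(bi_uni_pipeline, weights=[s2s_prob, bi_prob, l2r_prob], k=1)[0]
# --------------------------------------------------------------------------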
train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=args.train_batch_size, sampler=train_sampler, num_workers=args.num_workers, collate_fn=batch_list_to_batch_tensors, pin_memory=True) # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps) t_total = int( len(train_dataloader) * args.num_train_epochs * 1. / args.gradient_accumulation_steps) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model recover_step = _get_max_epoch_model(args.output_dir) cls_num_labels = 2 type_vocab_size = 6 if args.new_segment_ids else 2 relax_projection = 4 if args.relax_projection else 0 task_idx_proj = 3 if args.tasks == 'img2txt' else 0 mask_word_id, eos_word_ids, pad_word_ids = tokenizer.convert_tokens_to_ids( ["[MASK]", "[SEP]", "[PAD]"]) # index in BERT vocab: 103, 102, 0 if (recover_step is None) and (args.model_recover_path is None): # if _state_dict == {}, the parameters are randomly initialized # if _state_dict == None, the parameters are initialized with bert-init assert args.scst == False, 'must init from maximum likelihood training' _state_dict = {} if args.from_scratch else None model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=_state_dict, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, relax_projection=relax_projection, config_path=args.config_path, task_idx=task_idx_proj, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, cache_dir=args.output_dir + '/.pretrained_model_{}'.format(args.global_rank), drop_prob=args.drop_prob, enable_butd=args.enable_butd, len_vis_input=args.len_vis_input, tasks=args.tasks) global_step = 0 else: if recover_step: logger.info("***** Recover model: %d *****", recover_step) model_recover = torch.load( os.path.join(args.output_dir, "model.{0}.bin".format(recover_step))) # recover_step == number of epochs global_step = math.floor(recover_step * t_total * 1. 
/ args.num_train_epochs) elif args.model_recover_path: logger.info("***** Recover model: %s *****", args.model_recover_path) model_recover = torch.load(args.model_recover_path) global_step = 0 if not args.scst: model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, relax_projection=relax_projection, config_path=args.config_path, task_idx=task_idx_proj, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, cache_dir=args.output_dir + '/.pretrained_model_{}'.format(args.global_rank), drop_prob=args.drop_prob, enable_butd=args.enable_butd, len_vis_input=args.len_vis_input, tasks=args.tasks) else: model = BertForSeq2SeqDecoder.from_pretrained( args.bert_model, max_position_embeddings=args.max_position_embeddings, config_path=args.config_path, state_dict=model_recover, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, task_idx=task_idx_proj, mask_word_id=mask_word_id, search_beam_size=1, eos_id=eos_word_ids, mode='s2s', enable_butd=args.enable_butd, len_vis_input=args.len_vis_input) del model_recover torch.cuda.empty_cache() # deprecated # from vlp.resnet import resnet # cnn = resnet(args.resnet_model, _num_layers=101, _fixed_block=4, pretrained=True) # no finetuning if args.fp16: model.half() # cnn.half() if args.fp32_embedding: model.bert.embeddings.word_embeddings.float() model.bert.embeddings.position_embeddings.float() model.bert.embeddings.token_type_embeddings.float() model.to(device) # cnn.to(device) if args.local_rank != -1: try: # from apex.parallel import DistributedDataParallel as DDP from torch.nn.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # cnn = DDP(cnn) elif n_gpu > 1: # model = torch.nn.DataParallel(model) model = DataParallelImbalance(model) # cnn = DataParallelImbalance(cnn) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: # from apex.optimizers import FP16_Optimizer from vlp_pretrained_bert.optimization_fp16 import FP16_Optimizer_State from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer_State(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer_State(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, schedule=args.sche_mode, t_total=t_total) if recover_step: logger.info("***** Recover optimizer: %d *****", recover_step) optim_recover = torch.load( os.path.join(args.output_dir, "optim.{0}.bin".format(recover_step))) if hasattr(optim_recover, 'state_dict'): optim_recover = optim_recover.state_dict() optimizer.load_state_dict(optim_recover) if args.loss_scale == 0: logger.info("***** Recover optimizer: dynamic_loss_scale *****") optimizer.dynamic_loss_scale = True logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.do_train: logger.info("***** Running training *****") logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", t_total) logger.info(" Loader length = %d", len(train_dataloader)) model.train() if recover_step: start_epoch = recover_step + 1 else: start_epoch = 1 for i_epoch in trange(start_epoch, args.num_train_epochs + 1, desc="Epoch"): if args.local_rank >= 0: train_sampler.set_epoch(i_epoch - 1) iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)') nbatches = len(train_dataloader) train_loss = [] pretext_loss = [] vqa2_loss = [] scst_reward = [] for step, batch in enumerate(iter_bar): batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, img, vis_masked_pos, vis_pe, ans_labels = batch if args.fp16: img = img.half() vis_pe = vis_pe.half() if args.enable_butd: conv_feats = img.data # Bx100x2048 vis_pe = vis_pe.data else: conv_feats, _ = cnn(img.data) # Bx2048x7x7 conv_feats = conv_feats.view(conv_feats.size(0), conv_feats.size(1), -1).permute(0, 2, 1).contiguous() if not args.scst: loss_tuple = model( conv_feats, vis_pe, input_ids, segment_ids, input_mask, lm_label_ids, ans_labels, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, vis_masked_pos=vis_masked_pos, mask_image_regions=args.mask_image_regions, drop_worst_ratio=args.max_drop_worst_ratio if i_epoch > args.drop_after else 0) mean_reward = loss_tuple[0].new(1).fill_(0) else: # scst training model.eval() position_ids = torch.arange( input_ids.size(1), dtype=input_ids.dtype, device=input_ids.device).unsqueeze(0).expand_as( input_ids) input_dummy = input_ids[:, :args.len_vis_input + 2] # +2 for [CLS] and [SEP] greedy_res = input_ids.new( input_ids.size(0), input_ids.size(1) - args.len_vis_input - 2).fill_(0) gen_result = input_ids.new( input_ids.size(0), input_ids.size(1) - args.len_vis_input - 2).fill_(0) with torch.no_grad(): greedy_res_raw, _ = model(conv_feats, vis_pe, input_dummy, segment_ids, position_ids, input_mask, task_idx=task_idx, sample_mode='greedy') for b in range(greedy_res_raw.size(0)): for idx in range(greedy_res_raw.size(1)): if greedy_res_raw[b][idx] not in [ eos_word_ids, pad_word_ids ]: greedy_res[b][idx] = greedy_res_raw[b][idx] else: if greedy_res_raw[b][idx] == eos_word_ids: greedy_res[b][idx] = eos_word_ids break model.train() gen_result_raw, sample_logprobs = model( conv_feats, vis_pe, input_dummy, segment_ids, position_ids, input_mask, task_idx=task_idx, sample_mode='sample') for b in range(gen_result_raw.size(0)): for idx 
in range(gen_result_raw.size(1)): if gen_result_raw[b][idx] not in [ eos_word_ids, pad_word_ids ]: gen_result[b][idx] = gen_result_raw[b][idx] else: if gen_result_raw[b][idx] == eos_word_ids: gen_result[b][idx] = eos_word_ids break gt_ids = input_ids[:, args.len_vis_input + 2:] reward = get_self_critical_reward(greedy_res, gt_ids, gen_result, gt_ids.size(0)) reward = torch.from_numpy(reward).float().to( gen_result.device) mean_reward = reward.mean() loss = rl_crit(sample_logprobs, gen_result.data, reward) loss_tuple = [ loss, loss.new(1).fill_(0.), loss.new(1).fill_(0.) ] # disable pretext_loss_deprecated for now masked_lm_loss, pretext_loss_deprecated, ans_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. For dist, this is done through gradient addition. masked_lm_loss = masked_lm_loss.mean() pretext_loss_deprecated = pretext_loss_deprecated.mean() ans_loss = ans_loss.mean() loss = masked_lm_loss + pretext_loss_deprecated + ans_loss # logging for each step (i.e., before normalization by args.gradient_accumulation_steps) iter_bar.set_description('Iter (loss=%5.3f)' % loss.item()) train_loss.append(loss.item()) pretext_loss.append(pretext_loss_deprecated.item()) vqa2_loss.append(ans_loss.item()) scst_reward.append(mean_reward.item()) if step % 100 == 0: logger.info( "Epoch {}, Iter {}, Loss {:.2f}, Pretext {:.2f}, VQA2 {:.2f}, Mean R {:.3f}\n" .format(i_epoch, step, np.mean(train_loss), np.mean(pretext_loss), np.mean(vqa2_loss), np.mean(scst_reward))) if args.enable_visdom: if vis_window['iter'] is None: vis_window['iter'] = vis.line( X=np.tile( np.arange((i_epoch - 1) * nbatches + step, (i_epoch - 1) * nbatches + step + 1), (1, 1)).T, Y=np.column_stack( (np.asarray([np.mean(train_loss)]), )), opts=dict(title='Training Loss', xlabel='Training Iteration', ylabel='Loss', legend=['total'])) else: vis.line(X=np.tile( np.arange((i_epoch - 1) * nbatches + step, (i_epoch - 1) * nbatches + step + 1), (1, 1)).T, Y=np.column_stack( (np.asarray([np.mean(train_loss)]), )), opts=dict(title='Training Loss', xlabel='Training Iteration', ylabel='Loss', legend=['total']), win=vis_window['iter'], update='append') # ensure that accumlated gradients are normalized if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: lr_this_step = args.learning_rate * \ warmup_linear(global_step/t_total, args.warmup_proportion) if args.fp16: # modify learning rate with special warm up BERT uses for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info( "** ** * Saving fine-tuned model and optimizer ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "model.{0}.bin".format(i_epoch)) output_optim_file = os.path.join(args.output_dir, "optim.{0}.bin".format(i_epoch)) if args.global_rank in ( -1, 0): # save model if the first device or no dist torch.save( copy.deepcopy(model_to_save).cpu().state_dict(), output_model_file) # torch.save(optimizer.state_dict(), output_optim_file) # disable for now, need to sanitize state and ship everthing back to cpu logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.world_size > 1: torch.distributed.barrier()
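# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original script). In the SCST branch above,
# RewardCriterion consumes the log-probabilities of the sampled caption together with a
# reward that has already been baselined against the greedy decode. A REINFORCE-style loss
# of that form is commonly implemented as below; this is a hedged re-implementation under
# that assumption, not the project's actual RewardCriterion.
import torch
import torch.nn as nn

class SelfCriticalLoss(nn.Module):
    def forward(self, sample_logprobs, gen_result, reward):
        # sample_logprobs: (B, T) log-probs of the sampled tokens
        # gen_result:      (B, T) sampled token ids (id 0 assumed to be [PAD])
        # reward:          (B, T) or (B,) baseline-subtracted reward
        if reward.dim() == 1:
            reward = reward.unsqueeze(1)
        mask = (gen_result > 0).float()           # ignore padded positions
        loss = -sample_logprobs * reward * mask   # REINFORCE objective
        return loss.sum() / mask.sum().clamp(min=1.0)
# --------------------------------------------------------------------------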
def main(): args = parse_args() args.distributed, args.world_size = init_distributed(args.local_rank) if args.seed is not None: print("Using seed = {}".format(args.seed)) torch.manual_seed(args.seed) # Save configuration to file config = {k: v for k, v in args.__dict__.items()} config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp()) config['local_timestamp'] = str(datetime.now()) run_dir = "./run/neumf/{}.{}".format(config['timestamp'],args.local_rank) print("Saving config and results to {}".format(run_dir)) if not os.path.exists(run_dir) and run_dir != '': os.makedirs(run_dir) utils.save_config(config, run_dir) # Check that GPUs are actually available use_cuda = not args.no_cuda and torch.cuda.is_available() # more like load trigger timmer now mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL, value=args.valid_negative) # The default of np.random.choice is replace=True, so does pytorch random_() mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True) mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT) mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_EVAL_NEG_GEN) # sync worker before timing. if args.distributed: torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0) torch.cuda.synchronize() #=========================================================================== #== The clock starts on loading the preprocessed data. ===================== #=========================================================================== mlperf_log.ncf_print(key=mlperf_log.RUN_START) run_start_time = time.time() # load not converted data, just seperate one for test train_ratings = torch.load(args.data+'/train_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank))) test_ratings = torch.load(args.data+'/test_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank))) # get input data # get dims nb_maxs = torch.max(train_ratings, 0)[0] nb_users = nb_maxs[0].item()+1 nb_items = nb_maxs[1].item()+1 train_users = train_ratings[:,0] train_items = train_ratings[:,1] del nb_maxs, train_ratings mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE, value=len(train_users)) # produce things not change between epoch # mask for filtering duplicates with real sample # note: test data is removed before create mask, same as reference mat = torch.cuda.ByteTensor(nb_users, nb_items).fill_(1) mat[train_users, train_items] = 0 # create label train_label = torch.ones_like(train_users, dtype=torch.float32) neg_label = torch.zeros_like(train_label, dtype=torch.float32) neg_label = neg_label.repeat(args.negative_samples) train_label = torch.cat((train_label,neg_label)) del neg_label if args.fp16: train_label = train_label.half() # produce validation negative sample on GPU all_test_users = test_ratings.shape[0] test_users = test_ratings[:,0] test_pos = test_ratings[:,1].reshape(-1,1) test_negs = generate_neg(test_users, mat, nb_items, args.valid_negative, True)[1] # create items with real sample at last position test_users = test_users.reshape(-1,1).repeat(1,1+args.valid_negative) test_items = torch.cat((test_negs.reshape(-1,args.valid_negative), test_pos), dim=1) del test_ratings, test_negs # generate dup mask and real indice for exact same behavior on duplication compare to reference # here we need a sort that is stable(keep order of duplicates) # this is a version works on integer sorted_items, indices = torch.sort(test_items) # [1,1,1,2], [3,1,0,2] sum_item_indices = sorted_items.float()+indices.float()/len(indices[0]) 
#[1.75,1.25,1.0,2.5] indices_order = torch.sort(sum_item_indices)[1] #[2,1,0,3] stable_indices = torch.gather(indices, 1, indices_order) #[0,1,3,2] # produce -1 mask dup_mask = (sorted_items[:,0:-1] == sorted_items[:,1:]) dup_mask = torch.cat((torch.zeros_like(test_pos, dtype=torch.uint8), dup_mask),dim=1) dup_mask = torch.gather(dup_mask,1,stable_indices.sort()[1]) # produce real sample indices to later check in topk sorted_items, indices = (test_items != test_pos).sort() sum_item_indices = sorted_items.float()+indices.float()/len(indices[0]) indices_order = torch.sort(sum_item_indices)[1] stable_indices = torch.gather(indices, 1, indices_order) real_indices = stable_indices[:,0] del sorted_items, indices, sum_item_indices, indices_order, stable_indices, test_pos if args.distributed: test_users = torch.chunk(test_users, args.world_size)[args.local_rank] test_items = torch.chunk(test_items, args.world_size)[args.local_rank] dup_mask = torch.chunk(dup_mask, args.world_size)[args.local_rank] real_indices = torch.chunk(real_indices, args.world_size)[args.local_rank] # make pytorch memory behavior more consistent later torch.cuda.empty_cache() mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size) mlperf_log.ncf_print(key=mlperf_log.INPUT_ORDER) # we shuffled later with randperm print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' % (time.time()-run_start_time, nb_users, nb_items, len(train_users), nb_users)) # Create model model = NeuMF(nb_users, nb_items, mf_dim=args.factors, mf_reg=0., mlp_layer_sizes=args.layers, mlp_layer_regs=[0. for i in args.layers]) if args.fp16: model = model.half() print(model) print("{} parameters".format(utils.count_parameters(model))) # Save model text description with open(os.path.join(run_dir, 'model.txt'), 'w') as file: file.write(str(model)) # Add optimizer and loss to graph if args.fp16: fp_optimizer = Fp16Optimizer(model, args.loss_scale) params = fp_optimizer.fp32_params else: params = model.parameters() #optimizer = torch.optim.Adam(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps) # optimizer = AdamOpt(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps) optimizer = FusedAdam(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps, eps_inside_sqrt=False) criterion = nn.BCEWithLogitsLoss(reduction = 'none') # use torch.mean() with dim later to avoid copy to host mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate) mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam") mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=args.beta1) mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=args.beta2) mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=args.eps) mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE) if use_cuda: # Move model and loss to GPU model = model.cuda() criterion = criterion.cuda() if args.distributed: model = DDP(model) local_batch = args.batch_size // int(os.environ['WORLD_SIZE']) else: local_batch = args.batch_size traced_criterion = torch.jit.trace(criterion.forward, (torch.rand(local_batch,1),torch.rand(local_batch,1))) # Create files for tracking training valid_results_file = os.path.join(run_dir, 'valid_results.csv') # Calculate initial Hit Ratio and NDCG test_x = test_users.view(-1).split(args.valid_batch_size) test_y = test_items.view(-1).split(args.valid_batch_size) hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk, 
samples_per_user=test_items.size(1), num_user=all_test_users, distributed=args.distributed) print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}' .format(K=args.topk, hit_rate=hr, ndcg=ndcg)) success = False mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP) for epoch in range(args.epochs): mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch) mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG, value=args.negative_samples) mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN) begin = time.time() # prepare data for epoch neg_users, neg_items = generate_neg(train_users, mat, nb_items, args.negative_samples) epoch_users = torch.cat((train_users,neg_users)) epoch_items = torch.cat((train_items,neg_items)) del neg_users, neg_items # shuffle prepared data and split into batches epoch_indices = torch.randperm(len(epoch_users), device='cuda:{}'.format(args.local_rank)) epoch_users = epoch_users[epoch_indices] epoch_items = epoch_items[epoch_indices] epoch_label = train_label[epoch_indices] if args.distributed: epoch_users = torch.chunk(epoch_users, args.world_size)[args.local_rank] epoch_items = torch.chunk(epoch_items, args.world_size)[args.local_rank] epoch_label = torch.chunk(epoch_label, args.world_size)[args.local_rank] epoch_users_list = epoch_users.split(local_batch) epoch_items_list = epoch_items.split(local_batch) epoch_label_list = epoch_label.split(local_batch) # only print progress bar on rank 0 num_batches = (len(epoch_indices) + args.batch_size - 1) // args.batch_size if args.local_rank == 0: qbar = tqdm.tqdm(range(num_batches)) else: qbar = range(num_batches) # handle extremely rare case where last batch size < number of worker if len(epoch_users_list) < num_batches: print("epoch_size % batch_size < number of worker!") exit(1) for i in qbar: # selecting input from prepared data user = epoch_users_list[i] item = epoch_items_list[i] label = epoch_label_list[i].view(-1,1) for p in model.parameters(): p.grad = None outputs = model(user, item) loss = traced_criterion(outputs, label).float() loss = torch.mean(loss.view(-1), 0) if args.fp16: fp_optimizer.step(loss, optimizer) else: loss.backward() optimizer.step() del epoch_users, epoch_items, epoch_label, epoch_users_list, epoch_items_list, epoch_label_list, user, item, label train_time = time.time() - begin begin = time.time() mlperf_log.ncf_print(key=mlperf_log.EVAL_START) hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk, samples_per_user=test_items.size(1), num_user=all_test_users, output=valid_results_file, epoch=epoch, distributed=args.distributed) val_time = time.time() - begin print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},' ' train_time = {train_time:.2f}, val_time = {val_time:.2f}' .format(epoch=epoch, K=args.topk, hit_rate=hr, ndcg=ndcg, train_time=train_time, val_time=val_time)) mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY, value={"epoch": epoch, "value": hr}) mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold) mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP) if args.threshold is not None: if hr >= args.threshold: print("Hit threshold of {}".format(args.threshold)) success = True break mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success}) run_stop_time = time.time() mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL) # easy way of tracking mlperf score if success: print("mlperf_score", run_stop_time - run_start_time)
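# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original script). val_epoch above reports HR@K and
# NDCG@K over (valid_negative + 1) candidate items per user. For a single user the two
# metrics reduce to the helper below; this is a hedged restatement of the standard
# definitions, not the project's val_epoch implementation.
import math
import torch

def hit_and_ndcg(scores: torch.Tensor, real_index: int, k: int):
    """Return (HR@k, NDCG@k) for one user; scores has shape (num_candidates,)."""
    _, topk = torch.topk(scores, k)
    hits = (topk == real_index).nonzero(as_tuple=False)
    if hits.numel() == 0:
        return 0.0, 0.0               # held-out positive item not in the top-k
    rank = hits[0, 0].item()          # 0-based rank of the positive item
    return 1.0, 1.0 / math.log2(rank + 2)
# --------------------------------------------------------------------------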
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument("--trained_model_dir", default="", type=str, help="Where is the fine-tuned BERT model?") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." ) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): #raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) print("WARNING: Output directory already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_optimization_steps = None if args.do_train: train_dataset = BERTDataset(args.data_dir, tokenizer, seq_len=args.max_seq_length) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model if args.trained_model_dir: if os.path.exists(os.path.join(args.output_dir, WEIGHTS_NAME)): previous_state_dict = torch.load( os.path.join(args.output_dir, WEIGHTS_NAME)) else: from collections import OrderedDict previous_state_dict = OrderedDict() distant_state_dict = torch.load( os.path.join(args.trained_model_dir, WEIGHTS_NAME)) previous_state_dict.update( distant_state_dict ) # note that the final layers of previous model and distant model must have different attribute names! model = MyBertForMaskedLM.from_pretrained( args.trained_model_dir, state_dict=previous_state_dict) else: model = MyBertForMaskedLM.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on next(file) # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string())
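# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original script). When --fp16 is set, the loop above
# rescales the learning rate by hand with warmup_linear(global_step / num_train_optimization_steps,
# args.warmup_proportion); with BertAdam the same schedule is applied internally. The helper
# below reproduces the linear warm-up/decay behaviour found in older pytorch_pretrained_bert
# releases; treat it as an assumption rather than the exact function imported here.
def warmup_linear(progress: float, warmup: float = 0.002) -> float:
    """Linear warm-up to 1.0 over the first `warmup` fraction of training, then linear decay to 0."""
    if progress < warmup:
        return progress / warmup
    return max((1.0 - progress) / (1.0 - warmup), 0.0)
# --------------------------------------------------------------------------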
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints and predictions will be written.") parser.add_argument("--trained_model", default=None, type=str, required=True, help="Trained Model for dev set loss calculation") ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument("--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument("--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--doc_stride", default=128, type=int, help="When splitting up a long document into chunks, how much stride to take between chunks.") parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument("--n_best_size", default=20, type=int, help="The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument("--max_answer_length", default=30, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--verbose_logging", default=False, action='store_true', help="If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--do_lower_case", default=True, action='store_true', help="Whether to lower case the input text. 
True for uncased models, False for cased models.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError("At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified.") # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory () already exists and is not empty.") # os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model) train_examples = None num_train_steps = None if args.do_train: train_examples = read_squad_examples( input_file=args.train_file, is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model # model = BertForQuestionAnswering.from_pretrained(args.bert_model, # cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) # Load a trained model that you have fine-tuned model_state_dict = torch.load(args.trained_model, map_location=lambda storage, loc: storage) model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in 
no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() loss_dev = {} ep = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): batch_count = 0 loss_dev["Epoch: " + str(ep)] = 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) loss_dev["Epoch: " + str(ep)] = loss_dev["Epoch: " + str(ep)] +loss.item() # with open(str(args.output_dir) + "Loss_Train.txt", "wb") as fp: #Pickling # pickle.dump(loss_dev, fp) # if n_gpu > 1: # loss = loss.mean() # mean() to average on multi-gpu. 
# if args.gradient_accumulation_steps > 1: # loss = loss / args.gradient_accumulation_steps # if args.fp16: # optimizer.backward(loss) # else: # loss.backward() # if (step + 1) % args.gradient_accumulation_steps == 0: # # modify learning rate with special warm up BERT uses # lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion) # for param_group in optimizer.param_groups: # param_group['lr'] = lr_this_step # optimizer.step() # optimizer.zero_grad() # global_step += 1 batch_count += 1 # # Save a trained model # model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # output_model_file = os.path.join(args.output_dir, "pytorch_model_epoch" + str(ep) + ".bin") # torch.save(model_to_save.state_dict(), output_model_file) #ep = ep+1 loss_dev["Epoch: " + str(ep)] =loss_dev["Epoch: " + str(ep)]/batch_count with open(str(args.output_dir) + str(args.trained_model[17:]) + "_Loss_Eval_Dev_" + str(args.trained_model[-5]) + ".pickle", "wb") as fp: #Pickling pickle.dump(loss_dev, fp) # # Load a trained model that you have fine-tuned # model_state_dict = torch.load(output_model_file) # model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict) # model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append(RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, 
args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
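# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original script). The "training" loop above never
# calls backward() or optimizer.step(); it only averages the dev-set loss per epoch. Under
# that reading, the same number can be obtained more cheaply in eval mode with autograd
# disabled. Hedged sketch, assuming a single GPU so the model returns a scalar loss.
import torch

def mean_dev_loss(model, dataloader, device):
    model.eval()
    total, batches = 0.0, 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids, input_mask, segment_ids, start_positions, end_positions = (
                t.to(device) for t in batch)
            loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
            total += loss.item()
            batches += 1
    return total / max(batches, 1)
# --------------------------------------------------------------------------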
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--model_dir", default=None, type=str, required=True, help="") parser.add_argument("--my_config", default=None, type=str, required=True) parser.add_argument("--feature_path", default=None, type=str, required=True) parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints and predictions will be written.") ## Other parameters parser.add_argument("--train_pattern", default=None, type=str, help="training data path.") parser.add_argument("--valid_pattern", default=None, type=str, help="validation data path.") parser.add_argument("--test_pattern", default=None, type=str, help="test data path.") parser.add_argument("--max_seq_length", default=512, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--doc_stride", default=128, type=int, help="When splitting up a long document into chunks, how much stride to take between chunks.") parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--report_steps", default=50, type=int, help="report steps when training.") parser.add_argument("--train_batch_size", default=4, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_steps", default=-1, type=int) parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " "of training.") parser.add_argument("--n_best_size", default=20, type=int, help="The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument("--max_answer_length", default=30, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--train_size',type=int,default=10000,help="Use how many train data") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. 
True for uncased models, False for cased models.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--frame_name',type=str,default='elgeish/cs224n-squad2.0-albert-large-v2') parser.add_argument('--DataName',type=str,default="SST") parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") args = parser.parse_args() #print(args) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') n_gpu = 1 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError("At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_pattern: raise ValueError( "If `do_train` is True, then `train_pattern` must be specified.") if args.do_predict: if not args.test_pattern: raise ValueError( "If `do_predict` is True, then `test_pattern` must be specified.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Prepare model my_config = Config(args.my_config) my_config.num_edge_types = sum(EdgePosition.max_edge_types) # my_config.forward_edges = [EdgeType.TOKEN_TO_SENTENCE, # EdgeType.SENTENCE_TO_PARAGRAPH, # EdgeType.PARAGRAPH_TO_DOCUMENT] #print(my_config) if args.do_train: pretrained_config_file = os.path.join(args.model_dir, CONFIG_NAME) bert_config = BertConfig(pretrained_config_file) pretrained_model_file = os.path.join(args.model_dir, WEIGHTS_NAME) model = NqModel(bert_config=bert_config, my_config=my_config) #model_dict = model.state_dict() #pretrained_model_dict = torch.load(pretrained_model_file, map_location=lambda storage, loc: storage) #pretrained_model_dict = {k: v for k, v in pretrained_model_dict.items() if k in model_dict.keys()} #model_dict.update(pretrained_model_dict) #model.load_state_dict(model_dict) else: pretrained_config_file = os.path.join(args.model_dir, CONFIG_NAME) bert_config = BertConfig(pretrained_config_file) model = NqModel(bert_config=bert_config, my_config=my_config) pretrained_model_file = os.path.join(args.model_dir, WEIGHTS_NAME) model.load_state_dict(torch.load(pretrained_model_file)) if args.fp16: 
model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) num_train_features = None num_train_optimization_steps = None #train_dataset = None #train_features = None # output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) # model.load_state_dict(torch.load(output_model_file)) if args.do_train: num_train_features = 0 for data_path in glob(args.train_pattern): train_dataset = NqDataset(args, data_path, is_training=True) train_features = train_dataset.features num_train_features += len(train_dataset.features) print(num_train_features,args.train_batch_size,args.gradient_accumulation_steps) num_train_optimization_steps = int( (num_train_features / args.train_batch_size) / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare optimizer param_optimizer = list(model.named_parameters()) # print(param_optimizer) #print([i for i,j in model.named_parameters()]) # the original BERT examples filter out the unused pooler parameters here, since their None grads break apex; no filtering is needed for this model no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.00} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: if args.warmup_steps > 0: args.warmup_proportion = min(args.warmup_proportion, args.warmup_steps / num_train_optimization_steps) # optimizer = BertAdam(optimizer_grouped_parameters, # lr=args.learning_rate, # warmup=args.warmup_proportion, # t_total=num_train_optimization_steps) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=int(args.warmup_proportion * num_train_optimization_steps) if args.warmup_proportion > 0 else args.warmup_steps, t_total=num_train_optimization_steps) global_step = 0 last_acc = 84.3704066634003 if args.do_train: logger.info("***** Running training *****") logger.info(" Num split examples = %d", num_train_features) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) tr_loss, report_loss = 0.0, 0.0 nb_tr_examples = 0 model.zero_grad() for _ in trange(int(args.num_train_epochs), desc="Epoch"): logging.info("Starting a new training epoch.") for data_path in glob(args.train_pattern): #logging.info("Reading data from {}.".format(data_path)) model.train() train_dataset = NqDataset(args, data_path, is_training=True) train_features = train_dataset.features #logging.info("Data Load Done!") if args.local_rank == -1:
train_sampler = RandomSampler(train_features) else: train_sampler = DistributedSampler(train_features) train_dataloader = DataLoader(train_features, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=batcher(device, is_training=True), num_workers=0) train_features = train_dataset.features logging.info("Data ready {} ".format(len(train_features))) for step, batch in enumerate(train_dataloader): loss = model(batch.input_ids, batch.input_mask, batch.segment_ids, batch.st_mask, (batch.edges_src, batch.edges_tgt, batch.edges_type, batch.edges_pos),batch.label) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.local_rank != -1: loss = loss + 0 * sum([x.sum() for x in model.parameters()]) if args.fp16: optimizer.backward(loss) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) if (step + 1) % args.gradient_accumulation_steps == 0: scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 tr_loss += loss.item() nb_tr_examples += 1 if (step + 1) % args.gradient_accumulation_steps == 0 and ( global_step + 1) % args.report_steps == 0 and ( args.local_rank == -1 or torch.distributed.get_rank() == 0): lr_this_step = get_lr(optimizer) logging.info("Epoch={} iter={} lr={:.12f} train_ave_loss={:.6f} .".format( # _, global_step, lr_this_step, tr_loss / nb_tr_examples)) _, global_step, lr_this_step, (tr_loss - report_loss) / args.report_steps)) report_loss = tr_loss model.eval() model.zero_grad() model.ACC = model.ALL = 0 train_dataset = NqDataset(args, "test.json", is_training=True) train_features = train_dataset.features #logging.info("Data Load Done!") train_sampler = RandomSampler(train_features) train_dataloader = DataLoader(train_features, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=batcher(device, is_training=True), num_workers=0) train_features = train_dataset.features logging.info("Data ready {} ".format(len(train_features))) eval_steps = 0 eval_loss = 0 optimizer.zero_grad() logging.info("***** Running evaluation *****") with torch.no_grad(): for step, batch in enumerate(train_dataloader): eval_steps += 1 loss = model(batch.input_ids, batch.input_mask, batch.segment_ids, batch.st_mask, (batch.edges_src, batch.edges_tgt, batch.edges_type, batch.edges_pos),batch.label) eval_loss += loss.item() logging.info("ACC:{}% LOSS:{}".format(model.ACC/model.ALL*100, eval_loss/eval_steps)) model.zero_grad() optimizer.zero_grad() if model.ACC/model.ALL*100 > last_acc: logging.info("Saving new best model") last_acc = model.ACC/model.ALL*100 model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file)
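# The reporting branch above reads the current learning rate through a get_lr(optimizer)
# helper that is defined elsewhere in this script. The function below is a minimal sketch
# of such a helper (illustrative only; the script's own get_lr may differ): it reads the
# rate back out of the optimizer's first parameter group, where the scheduler writes it.
def get_lr(optimizer):
    # Every torch.optim optimizer stores its hyperparameters in param_groups;
    # the 'lr' entry of the first group is the value the scheduler last set.
    for param_group in optimizer.param_groups:
        return param_group['lr']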
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() # if args.server_ip and args.server_port: # # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script # import ptvsd # print("Waiting for debugger attach") # ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) # ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, } output_modes = { "cola": "classification", "mnli": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() #[0,1] num_labels = len(label_list) train_examples = None num_train_optimization_steps = None if args.do_train: # train_examples = processor.get_train_examples_wenpeng('/home/wyin3/Datasets/glue_data/RTE/train.tsv') train_examples, _, _ = processor.get_examples_SF_wenpeng( '/home/wyin3/Datasets/MNLI-SNLI-SciTail-RTE-SICK/BBN.SF.train.txt') num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) # model = BertForSequenceClassification.from_pretrained(args.bert_model, # cache_dir=cache_dir, # num_labels=num_labels) # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) pretrain_model_dir = '/home/wyin3/Datasets/fine_tune_Bert_stored/FineTuneOnSciTail' #FineTuneOnMNLI' 
model = BertForSequenceClassification.from_pretrained( pretrain_model_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case) if args.fp16: model.half() model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) # optimizer = optimizer_wenpeng.Adagrad(optimizer_grouped_parameters, lr=1e-3) global_step = 0 nb_tr_steps = 0 tr_loss = 0 max_test_acc = 0.0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode) eval_examples, eval_label_list, eval_coord_list = processor.get_examples_SF_wenpeng( '/home/wyin3/Datasets/MNLI-SNLI-SciTail-RTE-SICK/BBN.SF.test.txt') eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) # print('train all_label_ids:', all_label_ids) # exit(0) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) iter_co = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): model.train() batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # print('train label_ids:', label_ids.view(-1)) # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == 
"regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 iter_co += 1 if iter_co % 100 == 0: ''' start evaluate on test set after this epoch ''' model.eval() logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor( [f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct( logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) mean_f1, weighted_f1 = evaluate_SF(preds, eval_label_list, eval_coord_list) # result = compute_metrics('F1', preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None test_acc = mean_f1 #result.get("f1") if test_acc > max_test_acc: max_test_acc = test_acc print('\ntest F1 & weighted-F1:', test_acc, weighted_f1, ' max_test_f1:', max_test_acc, '\n')
def create_optimizer(net, name, learning_rate, weight_decay, momentum=0, apex_opt_level=None, optimizer_state=None, device=None, no_bn_wd=False, local_rank=None, sync_bn=False): net.float() if local_rank is not None: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs print('torch.distributed.init_process_group') torch.distributed.init_process_group(backend='nccl') device = torch.device('cuda', local_rank) else: device = choose_device(device) if sync_bn: import apex print("using synced BN") net = apex.parallel.convert_syncbn_model(net) print('use', device) net = net.to(device) # optimizer parameters = [p for p in net.parameters() if p.requires_grad] if no_bn_wd: parameters = bnwd_optim_params(net, parameters) if name == 'sgd': optimizer = torch.optim.SGD(parameters, lr=learning_rate, momentum=momentum, weight_decay=weight_decay) elif name == 'adam': optimizer = torch.optim.Adam(parameters, lr=learning_rate, weight_decay=weight_decay) elif name == 'adamw': from .optim.adam import Adam optimizer = Adam(parameters, lr=learning_rate, weight_decay=weight_decay) elif name == 'fused_adam': from apex.optimizers import FusedAdam optimizer = FusedAdam(parameters, lr=learning_rate, weight_decay=weight_decay) elif name == 'adabound': from trainer.optim.adabound import AdaBound optimizer = AdaBound(parameters, lr=learning_rate, weight_decay=weight_decay) elif name == 'adaboundw': from trainer.optim.adabound import AdaBoundW optimizer = AdaBoundW(parameters, lr=learning_rate, weight_decay=weight_decay) else: raise NotImplementedError(name) if apex_opt_level: from apex import amp net, optimizer = amp.initialize(net, optimizer, opt_level=apex_opt_level) def backward(loss): with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() optimizer.backward = backward else: optimizer.backward = lambda loss: loss.backward() if optimizer_state: # if use_fp16 and 'optimizer_state_dict' not in optimizer_state: # # resume FP16_Optimizer.optimizer only # optimizer.optimizer.load_state_dict(optimizer_state) # elif not use_fp16 and 'optimizer_state_dict' in optimizer_state: # # resume optimizer from FP16_Optimizer.optimizer # optimizer.load_state_dict(optimizer_state['optimizer_state_dict']) # else: optimizer.load_state_dict(optimizer_state) if device.type == 'cuda': torch.backends.cudnn.benchmark = True if local_rank is not None: if apex_opt_level: import apex net = apex.parallel.DistributedDataParallel( net, delay_allreduce=True) else: net = torch.nn.parallel.DistributedDataParallel( net, device_ids=[local_rank], output_device=local_rank) else: net = torch.nn.DataParallel(net) return net, optimizer
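# A hypothetical call to create_optimizer, using a plain torchvision ResNet as the network.
# The model, hyperparameters and single-process setup below are illustrative stand-ins, not
# values taken from this repository; only the keyword names follow the signature above.
import torch
import torchvision

net = torchvision.models.resnet18(num_classes=10)
net, optimizer = create_optimizer(
    net,
    name='sgd',               # one of: sgd, adam, adamw, fused_adam, adabound, adaboundw
    learning_rate=0.1,
    weight_decay=1e-4,
    momentum=0.9,
    apex_opt_level=None,      # e.g. 'O1' would route backward through apex amp
    optimizer_state=None,
    device=None,              # resolved by choose_device(None)
    no_bn_wd=False,
    local_rank=None,          # single process, no DistributedDataParallel
)
# create_optimizer attaches a .backward(loss) method, so the training loop stays the same
# whether or not apex mixed precision is enabled:
loss = net(torch.randn(2, 3, 224, 224).to(next(net.parameters()).device)).sum()
optimizer.backward(loss)
optimizer.step()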
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) # Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--deepscale', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument("--model_file", type=str, default="0", help="Path to the Pretrained BERT Encoder File.") parser.add_argument('--random', default=False, action='store_true', help="Whether to finetune from a random initialization") parser.add_argument('--focal', default=False, action='store_true', help="Whether to use Focal Loss for finetuning.") parser.add_argument('--gamma', type=float, default=0.5, help="Gamma parameter to be used in focal loss.") parser.add_argument('--deepspeed_sparse_attention', default=False, action='store_true', help='Use DeepSpeed sparse self attention.') parser.add_argument('--deepspeed_transformer_kernel', default=False, action='store_true', help='Use DeepSpeed transformer kernel to accelerate.') parser.add_argument( '--progressive_layer_drop', default=False, action='store_true', help="Whether to enable progressive layer dropping or not") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, } output_modes = { "cola": "classification", "mnli": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps args.seed = random.randint(1, 1000) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if (torch.distributed.get_rank() == 0): # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: # raise ValueError( # "Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) torch.distributed.barrier() task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) bert_base_model_config = { "vocab_size_or_config_json_file": 119547, "hidden_size": 768, "num_hidden_layers": 12, "num_attention_heads": 12, "intermediate_size": 3072, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "attention_probs_dropout_prob": 0.1, "max_position_embeddings": 512, "type_vocab_size": 2, "initializer_range": 0.02 } if args.progressive_layer_drop: print("BertBaseConfigPreLnLayerDrop") from nvidia.modelingpreln_layerdrop import BertForSequenceClassification, BertConfig else: from nvidia.modelingpreln import BertForSequenceClassification, BertConfig bert_config = BertConfig(**bert_base_model_config) bert_config.vocab_size = len(tokenizer.vocab) # Padding for divisibility by 8 if bert_config.vocab_size % 8 != 0: bert_config.vocab_size += 8 - (bert_config.vocab_size % 8) model = BertForSequenceClassification(args, bert_config, num_labels=num_labels) if args.model_file != "0": logger.info(f"Loading Pretrained Bert Encoder from: {args.model_file}") # bert_state_dict = torch.load(args.model_file) # model.bert.load_state_dict(bert_state_dict) checkpoint_state_dict = torch.load(args.model_file, map_location=torch.device("cpu")) if 'module' in checkpoint_state_dict: logger.info('Loading DeepSpeed v2.0 style checkpoint') model.load_state_dict(checkpoint_state_dict['module'], strict=False) elif 'model_state_dict' in checkpoint_state_dict: model.load_state_dict(checkpoint_state_dict['model_state_dict'], strict=False) else: raise ValueError("Unable to find model state in checkpoint") logger.info(f"Pretrained Bert Encoder Loaded from: {args.model_file}") if args.random: logger.info("USING RANDOM INITIALISATION FOR FINETUNING") model.apply(model.init_bert_weights) if args.fp16:
model.half() model.to(device) if args.local_rank != -1: try: if args.deepscale: print("Enabling DeepScale") from deepscale.distributed_apex import DistributedDataParallel as DDP else: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] if args.deepspeed_transformer_kernel: no_decay = no_decay + [ 'attn_nw', 'attn_nb', 'norm_w', 'norm_b', 'attn_qkvb', 'attn_ob', 'inter_b', 'output_b' ] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": if args.fp16: all_label_ids = torch.tensor( [f.label_id for f in train_features], dtype=torch.half) else: all_label_ids = torch.tensor( [f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": if args.focal: loss_fct = FocalLoss(class_num=num_labels, gamma=args.gamma) else: loss_fct = CrossEntropyLoss() loss = 
loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.deepscale and args.local_rank != -1: model.disable_need_reduction() if (step + 1) % args.gradient_accumulation_steps == 0: model.enable_need_reduction() if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * \ warmup_linear( global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if output_mode == "classification": if args.focal: loss_fct = FocalLoss(class_num=num_labels, gamma=args.gamma) else: loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() print(logits.type()) print(label_ids.type()) if task_name == "sts-b": tmp_eval_loss = loss_fct(logits.float().view(-1), label_ids.view(-1)) else: tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) result = 
compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # hack for MNLI-MM if task_name == "mnli": task_name = "mnli-mm" processor = processors[task_name]() if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train: raise ValueError( "Output directory ({}{}) already exists and is not empty.". format(args.output_dir, '-MM')) if not os.path.exists(args.output_dir + '-MM'): os.makedirs(args.output_dir + '-MM') eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) if args.focal: loss_fct = FocalLoss(class_num=num_labels, gamma=args.gamma) else: loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
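# The evaluation blocks above reduce the collected predictions with
# compute_metrics(task_name, preds, labels), which lives elsewhere in the codebase and
# returns accuracy, F1, Matthews correlation or Pearson/Spearman depending on the task.
# As a rough sketch of the simplest case (plain classification accuracy; not the actual
# helper), one could write:
import numpy as np

def simple_accuracy(preds, labels):
    # preds and labels are 1-D numpy arrays of class ids with equal length.
    return (preds == labels).mean()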
def main(): opt = parse_args() opt.sever_name = gethostname() # --- CUDA setting --- os.environ["CUDA_VISIBLE_DEVICES"] = opt.gpu_ids device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # --- Random Seed setting --- if opt.random_seed is None: opt.random_seed = random.randint(1, 10000) random.seed(opt.random_seed) np.random.seed(opt.random_seed) torch.manual_seed(opt.random_seed) if torch.cuda.is_available(): torch.cuda.manual_seed_all(opt.random_seed) cudnn.deterministic = True cudnn.benchmark = opt.cudnn_benchmark # --- PATH setting --- save_result_dir = Path(__file__).parent / "results" / opt.experiment_name opt.save_model_dir = str(save_result_dir / "trained_models") opt.save_log_path = str(save_result_dir / "train.log") mkdirs(opt.save_model_dir) # --- Prepare DataLoader --- train_loader, valid_loader = get_train_val_loader(opt) opt.src_vocab_size = train_loader.dataset.src_vocab_size opt.tgt_vocab_size = train_loader.dataset.tgt_vocab_size # --- Prepare Model --- model = MultimodalTransformer( src_vocab_size=opt.src_vocab_size, tgt_vocab_size=opt.tgt_vocab_size, max_position_num=opt.max_position_num, d_model=opt.d_model, head_num=opt.head_num, d_k=opt.d_k, d_v=opt.d_v, d_inner=opt.d_inner, layer_num=opt.layer_num, dropout=opt.dropout, cnn_fine_tuning=opt.cnn_fine_tuning, shared_embedding=opt.shared_embedding, share_dec_input_output_embed=opt.share_dec_input_output_embed, init_weight=opt.init_weight, fused_layer_norm=opt.use_fused, ).to(device) # --- Prepare optimizer and scaler --- if opt.use_fused: from apex.optimizers import FusedAdam as Adam else: from torch.optim import Adam optimizer = Adam(filter(lambda x: x.requires_grad, model.parameters()), betas=(0.9, 0.98), eps=1e-09, weight_decay=opt.weight_decay) scaler = GradScaler(init_scale=65536.0, enabled=opt.use_amp) # --- Restart setting --- start_cnt = 1 steps_cnt = 0 if opt.adapt_prop_MNMT is not None: ex_name, epoch_cnt = opt.adapt_prop_MNMT.split(',') saved_path = f"{pardir}/results/{ex_name}/trained_models/epoch_{epoch_cnt}.pth" saved_dict = torch.load(saved_path, map_location=lambda storage, loc: storage) init_dir, init_epoch = saved_dict["settings"].MNMT.split(',') init_path = f"{pardir}/MNMT/results/{init_dir}/trained_models/epoch_{init_epoch}.pth" init_data = torch.load(init_path, map_location=lambda storage, loc: storage) check_arguments(init_data["settings"], opt) model.load_state_dict(saved_dict["models"]["MNMT"]) print(f"[Info]Loading complete ({saved_path})") elif opt.adapt_init_MNMT is not None: ex_name, epoch_cnt = opt.adapt_init_MNMT.split(',') saved_path = f"{Path(__file__).parent}/results/{ex_name}/trained_models/epoch_{epoch_cnt}.pth" saved_dict = torch.load(saved_path, map_location=lambda storage, loc: storage) check_arguments(saved_dict["settings"], opt) model.load_state_dict(saved_dict["model"]) print(f"[Info]Loading complete ({saved_path})") if opt.restart is not None: start_cnt = opt.restart + 1 if opt.restart < 500: model_name = f"epoch_{opt.restart}.pth" else: model_name = f"step_{opt.restart}.pth" saved_path = f"{opt.save_model_dir}/{model_name}" saved_dict = torch.load(saved_path, map_location=lambda storage, loc: storage) check_arguments(saved_dict["settings"], opt) model.load_state_dict(saved_dict["model"]) optimizer.load_state_dict(saved_dict["optimizer"]) scaler.load_state_dict(saved_dict["scaler"]) steps_cnt = saved_dict["steps_cnt"] print(f"[Info]Loading complete ({saved_path})") scheduler = Scheduler( optimizer=optimizer, init_lr=0., end_lr=opt.end_lr, 
warmup_steps=opt.warmup_steps, current_steps=steps_cnt, ) # --- DataParallel setting --- gpus = [i for i in range(len(opt.gpu_ids.split(',')))] if len(gpus) > 1: model = nn.DataParallel(model, device_ids=gpus) # --- Prepare trainer and validator --- if valid_loader is not None: validator = ScoreCalculator( model=model, data_loader=valid_loader, references=valid_loader.dataset.tgt_insts, bpe=opt.bpe, cp_avg_num=opt.check_point_average, ) else: validator = None trainer = MNMTTrainer( model=model, train_loader=train_loader, optimizer=optimizer, scaler=scaler, scheduler=scheduler, opt=opt, validator=validator, ) # -- Train -- if opt.max_epoch is not None: trainer.train_by_epoch(start_cnt) else: trainer.train_by_step(start_cnt)
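# The GradScaler constructed above implies that MNMTTrainer runs its forward/backward
# passes under torch.cuda.amp autocast and scales the loss before each optimizer step.
# The loop below is a self-contained sketch of that pattern with a toy stand-in model;
# it illustrates the GradScaler protocol only and is not the trainer's actual code.
import torch
from torch import nn
from torch.cuda.amp import GradScaler, autocast

use_amp = torch.cuda.is_available()
device = torch.device("cuda" if use_amp else "cpu")
toy_model = nn.Linear(16, 4).to(device)                  # stand-in for MultimodalTransformer
toy_optimizer = torch.optim.Adam(toy_model.parameters(), lr=1e-3)
toy_scaler = GradScaler(enabled=use_amp)

for _ in range(10):                                      # stand-in for the train_loader loop
    src = torch.randn(8, 16, device=device)
    tgt = torch.randint(0, 4, (8,), device=device)
    toy_optimizer.zero_grad()
    with autocast(enabled=use_amp):
        loss = nn.functional.cross_entropy(toy_model(src), tgt)
    toy_scaler.scale(loss).backward()                    # backward on the scaled loss
    toy_scaler.step(toy_optimizer)                       # unscales grads, skips step on inf/nan
    toy_scaler.update()                                  # adjusts the loss scale factor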
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=64, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=8.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--train_limit', type=int, default=10000000, help="Train Samples Limit") args = parser.parse_args() processors = { "obqa": OBQAProcessor, "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "snli": SnliProcessor, "qnli": QnliProcessor, "bow" : BoWProcessor } num_labels_task = { "obqa": 2, "cola": 2, "mnli": 3, "mrpc": 2, "snli": 3, "qnli": 2, "bow" : 2 } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir,args.train_limit) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) test_examples = processor.get_test_examples(args.data_dir) dev_examples = processor.get_dev_examples(args.data_dir) # Prepare model BertForCNN3 # model = BertForSequenceClassification.from_pretrained(args.bert_model, # cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), # num_labels = num_labels) # model = BertForCNN.from_pretrained(args.bert_model, # cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), # num_labels = num_labels) # model = BertForCNN3.from_pretrained(args.bert_model, # cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), # num_labels = num_labels) model = BertForCNN4.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_labels = num_labels) # BertForCNN if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: 
model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 nb_tr_steps = 0 tr_loss = 0 result = {} if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) test_features = convert_examples_to_features( test_examples, label_list, args.max_seq_length, tokenizer) dev_features = convert_examples_to_features( dev_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) train_dataloader = get_dataloader(train_features,args.train_batch_size) test_dataloader = get_dataloader(test_features,args.train_batch_size) dev_dataloader = get_dataloader(dev_features,args.train_batch_size) ep = 0 output_model_file = "dummy" loss_fct = CrossEntropyLoss() max_acc = -1 for _ in trange(int(args.num_train_epochs), desc="Epoch"): model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 ep += 1 tqi = tqdm(train_dataloader, desc="Iteration") acc = 0 for step, batch in enumerate(tqi): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch logits = model(input_ids, segment_ids, input_mask) loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_accuracy = accuracy(logits, label_ids) acc += tmp_accuracy if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear(global_step/(t_total), args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 tqi.set_description("Loss:"+str(tr_loss/nb_tr_steps)+",Acc:"+str(acc/nb_tr_examples)) model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin." 
+ str(ep)) torch.save(model_to_save.state_dict(), output_model_file) model.eval() names = ["Val","Test"] for index,eval_dataloader in enumerate([dev_dataloader,test_dataloader]): print("Validating! "+ names[index]) eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 tqe = tqdm(eval_dataloader, desc="Evaluating") for input_ids, input_mask, segment_ids, label_ids in tqe: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 tqe.set_description(names[index]+" Loss:"+str(eval_loss/nb_eval_steps)+",Acc:"+str(eval_accuracy/nb_eval_examples)) eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss/nb_tr_steps if args.do_train else None eps =str(ep)+"_"+names[index] result['eval_loss_'+eps]= eval_loss result['eval_accuracy_'+eps]= eval_accuracy result['global_step_'+eps]= global_step result['loss_'+eps]= loss print(result) if names[index] == "Val": max_acc = max(max_acc,eval_accuracy) if eval_accuracy == max_acc: print("Storing Current Best Model") model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "best_model.bin") torch.save(model_to_save.state_dict(), output_model_file) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
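# Both the training and evaluation loops above call an accuracy(logits, label_ids) helper
# defined elsewhere in the script. Since the running totals are later divided by the number
# of examples, the helper presumably returns a count of correct predictions; a plausible
# sketch (not the script's own code) is:
import numpy as np

def accuracy(out, labels):
    # out: per-class logits for a batch; labels: gold class ids.
    # Returns the number of positions where the argmax prediction matches the label.
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)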
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}  # parameters whose name contains any entry of no_decay
]
if args.fp16:
    try:
        from apex.optimizers import FP16_Optimizer
        from apex.optimizers import FusedAdam
    except ImportError:
        raise ImportError(
            "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
        )
    optimizer = FusedAdam(optimizer_grouped_parameters,
                          lr=args.learn_rate,
                          bias_correction=False,
                          max_grad_norm=1.0)
    if args.loss_scale == 0:
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    else:
        optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                         t_total=total_train_steps)
else:
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learn_rate,
                         warmup=args.warmup_proportion,
                         t_total=total_train_steps)
########################################################################################################################
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--train_corpus", default=None, type=str, required=True, help="The input train corpus.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." ) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir) and ( args.local_rank == -1 or torch.distributed.get_rank() == 0): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_corpus) train_dataset = BERTDataset(args.train_corpus, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.do_train: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=num_train_optimization_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on next(file) # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch outputs = model(input_ids, segment_ids, input_mask, labels=lm_label_ids, next_sentence_label=is_next) loss = outputs[0] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule optimizer.zero_grad() global_step += 1 # Save a trained model if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir)
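# The non-fp16 branch above drives AdamW with get_linear_schedule_with_warmup. A
# short, hedged sketch of that pairing, assuming a recent `transformers` release
# provides the scheduler with this signature (the helper name is hypothetical):
import torch
from transformers import get_linear_schedule_with_warmup

def build_optimizer_and_scheduler(model, learning_rate=5e-5, adam_epsilon=1e-8,
                                  warmup_steps=0, num_training_steps=1000):
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate,
                                  eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_training_steps)
    return optimizer, scheduler

# Per accumulated step: loss.backward(); optimizer.step(); scheduler.step(); optimizer.zero_grad()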
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
if args.fp16:
    try:
        from apex.optimizers import FP16_Optimizer
        from apex.optimizers import FusedAdam
    except ImportError:
        raise ImportError(
            "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
    optimizer = FusedAdam(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          bias_correction=False,
                          max_grad_norm=1.0)
    if args.loss_scale == 0:
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    else:
        optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                         t_total=num_train_optimization_steps)
else:
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_optimization_steps)
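# The same bias/LayerNorm weight-decay grouping recurs in most scripts in this file.
# A small reusable helper (hypothetical name) that builds the two parameter groups:
def group_parameters_for_weight_decay(model, weight_decay=0.01):
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    named = list(model.named_parameters())
    return [
        {'params': [p for n, p in named if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in named if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

# e.g. optimizer = BertAdam(group_parameters_for_weight_decay(model), lr=args.learning_rate, ...)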
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--max_pron_length", default=5, type=int, help="The maximum total input sequence length after pronounciation tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--pron_emb_size", default=16, type=int, help="The embedding size of pronounciation embedding.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_pron", action='store_true', help='Whether to use pronunciation as features.') parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if "h**o" in args.data_dir: mark = "h**o-" elif "hete" in args.data_dir: mark = "hete-" score_file = "scores/" + mark + '/' if not os.path.isdir(score_file): os.mkdir(score_file) args.output_dir = score_file + args.output_dir if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None all_examples = processor.get_train_examples(args.data_dir) all_examples = np.array(all_examples) kf = KFold(n_splits=10) kf.get_n_splits(all_examples) cv_index = -1 for train_index, test_index in kf.split(all_examples): cv_index += 1 train_examples = list(all_examples[train_index]) eval_examples = list(all_examples[test_index]) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) model = BertForSequencePronsClassification_v3.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels, max_seq_length=args.max_seq_length, max_prons_length=args.max_pron_length, pron_emb_size=args.pron_emb_size, do_pron=args.do_pron, device=device) if args.fp16: 
model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 prons_map = {} prons_map, prons_emb = embed_load('./data/pron.'+str(args.pron_emb_size)+'.vec') train_features, prons_map = convert_examples_to_pron_SC_features( train_examples, label_list, args.max_seq_length, args.max_pron_length, tokenizer, prons_map) eval_features, prons_map = convert_examples_to_pron_SC_features( eval_examples, label_list, args.max_seq_length, args.max_pron_length, tokenizer, prons_map) print(prons_map) prons_emb = embed_extend(prons_emb, len(prons_map)) prons_emb = torch.tensor(prons_emb, dtype=torch.float) prons_embedding = torch.nn.Embedding.from_pretrained(prons_emb) prons_embedding.weight.requires_grad=False logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_prons_ids = torch.tensor([f.prons_id for f in train_features], dtype=torch.long) all_prons_att_mask = torch.tensor([f.prons_att_mask for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_prons_ids, all_prons_att_mask) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids 
= torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_prons_ids = torch.tensor([f.prons_id for f in eval_features], dtype=torch.long) all_prons_att_mask = torch.tensor([f.prons_att_mask for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_prons_ids, all_prons_att_mask) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.train() best_score = 0 label_map = {i : label for i, label in enumerate(label_list)} logger.info("cv: {}".format(cv_index)) for index in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 y_true, y_pred = [], [] for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, prons_ids, prons_att_mask = batch prons_emb = prons_embedding(prons_ids.detach().cpu()).to(device) if not args.do_pron: prons_emb = None loss,logits,att = model(input_ids, segment_ids, input_mask, prons_emb, prons_att_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 logits = torch.argmax(F.log_softmax(logits,dim=-1),dim=-1) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() y_pred.extend(logits) y_true.extend(label_ids) f1_score, recall_score, precision_score = f1_2d(y_true, y_pred) logger.info("precision, recall, f1: {}, {}, {}".format(precision_score, recall_score, f1_score)) print("loss: {}".format(tr_loss/nb_tr_examples)) y_pred, y_true = [], [] logger.info("Evaluating...") for input_ids, input_mask, segment_ids, label_ids, prons_ids, prons_att_mask in tqdm(eval_dataloader, desc="Evaluating"): prons_emb = prons_embedding(prons_ids).to(device) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) prons_ids = prons_ids.to(device) prons_att_mask = prons_att_mask.to(device) if not args.do_pron: prons_emb = None with torch.no_grad(): if args.do_pron: logits, att = model(input_ids, segment_ids, input_mask, prons_emb, prons_att_mask) else: logits = model(input_ids, segment_ids, input_mask, prons_emb, prons_att_mask) logits = torch.argmax(F.log_softmax(logits,dim=-1),dim=-1) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() y_pred.extend(logits) y_true.extend(label_ids) f1_score, recall_score, precision_score = f1_2d(y_true, y_pred) logger.info("precision, recall, f1: {}, {}, {}".format(precision_score, recall_score, f1_score)) if f1_score > best_score: best_score = f1_score write_scores(score_file + 'true_'+str(cv_index), y_true) write_scores(score_file + 'pred_'+str(cv_index), y_pred) # Save a trained model and the associated configuration model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME + '_' + str(cv_index)) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) 
with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) label_map = {i : label for i, label in enumerate(label_list)} model_config = {"bert_model":args.bert_model,"do_lower":args.do_lower_case,"max_seq_length":args.max_seq_length,"num_labels":len(label_list),"label_map":label_map} json.dump(model_config,open(os.path.join(args.output_dir,"model_config.json"),"w"))
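# The cross-validation loop above draws ten train/eval splits with scikit-learn's
# KFold. A standalone sketch of that splitting step (the example strings are
# placeholders, not real data):
import numpy as np
from sklearn.model_selection import KFold

all_examples = np.array(["example_{}".format(i) for i in range(100)])
kf = KFold(n_splits=10)
for cv_index, (train_index, test_index) in enumerate(kf.split(all_examples)):
    train_examples = list(all_examples[train_index])
    eval_examples = list(all_examples[test_index])
    # ... build features, train a fresh model on train_examples,
    #     evaluate on eval_examples, and keep per-fold scores ...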
def get_optimizer(optimizer_name: str, parameters, learning_rate: float,
                  weight_decay=1e-5, eps=1e-5, **kwargs) -> Optimizer:
    from torch.optim import SGD, Adam, RMSprop, AdamW
    from torch_optimizer import RAdam, Lamb, DiffGrad, NovoGrad, Ranger

    if optimizer_name.lower() == "sgd":
        return SGD(parameters, learning_rate, momentum=0.9, nesterov=True,
                   weight_decay=weight_decay, **kwargs)
    if optimizer_name.lower() == "adam":
        return Adam(parameters, learning_rate, weight_decay=weight_decay,
                    eps=eps, **kwargs)  # As Jeremy suggests
    if optimizer_name.lower() == "rms":
        return RMSprop(parameters, learning_rate, weight_decay=weight_decay, **kwargs)
    if optimizer_name.lower() == "adamw":
        return AdamW(parameters, learning_rate, weight_decay=weight_decay,
                     eps=eps, **kwargs)
    if optimizer_name.lower() == "radam":
        return RAdam(parameters, learning_rate, weight_decay=weight_decay,
                     eps=eps, **kwargs)  # As Jeremy suggests

    # Optimizers from torch-optimizer
    if optimizer_name.lower() == "ranger":
        return Ranger(parameters, learning_rate, eps=eps,
                      weight_decay=weight_decay, **kwargs)
    if optimizer_name.lower() == "lamb":
        return Lamb(parameters, learning_rate, eps=eps,
                    weight_decay=weight_decay, **kwargs)
    if optimizer_name.lower() == "diffgrad":
        return DiffGrad(parameters, learning_rate, eps=eps,
                        weight_decay=weight_decay, **kwargs)
    if optimizer_name.lower() == "novograd":
        return NovoGrad(parameters, learning_rate, eps=eps,
                        weight_decay=weight_decay, **kwargs)

    # Optimizers from Apex (Fused version is faster on GPU with tensor cores)
    if optimizer_name.lower() == "fused_lamb":
        from apex.optimizers import FusedLAMB
        return FusedLAMB(parameters, learning_rate, eps=eps,
                         weight_decay=weight_decay, **kwargs)
    if optimizer_name.lower() == "fused_sgd":
        from apex.optimizers import FusedSGD
        return FusedSGD(parameters, learning_rate, momentum=0.9, nesterov=True,
                        weight_decay=weight_decay, **kwargs)
    if optimizer_name.lower() == "fused_adam":
        from apex.optimizers import FusedAdam
        return FusedAdam(parameters, learning_rate, eps=eps,
                         weight_decay=weight_decay, adam_w_mode=True, **kwargs)

    raise ValueError("Unsupported optimizer name " + optimizer_name)
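# Illustrative call to get_optimizer above. Note that the function imports
# torch_optimizer unconditionally, so that package must be installed even when a
# plain torch optimizer such as "adamw" is requested; the model here is a dummy.
import torch.nn as nn

model = nn.Linear(10, 2)
optimizer = get_optimizer("adamw", model.parameters(),
                          learning_rate=3e-4, weight_decay=1e-4)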
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=64, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=256, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() # if args.server_ip and args.server_port: # # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script # import ptvsd # print("Waiting for debugger attach") # ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) # ptvsd.wait_for_attach() processors = { # "cola": ColaProcessor, # "mnli": MnliProcessor, # "mnli-mm": MnliMismatchedProcessor, # "mrpc": MrpcProcessor, # "sst-2": Sst2Processor, # "sts-b": StsbProcessor, # "qqp": QqpProcessor, # "qnli": QnliProcessor, "rte": RteProcessor # "wnli": WnliProcessor, } output_modes = { # "cola": "classification", # "mnli": "classification", # "mrpc": "classification", # "sst-2": "classification", # "sts-b": "regression", # "qqp": "classification", # "qnli": "classification", "rte": "classification" # "wnli": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() #[0,1] num_labels = len(label_list) train_examples = None num_train_optimization_steps = None if args.do_train: # train_examples = processor.get_train_examples_wenpeng('/home/wyin3/Datasets/glue_data/RTE/train.tsv') train_examples, seen_types = processor.get_examples_Yahoo_train( '/export/home/Dataset/YahooClassification/yahoo_answers_csv/zero-shot-split/train_pu_half_v1.txt', 130000) #train_pu_half_v1.txt # seen_classes=[0,2,4,6,8] num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_TRANSFORMERS_CACHE), 'distributed_{}'.format( args.local_rank)) pretrain_model_dir = 'bert-base-uncased' #FineTuneOnCombined'# FineTuneOnMNLI model = BertForSequenceClassification.from_pretrained( pretrain_model_dir, num_labels=num_labels) tokenizer = 
BertTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case) if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) global_step = 0 nb_tr_steps = 0 tr_loss = 0 max_test_unseen_acc = 0.0 max_dev_unseen_acc = 0.0 max_dev_seen_acc = 0.0 max_overall_acc = 0.0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode) '''load dev set''' eval_examples, eval_label_list, eval_hypo_seen_str_indicator, eval_hypo_2_type_index = processor.get_examples_Yahoo_test( '/export/home/Dataset/YahooClassification/yahoo_answers_csv/zero-shot-split/dev.txt', seen_types) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) eval_all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) eval_all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) eval_all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) eval_all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(eval_all_input_ids, eval_all_input_mask, eval_all_segment_ids, eval_all_label_ids) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) '''load test set''' test_examples, test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index = processor.get_examples_Yahoo_test( '/export/home/Dataset/YahooClassification/yahoo_answers_csv/zero-shot-split/test.txt', seen_types) test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer, output_mode) test_all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) test_all_input_mask = torch.tensor( [f.input_mask for f in test_features], dtype=torch.long) test_all_segment_ids = torch.tensor( [f.segment_ids for f in test_features], dtype=torch.long) test_all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long) test_data = TensorDataset(test_all_input_ids, test_all_input_mask, test_all_segment_ids, test_all_label_ids) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = 
%d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) # print('train all_label_ids:', all_label_ids) # exit(0) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) iter_co = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): model.train() batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch logits = model(input_ids, segment_ids, input_mask, labels=None) loss_fct = CrossEntropyLoss() loss = loss_fct(logits[0].view(-1, num_labels), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 optimizer.step() optimizer.zero_grad() global_step += 1 iter_co += 1 if iter_co % 400 == 0: ''' start evaluate on dev set after this epoch ''' model.eval() logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0 nb_eval_steps = 0 preds = [] print('Evaluating...') for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) logits = logits[0] loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] ''' preds: size*2 (entail, not_entail) wenpeng added a softxmax so that each row is a prob vec ''' pred_probs = softmax(preds, axis=1)[:, 0] pred_binary_labels_harsh = [] pred_binary_labels_loose = [] for i in range(preds.shape[0]): if preds[i][0] > preds[i][1] + 0.1: pred_binary_labels_harsh.append(0) else: pred_binary_labels_harsh.append(1) if preds[i][0] > preds[i][1]: pred_binary_labels_loose.append(0) else: pred_binary_labels_loose.append(1) seen_acc, unseen_acc = evaluate_Yahoo_zeroshot_SinglePhasePred( pred_probs, pred_binary_labels_harsh, pred_binary_labels_loose, eval_label_list, eval_hypo_seen_str_indicator, eval_hypo_2_type_index, seen_types) # result = compute_metrics('F1', preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None # test_acc = mean_f1#result.get("f1") if unseen_acc > max_dev_unseen_acc: max_dev_unseen_acc = unseen_acc print('\ndev seen_acc & acc_unseen:', seen_acc, unseen_acc, ' 
max_dev_unseen_acc:', max_dev_unseen_acc, '\n') ''' start evaluate on test set after this epoch ''' model.eval() logger.info("***** Running testing *****") logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.eval_batch_size) test_loss = 0 nb_test_steps = 0 preds = [] print('Testing...') for input_ids, input_mask, segment_ids, label_ids in test_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) logits = logits[0] if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) # eval_loss = eval_loss / nb_eval_steps preds = preds[0] pred_probs = softmax(preds, axis=1)[:, 0] pred_binary_labels_harsh = [] pred_binary_labels_loose = [] for i in range(preds.shape[0]): if preds[i][0] > preds[i][1] + 0.1: pred_binary_labels_harsh.append(0) else: pred_binary_labels_harsh.append(1) if preds[i][0] > preds[i][1]: pred_binary_labels_loose.append(0) else: pred_binary_labels_loose.append(1) seen_acc, unseen_acc = evaluate_Yahoo_zeroshot_SinglePhasePred( pred_probs, pred_binary_labels_harsh, pred_binary_labels_loose, test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index, seen_types) if unseen_acc > max_test_unseen_acc: max_test_unseen_acc = unseen_acc print('\n\n\t test seen_acc & acc_unseen:', seen_acc, unseen_acc, ' max_test_unseen_acc:', max_test_unseen_acc, '\n')
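# The evaluation above turns entailment logits (column 0 = entail, column 1 = not
# entail) into "harsh" and "loose" binary labels, where label 0 means entail. A
# hedged, vectorized sketch of that decision rule (the function name is hypothetical):
import numpy as np
from scipy.special import softmax

def entailment_decisions(preds, margin=0.1):
    pred_probs = softmax(preds, axis=1)[:, 0]                   # P(entail) per row
    harsh = np.where(preds[:, 0] > preds[:, 1] + margin, 0, 1)  # requires a margin
    loose = np.where(preds[:, 0] > preds[:, 1], 0, 1)           # simple comparison
    return pred_probs, harsh, loose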
def train(args): # load squad data for pre-training. args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) review_train_examples = np.load(os.path.join(args.review_data_dir, "data.npz")) squad_train_examples = np.load(os.path.join(args.squad_data_dir, "data.npz")) num_train_steps = args.num_train_steps # load bert pre-train data. review_train_data = TensorDataset( torch.from_numpy(review_train_examples["input_ids"]), torch.from_numpy(review_train_examples["segment_ids"]), torch.from_numpy(review_train_examples["input_mask"]), torch.from_numpy(review_train_examples["masked_lm_ids"]), torch.from_numpy(review_train_examples["next_sentence_labels"])) review_train_dataloader = DataLoader(review_train_data, sampler=RandomSampler(review_train_data), batch_size=args.train_batch_size, drop_last=True) squad_train_data = TensorDataset( torch.from_numpy(squad_train_examples["input_ids"]), torch.from_numpy(squad_train_examples["segment_ids"]), torch.from_numpy(squad_train_examples["input_mask"]), torch.from_numpy(squad_train_examples["start_positions"]), torch.from_numpy(squad_train_examples["end_positions"])) squad_train_dataloader = DataLoader(squad_train_data, sampler=RandomSampler(squad_train_data), batch_size=args.train_batch_size, drop_last=True) # we do not have any valiation for pretuning model = BertForMTPostTraining.from_pretrained(modelconfig.MODEL_ARCHIVE_MAP[args.bert_model]) if args.fp16: model.half() model.cuda() # Prepare optimizer param_optimizer = [(k, v) for k, v in model.named_parameters() if v.requires_grad == True] param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = num_train_steps if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 step = 0 batch_loss = 0. 
model.train() model.zero_grad() training = True review_iter = iter(review_train_dataloader) squad_iter = iter(squad_train_dataloader) while training: try: batch = next(review_iter) except: review_iter = iter(review_train_dataloader) batch = next(review_iter) batch = tuple(t.cuda() for t in batch) input_ids, segment_ids, input_mask, masked_lm_ids, next_sentence_labels = batch review_loss = model("review", input_ids.long(), segment_ids.long(), input_mask.long(), masked_lm_ids.long(), next_sentence_labels.long(), None, None) try: batch = next(squad_iter) except: squad_iter = iter(squad_train_dataloader) batch = next(squad_iter) batch = tuple(t.cuda() for t in batch) input_ids, segment_ids, input_mask, start_positions, end_positions = batch squad_loss = model("squad", input_ids.long(), segment_ids.long(), input_mask.long(), None, None, start_positions.long(), end_positions.long()) loss = review_loss + squad_loss if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps batch_loss += loss if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if global_step % 50 == 0: logging.info("step %d batch_loss %f ", global_step, batch_loss) batch_loss = 0. if global_step % args.save_checkpoints_steps == 0: model.float() torch.save(model.state_dict(), os.path.join(args.output_dir, "pytorch_model_" + str(global_step) + ".bin")) if args.fp16: model.half() if global_step >= num_train_steps: training = False break step += 1 model.float() torch.save(model.state_dict(), os.path.join(args.output_dir, "pytorch_model.bin"))
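# train() above keeps two dataloaders (review MLM/NSP and SQuAD) running for a fixed
# number of optimizer steps by re-creating an iterator whenever it is exhausted. A
# sketch of that pattern as a generator (the name is hypothetical; it assumes each
# dataloader yields at least one batch):
def cycling_batches(dataloader):
    it = iter(dataloader)
    while True:
        try:
            yield next(it)
        except StopIteration:
            it = iter(dataloader)
            yield next(it)

# Illustrative use:
# review_batches = cycling_batches(review_train_dataloader)
# squad_batches = cycling_batches(squad_train_dataloader)
# for _ in range(num_train_steps):
#     review_batch = next(review_batches)
#     squad_batch = next(squad_batches)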
def fit(model, training_iter, eval_iter, num_epoch, pbar, num_train_steps, verbose=1):
    # ------------------ Determine CUDA mode ----------------------
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()  # multi-GPU
        # n_gpu = 1
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16))
    # --------------------- Optimizer -------------------------
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    t_total = num_train_steps
    ## --------------------- GPU half precision (fp16) -----------------------------
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    ## ------------------------ GPU single precision (fp32) ---------------------------
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)
    # --------------------- Model initialization ----------------------
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)  # , device_ids=[0, 1, 2]
    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []
    history = {
        "train_loss": train_losses,
        "train_acc": train_accuracy,
        "eval_loss": eval_losses,
        "eval_acc": eval_accuracy
    }
    # ------------------------ Training ------------------------------
    best_f1 = 0
    start = time.time()
    global_step = 0
    for e in range(num_epoch):
        model.train()
        for step, batch in enumerate(training_iter):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, start_positions, end_positions, answer_types = batch
            start_logits, end_logits, answer_type_logits = model(input_ids, segment_ids, input_mask)
            train_loss = loss_fn(start_logits, end_logits, answer_type_logits,
                                 start_positions, end_positions, answer_types)
            if args.gradient_accumulation_steps > 1:
                train_loss = train_loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(train_loss)
            else:
                train_loss.backward()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = args.learning_rate * warmup_linear(
                    global_step / t_total, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
            start_logits, end_logits = start_logits.cpu(), end_logits.cpu()
            start_positions, end_positions = start_positions.cpu(), end_positions.cpu()
            train_acc, f1 = qa_evaluate((start_logits, end_logits),
                                        (start_positions, end_positions))
            pbar.show_process(train_acc, train_loss.item(), f1, time.time() - start, step)
        # ----------------------- Validation ----------------------------
        model.eval()
        count = 0
        y_predicts, y_labels = [], []
        eval_starts_predict, eval_ends_predict = [], []
        eval_starts_label, eval_ends_label = [], []
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            for step, batch in enumerate(eval_iter):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, start_positions, end_positions, answer_types = batch
                start_logits, end_logits, answer_type_logits = model(input_ids, segment_ids, input_mask)
                eval_los = loss_fn(start_logits, end_logits, answer_type_logits,
                                   start_positions, end_positions, answer_types)
                eval_loss = eval_los + eval_loss
                count += 1
                eval_starts_predict.append(start_logits)
                eval_ends_predict.append(end_logits)
                eval_starts_label.append(start_positions)
                eval_ends_label.append(end_positions)
            eval_starts_predicted = torch.cat(eval_starts_predict, dim=0).cpu()
            eval_ends_predicted = torch.cat(eval_ends_predict, dim=0).cpu()
            eval_starts_labeled = torch.cat(eval_starts_label, dim=0).cpu()
            eval_ends_labeled = torch.cat(eval_ends_label, dim=0).cpu()
            eval_predicted = (eval_starts_predicted, eval_ends_predicted)
            eval_labeled = (eval_starts_labeled, eval_ends_labeled)
            eval_acc, eval_f1 = qa_evaluate(eval_predicted, eval_labeled)
            logger.info(
                '\n\nEpoch %d - train_loss: %4f - eval_loss: %4f - train_acc:%4f - eval_acc:%4f - eval_f1:%4f\n'
                % (e + 1, train_loss.item(), eval_loss.item() / count, train_acc, eval_acc, eval_f1))
            # Save the best model
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                save_model(model, args.output_dir)
            if e % verbose == 0:
                train_losses.append(train_loss.item())
                train_accuracy.append(train_acc)
                eval_losses.append(eval_loss.item() / count)
                eval_accuracy.append(eval_acc)
                loss_acc_plot(history)
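# fit() above only persists the model when the validation F1 improves. A minimal
# tracker for that "keep the best checkpoint" logic (the class name is hypothetical;
# save_fn stands in for save_model(model, args.output_dir)):
class BestCheckpointTracker:
    def __init__(self):
        self.best_f1 = float("-inf")

    def update(self, eval_f1, save_fn):
        if eval_f1 > self.best_f1:
            self.best_f1 = eval_f1
            save_fn()
            return True
        return False

# Illustrative use:
# tracker = BestCheckpointTracker()
# tracker.update(eval_f1, lambda: save_model(model, args.output_dir))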