def test_model(self, config, test_data):
    """
    BERT classifier testing.

    :param config: dict with the test configuration; keys read here are
        'test_batch_size', 'local_rank', 'task_name', 'no_cuda',
        'output_dir', 'do_lower_case' and 'max_seq_length'.
    :param test_data: Dataset for BERT model testing, composed by the
        information and label of each item.
    :return: Predictions of the BERT model trained with the train_model
        method: argmax class indices for "classification" output mode,
        squeezed raw scores for "regression". Empty array if there are no
        test examples.
    """
    test_batch_size = config['test_batch_size']
    local_rank = config['local_rank']
    task_name = config['task_name']
    no_cuda = config['no_cuda']
    output_dir = config['output_dir']
    do_lower_case = config['do_lower_case']
    max_seq_length = config['max_seq_length']

    ### Task utils
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # BERT tokenizer, loaded from the fine-tuned model directory.
    tokenizer = BertTokenizer.from_pretrained(output_dir,
                                              do_lower_case=do_lower_case)

    # Use GPU when available unless explicitly disabled.
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    model = BertForSequenceClassification.from_pretrained(
        output_dir, num_labels=num_labels)
    model.to(device)
    if local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    test_features, id2label = convert_examples_to_features(
        test_data, label_list, max_seq_length, tokenizer, "classification")

    all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                   dtype=torch.long)
    # Keep a distinct name instead of rebinding the `test_data` parameter.
    test_dataset = TensorDataset(all_input_ids, all_input_mask,
                                 all_segment_ids)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=test_sampler,
                                 batch_size=test_batch_size)

    model.eval()
    # Collect per-batch logits and concatenate once at the end. The previous
    # np.append-per-batch accumulation copied the whole array on every step
    # (quadratic in the number of examples).
    batch_logits = []
    for input_ids, input_mask, segment_ids in test_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)
        batch_logits.append(logits.detach().cpu().numpy())

    if not batch_logits:
        # No test examples: return an empty prediction array instead of
        # failing with an IndexError as the old `preds[0]` access did.
        return np.array([])

    preds = np.concatenate(batch_logits, axis=0)
    if output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    elif output_mode == "regression":
        preds = np.squeeze(preds)
    return preds
def main():
    """Fine-tune BERT for SQuAD question answering and/or run predictions.

    Command-line driver: parses arguments, configures (optionally
    distributed) devices and random seeds, trains
    ``BertForQuestionAnswering`` on ``--train_file``, saves the fine-tuned
    weights into ``--output_dir`` and, with ``--do_predict``, writes
    SQuAD-style ``predictions.json`` / ``nbest_predictions.json``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model",
        default='bert-base-uncased',
        type=str,
        required=False,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default='data/run_squad/',
        type=str,
        required=False,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    ## Other parameters
    parser.add_argument("--train_file",
                        default='/Users/liuhui/data/squad/dev-v1.1.json',
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default='/Users/liuhui/data/squad/dev-v1.1.json',
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument(
        "--max_seq_length",
        default=64,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    # NOTE(review): default=True together with action='store_true' means this
    # flag can never be turned off from the command line — confirm intended.
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        # '%%' (not '%') is required: argparse %-formats help strings, and a
        # bare '%' makes `--help` raise ValueError.
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
        "of training.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    # NOTE(review): default=True + store_true — CUDA can never be enabled via
    # the CLI; confirm intended.
    parser.add_argument("--no_cuda",
                        default=True,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    # Device setup: single-process (CPU or all visible GPUs) vs. one GPU per
    # distributed process.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The effective per-step batch is the requested batch divided over the
    # accumulation steps.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    # Seed every RNG in play for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        # Include the offending path (the old message had an empty "()").
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_squad_examples(input_file=args.train_file,
                                             is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank))
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = num_train_steps
    if args.local_rank != -1:
        # Each distributed worker only takes its share of the steps.
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    if args.do_train:
        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
            args.bert_model, str(args.max_seq_length), str(args.doc_stride),
            str(args.max_query_length))
        train_features = None
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except Exception:
            # Cache missing or unreadable — rebuild features from scratch.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info(" Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
        logger.info("***** Running training *****")
        logger.info(" Num orig examples = %d", len(train_examples))
        logger.info(" Num split examples = %d", len(train_features))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions,
                                   all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does scattering it-self
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model it-self
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model, state_dict=model_state_dict)
    model.to(device)

    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(input_file=args.predict_file,
                                            is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info(" Num orig examples = %d", len(eval_examples))
        logger.info(" Num split examples = %d", len(eval_features))
        logger.info(" Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        # Example index lets us map each split feature back to its example.
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(
                    input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, args.verbose_logging)
def main():
    """Fine-tune BERT for sentence classification and/or evaluate it.

    Command-line driver: parses arguments, configures (optionally
    distributed) devices and random seeds, trains
    ``BertForSequenceClassification`` on the task's training data (with
    optional best-model saving via per-epoch eval loss), and with
    ``--do_eval`` writes metrics (JSON/TXT), a confusion-matrix plot and
    TSV files with per-example (mis)predictions into ``--output_dir``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output_cnn_hter_cr directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    # ======== EARLY STOPPING ========
    parser.add_argument("--save_best_model",
                        action='store_true',
                        help="Whether to do early stopping or not")
    # ================================
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    # Task registry: processor class, label count and label sets per task.
    processors = {
        "chatbot_intent": SentenceClassificationProcessor,
        "sentiment140_sentiment": SentenceClassificationProcessor,
    }

    num_labels_task = {
        "chatbot_intent": 2,
        "sentiment140_sentiment": 2,
    }

    labels_array = {
        "chatbot_intent": ["0", "1"],
        "sentiment140_sentiment": ["0", "1"],
    }

    labels_array_int = {
        "chatbot_intent": [0, 1],
        "sentiment140_sentiment": [0, 1],
    }

    # Device setup: single-process (CPU or all visible GPUs) vs. one GPU per
    # distributed process.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The effective per-step batch is the requested batch divided over the
    # accumulation steps.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    # Seed every RNG in play for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name](labels_array[task_name])
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank),
        num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = num_train_steps
    if args.local_rank != -1:
        # Each distributed worker only takes its share of the steps.
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    # ======== EARLY STOPPING ========
    # Load evaluation dataset for use in early stopping
    # Eval dataloader
    eval_examples = processor.get_dev_examples(args.data_dir)
    eval_features = convert_examples_to_features(eval_examples, label_list,
                                                 args.max_seq_length,
                                                 tokenizer)
    logger.info("***** Running evaluation *****")
    logger.info(" Num examples = %d", len(eval_examples))
    logger.info(" Batch size = %d", args.eval_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)
    # Get texts (raw sentences, kept aligned with the eval dataloader order)
    text = []
    for e in eval_examples:
        text.append(e.text_a)
    eval_examples_dataloader = DataLoader(text,
                                          shuffle=False,
                                          batch_size=args.eval_batch_size)
    # ================================

    # Training
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    # EARLY STOPPING
    last_eval_loss = float("inf")
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            # ======== EARLY STOPPING ========
            # Per-epoch dev-loss check on a few batches.
            eval_loss, _, _, _, _, _, _ = evaluate_model(
                args,
                model,
                device,
                eval_dataloader,
                eval_examples_dataloader,
                tr_loss,
                nb_tr_steps,
                global_step,
                batches=4)

            if args.save_best_model:
                # Save a trained model - EARLY STOPPING
                if eval_loss <= last_eval_loss:
                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model it-self
                    output_model_file = os.path.join(args.output_dir,
                                                     "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                    last_eval_loss = eval_loss
            # ================================

    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if not args.save_best_model:
        # Save a trained model
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        if args.do_train:
            torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, state_dict=model_state_dict, num_labels=num_labels)
    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        model.eval()
        eval_loss, result, target, predicted, text_mismatch, target_mismatch, predicted_mismatch =\
            evaluate_model(args, model, device, eval_dataloader, eval_examples_dataloader, tr_loss, nb_tr_steps, global_step)

        # Save model information
        # CHECK IF target and predicted need to be array
        target_asarray = np.asarray(target)
        predicted_asarray = np.asarray(predicted)
        labels = labels_array_int[task_name]
        result['precision_macro'], result['recall_macro'], result['f1_macro'], support =\
            precision_recall_fscore_support(target_asarray, predicted_asarray, average='macro', labels=labels)
        result['precision_micro'], result['recall_micro'], result['f1_micro'], support =\
            precision_recall_fscore_support(target_asarray, predicted_asarray, average='micro', labels=labels)
        result['precision_weighted'], result['recall_weighted'], result['f1_weighted'], support =\
            precision_recall_fscore_support(target_asarray, predicted_asarray, average='weighted', labels=labels)
        result['confusion_matrix'] = confusion_matrix(target_asarray,
                                                      predicted_asarray,
                                                      labels=labels).tolist()

        # Use a distinct filename when evaluating without training.
        output_eval_filename = "eval_results"
        if not args.do_train:
            output_eval_filename = "eval_results_test"

        classes = np.asarray(labels_array[task_name])
        classes_idx = None
        if 'webapplications' in task_name:
            classes = np.asarray(['0', '1', '3', '4', '5', '6',
                                  '7'])  # there is no class 2 in test
            classes_idx = np.asarray([0, 1, 2, 3, 4, 5, 6])
        ax, fig = plot_confusion_matrix(target_asarray,
                                        predicted_asarray,
                                        classes=classes,
                                        normalize=True,
                                        title='Normalized confusion matrix',
                                        rotate=False,
                                        classes_idx=classes_idx)
        fig.savefig(
            os.path.join(args.output_dir,
                         output_eval_filename + "_confusion.png"))

        output_eval_file = os.path.join(args.output_dir,
                                        output_eval_filename + ".json")
        with open(output_eval_file, "w") as writer:
            json.dump(result, writer, indent=2)

        output_eval_file = os.path.join(args.output_dir,
                                        output_eval_filename + ".txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # Example sentences and original and predicted labels
        output_eval_filename = "eval_examples"
        if not args.do_train:
            output_eval_filename = "eval_examples_test"
        dev_dict = {
            'sentence': text,
            'label_target': target,
            'label_prediction': predicted
        }
        keys = ['sentence', 'label_target', 'label_prediction']
        output_examples_file = os.path.join(args.output_dir,
                                            output_eval_filename + ".tsv")
        # Context manager so the TSV is flushed and closed (the handle was
        # previously left open).
        with open(output_examples_file, 'wt') as file_test:
            dict_writer = csv.writer(file_test, delimiter='\t')
            dict_writer.writerow(keys)
            r = zip(*dev_dict.values())
            for d in r:
                dict_writer.writerow(d)

        # Mismatched example sentences and original and predicted labels
        dev_dict = {
            'sentence': text_mismatch,
            'label_target': target_mismatch,
            'label_prediction': predicted_mismatch
        }
        keys = ['sentence', 'label_target', 'label_prediction']
        output_examples_file = os.path.join(
            args.output_dir, output_eval_filename + "_mismatch.tsv")
        with open(output_examples_file, 'wt') as file_test:
            dict_writer = csv.writer(file_test, delimiter='\t')
            dict_writer.writerow(keys)
            r = zip(*dev_dict.values())
            for d in r:
                dict_writer.writerow(d)
dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask)
class Solver(object):
    """Training driver that owns the model, optimizer and LR policy.

    Handles three runtime setups: single GPU, multi-GPU distributed
    (NCCL, initialized from the ``WORLD_SIZE`` environment variable) and
    NVIDIA apex mixed precision (amp). Supports cold start
    (``init_from_scratch``) and resuming (``init_from_checkpoint``).
    """

    def __init__(self):
        """Initialize bookkeeping state and detect distributed mode.

        Distributed mode is enabled when ``WORLD_SIZE`` > 1 in the
        environment or more than one CUDA device is visible; the NCCL
        process group is then initialized via the ``env://`` method and
        the current CUDA device is pinned to this process's rank.
        """
        self.version = __version__
        # logging.info("PyTorch Version {}, Solver Version {}".format(torch.__version__, self.version))
        self.distributed = False
        self.world_size = 1
        self.local_rank = 0
        self.epoch = 0
        self.iteration = 0
        self.config = None
        self.model, self.optimizer, self.lr_policy = None, None, None
        # step_decay: gradient-accumulation factor (optimizer steps once
        # every `step_decay` calls to step()).
        self.step_decay = 1
        # filtered_keys: parameter names of model.forward, filled on init;
        # used by parse_kwargs to select minibatch entries.
        self.filtered_keys = None
        if 'WORLD_SIZE' in os.environ:
            self.world_size = int(os.environ['WORLD_SIZE'])
            self.distributed = self.world_size > 1 or torch.cuda.device_count(
            ) > 1
        if self.distributed:
            dist.init_process_group(backend="nccl", init_method='env://')
            self.local_rank = dist.get_rank()
            torch.cuda.set_device(self.local_rank)
            logging.info(
                '[distributed mode] world size: {}, local rank: {}.'.format(
                    self.world_size, self.local_rank))
        else:
            logging.info('[Single GPU mode]')

    def _build_environ(self):
        """Apply cudnn determinism settings, seed all RNGs, and read
        gradient-clipping parameters from ``self.config``."""
        if self.config['environ']['deterministic']:
            cudnn.benchmark = False
            cudnn.deterministic = True
            torch.set_printoptions(precision=10)
        else:
            cudnn.benchmark = True
        # NOTE(review): truthiness check on the whole 'apex' sub-config —
        # presumably a non-empty dict means apex is configured; verify.
        if self.config['apex']:
            assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."
        # set random seed
        torch.manual_seed(self.config['environ']['seed'])
        if torch.cuda.is_available():
            torch.cuda.manual_seed(self.config['environ']['seed'])
        np.random.seed(self.config['environ']['seed'])
        random.seed(self.config['environ']['seed'])
        # grad clip settings
        self.grad_clip_params = self.config["solver"]["optimizer"].get(
            "grad_clip")
        self.use_grad_clip = True if self.grad_clip_params is not None else False
        if self.use_grad_clip:
            logging.info("Using grad clip and params is {}".format(
                self.grad_clip_params))
        else:
            logging.info("Not Using grad clip.")

    def init_from_scratch(self, config):
        """Build model/optimizer/LR-policy from a fresh config, optionally
        loading pretrained weights, then wrap for amp / distributed use.

        :param config: nested dict-like configuration with 'environ',
            'solver', 'model' and 'apex' sections.
        """
        t_start = time.time()
        self.config = config
        self._build_environ()
        # model and optimizer
        self.model = _get_model(self.config)
        self.filtered_keys = [
            p.name
            for p in inspect.signature(self.model.forward).parameters.values()
        ]
        # logging.info("filtered keys:{}".format(self.filtered_keys))
        # model_params = filter(lambda p: p.requires_grad, self.model.parameters())
        model_params = []
        for params in self.model.optimizer_params():
            # Per-group "lr" is a multiplier applied to the base config lr.
            params["lr"] = self.config["solver"]["optimizer"]["params"][
                "lr"] * params["lr"]
            model_params.append(params)
        self.optimizer = _get_optimizer(config['solver']['optimizer'],
                                        model_params=model_params)
        self.lr_policy = _get_lr_policy(config['solver']['lr_policy'],
                                        optimizer=self.optimizer)
        self.step_decay = config['solver']['step_decay']
        if config['model'].get('pretrained_model') is not None:
            logging.info('loadding pretrained model from {}.'.format(
                config['model']['pretrained_model']))
            load_model(self.model,
                       config['model']['pretrained_model'],
                       distributed=False)
        self.model.cuda(self.local_rank)
        if self.distributed:
            # Convert BatchNorm layers to synchronized BN before DDP wrap.
            self.model = convert_syncbn_model(self.model)
        if self.config['apex']['amp_used']:
            # Initialize Amp. Amp accepts either values or strings for the optional override arguments,
            # for convenient interoperation with argparse.
            # amp.initialize must run BEFORE the DistributedDataParallel wrap.
            logging.info(
                "Initialize Amp. opt level={}, keep batchnorm fp32={}, loss_scale={}."
                .format(self.config['apex']['opt_level'],
                        self.config['apex']['keep_batchnorm_fp32'],
                        self.config['apex']['loss_scale']))
            self.model, self.optimizer = amp.initialize(
                self.model,
                self.optimizer,
                opt_level=self.config['apex']['opt_level'],
                keep_batchnorm_fp32=self.config['apex']["keep_batchnorm_fp32"],
                loss_scale=self.config['apex']["loss_scale"])
        if self.distributed:
            self.model = DistributedDataParallel(self.model)
        t_end = time.time()
        logging.info(
            "Init trainer from scratch, Time usage: IO: {}".format(t_end -
                                                                   t_start))

    def init_from_checkpoint(self, continue_state_object):
        """Rebuild model/optimizer/LR-policy from a checkpoint dict and
        restore their states (including amp state when enabled).

        :param continue_state_object: dict produced by save_checkpoint,
            containing 'config', 'model', 'optimizer', 'lr_policy',
            'epoch', 'iteration' and optionally 'amp'.
        """
        t_start = time.time()
        self.config = continue_state_object['config']
        self._build_environ()
        self.model = _get_model(self.config)
        self.filtered_keys = [
            p.name
            for p in inspect.signature(self.model.forward).parameters.values()
        ]
        # model_params = filter(lambda p: p.requires_grad, self.model.parameters())
        model_params = []
        for params in self.model.optimizer_params():
            params["lr"] = self.config["solver"]["optimizer"]["params"][
                "lr"] * params["lr"]
            model_params.append(params)
        self.optimizer = _get_optimizer(self.config['solver']['optimizer'],
                                        model_params=model_params)
        self.lr_policy = _get_lr_policy(self.config['solver']['lr_policy'],
                                        optimizer=self.optimizer)
        load_model(self.model,
                   continue_state_object['model'],
                   distributed=False)
        self.model.cuda(self.local_rank)
        if self.distributed:
            self.model = convert_syncbn_model(self.model)
        if self.config['apex']['amp_used']:
            # Initialize Amp. Amp accepts either values or strings for the optional override arguments,
            # for convenient interoperation with argparse.
            logging.info(
                "Initialize Amp. opt level={}, keep batchnorm fp32={}, loss_scale={}."
                .format(self.config['apex']['opt_level'],
                        self.config['apex']['keep_batchnorm_fp32'],
                        self.config['apex']['loss_scale']))
            self.model, self.optimizer = amp.initialize(
                self.model,
                self.optimizer,
                opt_level=self.config['apex']['opt_level'],
                keep_batchnorm_fp32=self.config['apex']["keep_batchnorm_fp32"],
                loss_scale=self.config['apex']["loss_scale"])
            # Restore amp's loss-scaling state after re-initializing.
            amp.load_state_dict(continue_state_object['amp'])
        if self.distributed:
            self.model = DistributedDataParallel(self.model)
        self.optimizer.load_state_dict(continue_state_object['optimizer'])
        self.lr_policy.load_state_dict(continue_state_object['lr_policy'])
        self.step_decay = self.config['solver']['step_decay']
        self.epoch = continue_state_object['epoch']
        self.iteration = continue_state_object["iteration"]
        # Drop the (potentially large) checkpoint dict promptly.
        del continue_state_object
        t_end = time.time()
        logging.info(
            "Init trainer from checkpoint, Time usage: IO: {}".format(t_end -
                                                                      t_start))

    def parse_kwargs(self, minibatch):
        """Select from *minibatch* only the keys that model.forward accepts,
        moving tensors to CUDA when available."""
        kwargs = {
            k: v
            for k, v in minibatch.items() if k in self.filtered_keys
        }
        if torch.cuda.is_available():
            kwargs = tensor2cuda(kwargs)
        return kwargs

    def step(self, **kwargs):
        """Run one forward/backward pass; apply an optimizer step every
        ``step_decay`` calls (gradient accumulation).

        :param kwargs: forward arguments for the model (see parse_kwargs).
        :return: the (distributed-reduced) loss tensor data.
        """
        self.iteration += 1
        # NOTE(review): assumes the model's forward returns the loss
        # directly in training mode — confirm against the model class.
        loss = self.model(**kwargs)
        # Scale so accumulated gradients average over step_decay steps.
        loss /= self.step_decay
        # backward
        if self.distributed and self.config['apex']['amp_used']:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        if self.iteration % self.step_decay == 0:
            if self.use_grad_clip:
                clip_grad_norm_(self.model.parameters(),
                                **self.grad_clip_params)
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.lr_policy.step(self.epoch)
        if self.distributed:
            # Average the loss across workers for consistent logging.
            reduced_loss = reduce_tensor(loss.data, self.world_size)
        else:
            reduced_loss = loss.data
        return reduced_loss

    def step_no_grad(self, **kwargs):
        """Run a forward pass with gradients disabled (inference)."""
        with torch.no_grad():
            out = self.model(**kwargs)
        return out

    def before_epoch(self, epoch):
        """Reset per-epoch counters and switch the model to train mode."""
        synchronize()
        self.iteration = 0
        self.epoch = epoch
        self.model.train()
        # self.lr_policy.step(epoch)
        torch.cuda.empty_cache()

    def after_epoch(self, epoch=None):
        """Switch the model to eval mode and release cached GPU memory."""
        synchronize()
        self.model.eval()
        # gc.collect()
        torch.cuda.empty_cache()

    def save_checkpoint(self, path):
        """Serialize config, model, optimizer, LR policy, amp state and
        progress counters to *path*. Only rank 0 writes.

        Keys are stripped of a leading 'module.' prefix so the checkpoint
        loads identically with or without a DataParallel/DDP wrapper.
        """
        if self.local_rank == 0:
            # logging.info("Saving checkpoint to file {}".format(path))
            t_start = time.time()
            state_dict = {}
            from collections import OrderedDict
            new_state_dict = OrderedDict()
            for k, v in self.model.state_dict().items():
                key = k
                if k.split('.')[0] == 'module':
                    # len('module.') == 7 — drop the wrapper prefix.
                    key = k[7:]
                new_state_dict[key] = v
            if self.config['apex']['amp_used']:
                state_dict['amp'] = amp.state_dict()
            state_dict['config'] = self.config
            state_dict['model'] = new_state_dict
            state_dict['optimizer'] = self.optimizer.state_dict()
            state_dict['lr_policy'] = self.lr_policy.state_dict()
            state_dict['epoch'] = self.epoch
            state_dict['iteration'] = self.iteration
            t_iobegin = time.time()
            torch.save(state_dict, path)
            del state_dict
            del new_state_dict
            t_end = time.time()
            logging.info("Save checkpoint to file {}, "
                         "Time usage:\n\tprepare snapshot: {}, IO: {}".format(
                             path, t_iobegin - t_start, t_end - t_iobegin))

    def get_learning_rates(self):
        """Return the current learning rate of every optimizer param group."""
        lrs = []
        for i in range(len(self.optimizer.param_groups)):
            lrs.append(self.optimizer.param_groups[i]['lr'])
        return lrs
def main():
    """Fine-tune and/or evaluate a BERT sequence classifier over a range of
    data partitions for one sub-task.

    For each partition in [num_parts_start, num_parts_end): optionally train
    (with per-epoch dev evaluation) and save the model, then optionally run
    the test split, writing per-example predictions to a CSV and appending a
    classification report to a single per-task report file.

    Fixes vs. previous revision:
      * eval_loss was divided by nb_eval_steps THREE times in the test
        branch (dividing by nb_eval_steps**3); now divided once.
      * predictions CSV header had 3 columns ("ID","True","Pred") while each
        row writes 4 values; header now includes "Prob".
      * torch.nn.Softmax was built inside the batch loop with no dim
        (deprecated implicit dim); now constructed once with dim=1.
      * removed a pointless copy.deepcopy of the logits tensor.
      * the report file is now closed once, after the partition loop.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default="",
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--model_file",
        default="",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese or any pretrained model directory with model.bin and config file"
    )
    parser.add_argument(
        "--bert_model",
        default="",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default="",
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default="",
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--num_parts_start",
                        default=-1,
                        type=int,
                        required=True,
                        help="Number of partitions to run train and test on")
    parser.add_argument("--num_parts_end",
                        default=-1,
                        type=int,
                        required=True,
                        help="Number of partitions to run train and test on")
    parser.add_argument("--task_num",
                        default=-1,
                        type=int,
                        required=True,
                        help="Number of partitions to run train and test on")

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "clinicalhedges": InputProcessor,
    }
    # Number of output labels per sub-task, indexed by (task_num - 1).
    num_labels_task = {
        "clinicalhedges": [2, 2, 2, 2],
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # Effective per-step batch size under gradient accumulation.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    task_name = args.task_name.lower()
    task_num = args.task_num

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    print(processor)
    num_labels = num_labels_task[task_name][task_num - 1]
    print(num_labels)
    label_list = processor.get_labels(task_num - 1)
    print(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # One combined report file for all partitions of this task; closed
    # exactly once after the partition loop below.
    file = open(
        os.path.join(args.output_dir,
                     "Classification_Reports_Task_{}.txt".format(task_num)),
        'w')
    for part_index in range(args.num_parts_start, args.num_parts_end):
        train_examples = None
        num_train_optimization_steps = None
        if args.do_train:
            train_examples = processor.get_train_examples(
                args.data_dir, part_index, task_num)
            num_train_optimization_steps = int(
                len(train_examples) / args.train_batch_size /
                args.gradient_accumulation_steps) * args.num_train_epochs
            if args.local_rank != -1:
                num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
                )

        # Prepare model
        cache_dir = args.cache_dir if args.cache_dir else os.path.join(
            str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
                args.local_rank))
        model = BertForSequenceClassification.from_pretrained(
            args.model_file, cache_dir=cache_dir, num_labels=num_labels)
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )
            model = DDP(model)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Prepare optimizer: no weight decay on biases and LayerNorm params.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

        global_step = 0
        nb_tr_steps = 0
        tr_loss = 0
        if args.do_train:
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer)
            logger.info(
                "***** Running training on Part {} Task {}*****".format(
                    part_index, task_num))
            logger.info(" Num examples = %d", len(train_examples))
            logger.info(" Batch size = %d", args.train_batch_size)
            logger.info(" Num steps = %d", num_train_optimization_steps)
            all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in train_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.long)
            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_segment_ids, all_label_ids)
            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
            else:
                train_sampler = DistributedSampler(train_data)
            train_dataloader = DataLoader(train_data,
                                          sampler=train_sampler,
                                          batch_size=args.train_batch_size)

            for ep in trange(int(args.num_train_epochs), desc="Epoch"):
                model.train()
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                for step, batch in enumerate(train_dataloader):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids = batch
                    loss = model(input_ids, segment_ids, input_mask, label_ids)
                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()

                    tr_loss += loss.item()
                    nb_tr_examples += input_ids.size(0)
                    nb_tr_steps += 1
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        if args.fp16:
                            # modify learning rate with special warm up BERT uses
                            # if args.fp16 is False, BertAdam is used that handles this automatically
                            lr_this_step = args.learning_rate * warmup_linear(
                                global_step / num_train_optimization_steps,
                                args.warmup_proportion)
                            for param_group in optimizer.param_groups:
                                param_group['lr'] = lr_this_step
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1

                # Per-epoch evaluation on the dev split of this partition.
                eval_examples = processor.get_dev_examples(
                    args.data_dir, part_index, task_num)
                eval_features = convert_examples_to_features(
                    eval_examples, label_list, args.max_seq_length, tokenizer)
                print("\n")
                print("Running evaluation for epoch: {}".format(ep))
                all_input_ids = torch.tensor(
                    [f.input_ids for f in eval_features], dtype=torch.long)
                all_input_mask = torch.tensor(
                    [f.input_mask for f in eval_features], dtype=torch.long)
                all_segment_ids = torch.tensor(
                    [f.segment_ids for f in eval_features], dtype=torch.long)
                all_label_ids = torch.tensor(
                    [f.label_id for f in eval_features], dtype=torch.long)
                eval_data = TensorDataset(all_input_ids, all_input_mask,
                                          all_segment_ids, all_label_ids)
                # Run prediction for full data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data,
                                             sampler=eval_sampler,
                                             batch_size=args.eval_batch_size)
                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_eval_loss = model(input_ids, segment_ids,
                                              input_mask, label_ids)
                        logits = model(input_ids, segment_ids, input_mask)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    tmp_eval_accuracy = accuracy(logits, label_ids)

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = eval_accuracy / nb_eval_examples
                loss = tr_loss / nb_tr_steps if args.do_train else None
                result = {
                    'eval_loss': eval_loss,
                    'eval_accuracy': eval_accuracy,
                    'global_step': global_step,
                    'loss': loss
                }
                for key in sorted(result.keys()):
                    print(key, str(result[key]))
                print()

        if args.do_train:
            # Save a trained model and the associated configuration
            model_to_save = model.module if hasattr(
                model, 'module') else model  # Only save the model it-self
            model_dir = os.path.join(
                args.output_dir,
                "Model_Part_{}_Task_{}".format(part_index, task_num))
            if os.path.exists(model_dir):
                shutil.rmtree(model_dir)
            os.mkdir(model_dir)
            output_model_file = os.path.join(model_dir, WEIGHTS_NAME)
            torch.save(model_to_save.state_dict(), output_model_file)
            output_config_file = os.path.join(model_dir, CONFIG_NAME)
            with open(output_config_file, 'w') as f:
                f.write(model_to_save.config.to_json_string())

        if args.do_eval:
            # Load a trained model and config that you have fine-tuned
            output_model_file = os.path.join(
                args.output_dir,
                "Model_Part_{}_Task_{}".format(part_index, task_num),
                WEIGHTS_NAME)
            output_config_file = os.path.join(
                args.output_dir,
                "Model_Part_{}_Task_{}".format(part_index, task_num),
                CONFIG_NAME)
            config = BertConfig(output_config_file)
            model = BertForSequenceClassification(config,
                                                  num_labels=num_labels)
            model.load_state_dict(
                torch.load(output_model_file, map_location='cpu'))
            model.to(device)

        if args.do_eval and (args.local_rank == -1
                             or torch.distributed.get_rank() == 0):
            eval_examples = processor.get_test_examples(
                args.data_dir, part_index, task_num)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer)
            complete_user_ids = [example.guid for example in eval_examples]
            logger.info("***** Running Test for Part {} Task {}*****".format(
                part_index, task_num))
            logger.info(" Num examples = %d", len(eval_examples))
            logger.info(" Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)
            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            complete_label_ids = list()
            complete_outputs = list()
            complete_probs = list()
            # logits are (batch, num_labels): softmax over the label axis.
            softmax = torch.nn.Softmax(dim=1)
            for input_ids, input_mask, segment_ids, label_ids in tqdm(
                    eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    tmp_eval_loss = model(input_ids, segment_ids, input_mask,
                                          label_ids)
                    logits = model(input_ids, segment_ids, input_mask)

                probabilities = softmax(logits).detach().cpu().numpy()
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                tmp_eval_accuracy = accuracy(logits, label_ids)
                outputs = np.argmax(logits, axis=1)
                complete_outputs.extend(outputs)
                complete_label_ids.extend(label_ids)
                # Probability of the positive class (column 1).
                complete_probs.extend(probabilities[:, 1])

                eval_loss += tmp_eval_loss.mean().item()
                eval_accuracy += tmp_eval_accuracy

                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            # Per-example predictions; header matches the 4 columns written.
            with open(os.path.join(
                    args.output_dir,
                    "Reqd_Labels_Part_{}_Task_{}.csv".format(
                        part_index, task_num)),
                      'w',
                      encoding='utf8',
                      newline='') as outcsv:
                writer = csv.writer(outcsv, quotechar='"')
                writer.writerow(["ID", "True", "Pred", "Prob"])
                for user, true, pred, prob in zip(complete_user_ids,
                                                  complete_label_ids,
                                                  complete_outputs,
                                                  complete_probs):
                    writer.writerow([user, true, pred, prob])

            # BUG FIX: previously divided by nb_eval_steps three times.
            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples
            loss = tr_loss / nb_tr_steps if args.do_train else None
            result = {
                'eval_loss': eval_loss,
                'eval_accuracy': eval_accuracy,
                'global_step': global_step,
                'loss': loss
            }

            output_eval_file = os.path.join(args.output_dir,
                                            "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info(" %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
            file.write(
                "\nClassification Report Part- {}\n\n".format(part_index) +
                classification_report(complete_label_ids, complete_outputs) +
                "\n\n\n")
    file.close()
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--ernie_model", default="bert-base-cased", type=str, help="Ernie pre-trained model") parser.add_argument("--embedding", default="wikipedia2vec-base-cased", type=str, help="Embeddings") parser.add_argument( "--mapper", default="wikipedia2vec-base-cased.bert-base-cased.linear", type=str, help="Embeddings") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=256, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--ent", required=True, choices=("none", "concat", "replace"), help="How to use entity embeddings.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", default=False, action='store_true', help="Whether to run eval on the test set.") parser.add_argument( "--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=4, type=int, help="Total batch size for evaluation.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=10.0, type=float, help="Total number of training 
epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--threshold', type=float, default=.3) args = parser.parse_args() processors = FewrelProcessor num_labels_task = 80 if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_test: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) processor = processors() num_labels = num_labels_task label_list = None embedding = MappedEmbedding(load_embedding(args.embedding), load_mapper(args.mapper)) model_embedding = load_embedding(args.ernie_model) tokenizer = BertTokenizer.from_pretrained(args.ernie_model) train_examples = None num_train_steps = None train_examples, label_list = processor.get_train_examples(args.data_dir) label_list = sorted(label_list, key=lambda x: int(x[1:])) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) model = EmbInputBertForSequenceClassification.from_pretrained( args.ernie_model, num_labels=num_labels, output_attentions=True) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_proportion * t_total, t_total=t_total) global_step = 0 output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") patterns = ["# {name} #", "$ {name} $"] if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, args.threshold, patterns=patterns, ent_type=args.ent, embedding=embedding) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = 
torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) output_loss_file = os.path.join(args.output_dir, "loss") loss_fout = open(output_loss_file, 'w') model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): input_words = tokenizer.convert_ids_to_tokens( batch[0].cpu().numpy().flatten()) input_vecs = [ embedding[w] if w.startswith("ENTITY/") else model_embedding[w] for w in input_words ] input_vecs = np.array(input_vecs).reshape(batch[0].shape + (-1, )) batch[0] = torch.tensor(input_vecs) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)[0] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() loss_fout.write("{}\n".format(loss.item())) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 model_to_save = model.module if hasattr(model, 'module') else model output_model_file_step = os.path.join( args.output_dir, "pytorch_model.bin_{}".format(global_step)) torch.save(model_to_save.state_dict(), output_model_file_step) # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self torch.save(model_to_save.state_dict(), output_model_file) if args.do_eval or args.do_test: del model output_model_files = [ f for f in os.listdir(args.output_dir) if f.startswith("pytorch_model.bin") ] #output_model_files = ["pytorch_model.bin"] #TODO for output_model_file in output_model_files: model = EmbInputBertForSequenceClassification.from_pretrained( args.ernie_model, num_labels=num_labels) model.load_state_dict( torch.load(os.path.join(args.output_dir, output_model_file))) if args.fp16: model.half() model.to(device) model.eval() dsets = [] if args.do_eval: dsets.append((processor.get_dev_examples, "eval")) if args.do_test: dsets.append((processor.get_test_examples, "test")) for dset_func, dset_name in dsets: features = convert_examples_to_features(dset_func( args.data_dir), label_list, args.max_seq_length, tokenizer, args.threshold, patterns=patterns, ent_type=args.ent, embedding=embedding) step = output_model_file.replace("pytorch_model.bin", "") fpred = open( os.path.join(args.output_dir, dset_name + f"_pred{step}.txt"), "w") fgold = open( os.path.join(args.output_dir, dset_name + f"_gold{step}.txt"), "w") fwords = open( os.path.join(args.output_dir, dset_name + 
f"_words{step}.txt"), "w") all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) dataloader = DataLoader(data, sampler=None, batch_size=args.eval_batch_size) acc = [] all_probs = [] for step, batch in enumerate( tqdm(dataloader, desc="Evaluation {} {}".format( output_model_file, dset_name))): input_words = tokenizer.convert_ids_to_tokens( batch[0].cpu().numpy().flatten()) input_vecs = [ embedding[w] if w.startswith("ENTITY/") else model_embedding[w] for w in input_words ] input_vecs = np.array(input_vecs).reshape(batch[0].shape + (-1, )) batch[0] = torch.tensor(input_vecs) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch logits = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids)[0] prob = torch.softmax(logits, -1) all_probs.append(prob.detach().cpu().numpy()) predictions = prob.argmax(-1) for a, b in zip(predictions, label_ids): fgold.write("{}\n".format(label_list[b])) fpred.write("{}\n".format(label_list[a]))
def main():
    """Evaluate a vision-language model on caption/image retrieval.

    Parses CLI args, loads the eval split for the selected task, fills a
    5000x1000 caption-by-image score matrix, then reports r1/r5/r10, median
    and mean rank for image retrieval and text retrieval, and dumps the
    top-20 ranked image indices per caption to ``<json_path>_result.json``.
    """
    args = parse_args()

    # Devices: single process (local_rank == -1) sees all visible GPUs;
    # otherwise pin this process to its rank and join the NCCL group.
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")
    # Only rank 0 (or the lone process) creates dirs and prints summaries.
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True
    logger.info(
        f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}"
    )

    # Load config
    config = BertConfig.from_json_file(args.config_file)

    # Load task config
    with open(args.tasks_config_file, "r") as f:
        task_cfg = edict(yaml.safe_load(f))
    task_id = args.task.strip()
    task = "TASK" + task_id

    # Output dirs: derive a subdirectory name from the checkpoint path.
    if "/" in args.from_pretrained:
        timeStamp = args.from_pretrained.split("/")[1]
    else:
        timeStamp = args.from_pretrained
    savePath = os.path.join(args.output_dir, timeStamp)
    if default_gpu and not os.path.exists(savePath):
        os.makedirs(savePath)

    # Seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Dataset
    batch_size, task2num_iters, dset_val, dl_val = LoadDatasetEval(
        args, config, task_cfg, args.task)

    # Model: zero-shot scores with the pretraining head, otherwise the
    # fine-tuned task head.
    if args.zero_shot:
        config.visual_target_weights = {}
        model = BertForVLPreTraining.from_pretrained(args.from_pretrained,
                                                     config=config)
    else:
        model = BertForVLTasks.from_pretrained(args.from_pretrained,
                                               config=config,
                                               task_cfg=task_cfg,
                                               task_ids=[task])

    # Move to GPU(s)
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        # BUGFIX: keyword was misspelled "deay_allreduce", which would raise
        # TypeError at runtime; the apex DDP parameter is "delay_allreduce".
        model = DDP(model, delay_allreduce=True)
    elif n_gpu > 1:
        model = nn.DataParallel(model)
        # NOTE(review): intentionally refuses multi-GPU evaluation — the
        # fixed 500-image stripes below assume a single evaluation process.
        raise ValueError("Please run with a single GPU")

    # Print summary
    if default_gpu:
        print("***** Running evaluation *****")
        print(" Num Iters: ", task2num_iters)
        print(" Batch size: ", batch_size)

    # Evaluate. Each batch carries one caption against a 500-image stripe;
    # rows of score_matrix are captions, columns are candidate images.
    model.eval()
    results = []
    others = []  # kept empty; dumped for interface parity with other scripts
    score_matrix = np.zeros((5000, 1000))
    target_matrix = np.zeros((5000, 1000))
    rank_matrix = np.ones(5000) * 1000  # 1000 == "not ranked yet" sentinel
    count = 0
    for i, batch in tqdm(enumerate(dl_val), total=task2num_iters[task]):
        batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch)
        features, spatials, image_mask, question, input_mask, segment_ids, target, caption_idx, image_idx = batch

        # One caption is tiled against every image candidate in the stripe.
        features = features.squeeze(0)
        spatials = spatials.squeeze(0)
        image_mask = image_mask.squeeze(0)
        question = question.repeat(features.size(0), 1)
        segment_ids = segment_ids.repeat(features.size(0), 1)
        input_mask = input_mask.repeat(features.size(0), 1)

        with torch.no_grad():
            if args.zero_shot:
                _, _, vil_logit, _, _ = model(question, features, spatials,
                                              segment_ids, input_mask,
                                              image_mask)
                score_matrix[caption_idx,
                             image_idx * 500:(image_idx + 1) * 500] = (
                                 torch.softmax(vil_logit,
                                               dim=1)[:,
                                                      0].view(-1).cpu().numpy())
                target_matrix[caption_idx,
                              image_idx * 500:(image_idx + 1) * 500] = (
                                  target.view(-1).float().cpu().numpy())
            else:
                vil_logit, _, _, _ = model(question, features, spatials, task,
                                           segment_ids, input_mask, image_mask)
                score_matrix[caption_idx,
                             image_idx * 500:(image_idx + 1) * 500] = (
                                 vil_logit.view(-1).cpu().numpy())
                target_matrix[caption_idx,
                              image_idx * 500:(image_idx + 1) * 500] = (
                                  target.view(-1).float().cpu().numpy())

            # image_idx == 1 means the second (final) 500-image stripe for
            # this caption just arrived, so its full row can be ranked.
            if image_idx.item() == 1:
                rank = np.where(
                    (np.argsort(-score_matrix[caption_idx]) == np.where(
                        target_matrix[caption_idx] == 1)[0][0]) == 1)[0][0]
                rank_matrix[caption_idx] = rank

                # Running image-retrieval metrics over rows finished so far.
                rank_matrix_tmp = rank_matrix[:caption_idx + 1]
                r1 = 100.0 * np.sum(rank_matrix_tmp < 1) / len(rank_matrix_tmp)
                r5 = 100.0 * np.sum(rank_matrix_tmp < 5) / len(rank_matrix_tmp)
                r10 = 100.0 * np.sum(
                    rank_matrix_tmp < 10) / len(rank_matrix_tmp)
                medr = np.floor(np.median(rank_matrix_tmp) + 1)
                meanr = np.mean(rank_matrix_tmp) + 1
                print(
                    "%d Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f"
                    % (count, r1, r5, r10, medr, meanr))

                results.append(
                    np.argsort(-score_matrix[caption_idx]).tolist()[:20])
                count += 1

    # Final image-retrieval metrics over all captions.
    r1 = 100.0 * np.sum(rank_matrix < 1) / len(rank_matrix)
    r5 = 100.0 * np.sum(rank_matrix < 5) / len(rank_matrix)
    r10 = 100.0 * np.sum(rank_matrix < 10) / len(rank_matrix)
    medr = np.floor(np.median(rank_matrix) + 1)
    meanr = np.mean(rank_matrix) + 1
    print("************************************************")
    print("****************Image Retrieval*****************")
    print("************************************************")
    print("Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" %
          (r1, r5, r10, medr, meanr))
    print("************************************************")

    if args.split:
        json_path = os.path.join(savePath, args.split)
    else:
        json_path = os.path.join(savePath, task_cfg[task_id]["val_split"])
    # BUGFIX: files were opened inline and never closed; use `with` so the
    # handles are released (and buffers flushed) deterministically.
    with open(json_path + "_result.json", "w") as f:
        json.dump(results, f)
    with open(json_path + "_others.json", "w") as f:
        json.dump(others, f)

    # Text Retrieval: rank captions per image column; a column may have
    # several positive captions, so the best (minimum) rank is kept.
    rank_matrix = np.zeros(1000)
    for image_idx in range(1000):
        ranks = []
        tgt_captions = np.where(target_matrix[:, image_idx] == 1)[0]
        sorted_scores = np.argsort(-score_matrix[:, image_idx])
        for tgt_caption in tgt_captions:
            ranks.append(np.where((sorted_scores == tgt_caption) == 1)[0][0])
        rank_matrix[image_idx] = min(ranks)

    r1 = 100.0 * np.sum(rank_matrix < 1) / len(rank_matrix)
    r5 = 100.0 * np.sum(rank_matrix < 5) / len(rank_matrix)
    r10 = 100.0 * np.sum(rank_matrix < 10) / len(rank_matrix)
    medr = np.floor(np.median(rank_matrix) + 1)
    meanr = np.mean(rank_matrix) + 1
    print("************************************************")
    print("****************Text Retrieval******************")
    print("************************************************")
    print("Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" %
          (r1, r5, r10, medr, meanr))
    print("************************************************")
def main():
    """Fine-tune a BERT document classifier with a simple active-learning loop.

    Phases:
      1. Parse CLI args, set up device/seeds/output dir.
      2. Build model + optimizer (optional apex fp16 / distributed).
      3. Train on an initial random labelled subset, evaluate on the rest.
      4. Repeatedly select "uncertain" samples from the pool, move them into
         the training set, and retrain, until `max_quired` samples are used.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Task registry: data processor class and label count per task name.
    processors = {
        "aus": OOCLAUSProcessor,
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
    }

    num_labels_task = {
        "aus": 33,
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,
    }

    # Device setup: single-process (possibly multi-GPU) vs. distributed.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # Per-step micro-batch size; gradients are accumulated back to the
    # requested effective batch size.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(
            args.local_rank))
    model = BertForDocMultiClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: weight decay on everything except biases/LayerNorm.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    # Active-learning configuration.
    # NOTE(review): "quried"/"max_quired"/"sample_selection_fucntion" are
    # misspelled; renaming is deferred to avoid churn in this code-only pass.
    initial_labeled_samples = 200  # seed labelled-set size and per-round query size
    accuracies = []
    sample_selection_fucntion = RandomSelection
    sample_count = []
    train_set_size = 1000  # appears unused below — TODO confirm intent
    quried = 0  # number of labelled samples consumed so far
    max_quired = 500  # labelling budget

    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

        # Split data: a random labelled seed set, everything else is the
        # validation/pool set that later queries are drawn from.
        all_input_ids, all_input_mask, all_segment_ids, all_label_ids = to_numpy_data(
            train_features)
        permutation, input_ids_train, input_mask_train, segment_ids_train, label_ids_train = get_k_random_samples(
            initial_labeled_samples, all_input_ids, all_input_mask,
            all_segment_ids, all_label_ids)
        sample_count.append(initial_labeled_samples)
        quried = initial_labeled_samples
        input_ids_val, input_mask_val, segment_ids_val, label_ids_val, train_data, val_data = prepare_train_val(
            all_input_ids, all_input_mask, all_segment_ids, all_label_ids,
            input_ids_train, input_mask_train, segment_ids_train,
            label_ids_train, permutation)

        if args.local_rank == -1:
            train_sampler = SequentialSampler(train_data)
            val_sampler = SequentialSampler(val_data)
        else:
            train_sampler = DistributedSampler(train_data)
            val_sampler = DistributedSampler(val_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        val_dataloader = DataLoader(val_data,
                                    sampler=val_sampler,
                                    batch_size=args.train_batch_size)

        # Initial training round on the seed labelled set.
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Evaluate the initial model on the held-out pool.
        activate_iteration = 1
        model.eval()
        val_len = len(input_ids_val)
        accuracies = []
        X_val_probas = []
        for input_ids, input_mask, segment_ids, label_ids in val_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask)
            logits = logits.detach().cpu().numpy()
            #print("logits:", logits)
            #print("logits size:", len(logits))
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)
            accuracies.append(tmp_eval_accuracy)
        print()
        print("accuracy:", np.mean(accuracies))

        # Active-learning loop: score the pool, query the most informative
        # samples, move them into the train set, retrain; repeat until the
        # labelling budget is spent.
        while quried < max_quired:
            activate_iteration += 1
            print("iter:", activate_iteration)
            probas_val = []
            y_val = []
            accuracies = []
            model.eval()
            for input_ids, input_mask, segment_ids, label_ids in val_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                with torch.no_grad():
                    logits = model(input_ids, segment_ids, input_mask)
                logits = logits.detach().cpu().numpy()
                probas_val.extend(logits)
                label_ids = label_ids.to('cpu').numpy()
                y_val.extend(label_ids)
                tmp_eval_accuracy = accuracy(logits, label_ids)
                accuracies.append(tmp_eval_accuracy)
            print("logits shape:", len(probas_val))
            print("y_val shape", len(y_val))

            # Pick the next batch of pool indices to "label".
            uncertain_samples = sample_selection_fucntion.select(
                np.array(probas_val), initial_labeled_samples)
            print(uncertain_samples)

            # Move the queried samples from the pool into the train set.
            input_ids_train, input_ids_val, input_mask_train, input_mask_val, segment_ids_train, segment_ids_val, label_ids_train, \
            label_ids_val, train_data, val_data = resplit_train_eval(np.array(input_ids_train), np.array(input_ids_val),
                                                                     np.array(input_mask_train), np.array(input_mask_val),
                                                                     np.array(segment_ids_train), np.array(segment_ids_val),
                                                                     np.array(label_ids_train), np.array(label_ids_val),
                                                                     np.array(uncertain_samples))
            if args.local_rank == -1:
                train_sampler = SequentialSampler(train_data)
                val_sampler = SequentialSampler(val_data)
            else:
                train_sampler = DistributedSampler(train_data)
                val_sampler = DistributedSampler(val_data)
            train_dataloader = DataLoader(train_data,
                                          sampler=train_sampler,
                                          batch_size=args.train_batch_size)
            val_dataloader = DataLoader(val_data,
                                        sampler=val_sampler,
                                        batch_size=args.train_batch_size)
            quried += initial_labeled_samples

            # Retrain on the enlarged labelled set.
            model.train()
            for _ in trange(int(args.num_train_epochs), desc="Epoch"):
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                for step, batch in enumerate(
                        tqdm(train_dataloader, desc="Iteration")):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids = batch
                    loss = model(input_ids, segment_ids, input_mask, label_ids)
                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()
                    tr_loss += loss.item()
                    nb_tr_examples += input_ids.size(0)
                    nb_tr_steps += 1
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        if args.fp16:
                            # modify learning rate with special warm up BERT uses
                            # if args.fp16 is False, BertAdam is used that handles this automatically
                            lr_this_step = args.learning_rate * warmup_linear(
                                global_step / num_train_optimization_steps,
                                args.warmup_proportion)
                            for param_group in optimizer.param_groups:
                                param_group['lr'] = lr_this_step
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1
def main():
    """Train / evaluate / run inference for a CoLA classifier built on BERT
    with a selectable upper head (Linear, CNN, or Capsule).

    Phases:
      1. Parse CLI args (task + capsule argument groups), set up device/seeds.
      2. Build the selected model variant and optimizer (optional apex fp16).
      3. If --do_train: fine-tune, periodically evaluating every
         ``eval_freq`` optimizer steps, then save weights + config.
      4. If --do_infer: load the inference split and print argmax predictions.
    """
    # ----- Prepare arguments -----
    parser = argparse.ArgumentParser()
    task_cola_args(parser)
    capsule_args(parser)
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Device setup: single-process (possibly multi-GPU) vs. distributed.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # Per-step micro-batch size under gradient accumulation.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_infer:
        raise ValueError(
            "At least one of `do_train` or `do_infer` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = ColaProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model: the upper head on top of BERT is selected by CLI flag;
    # unrecognised values fall back to the plain linear classifier.
    if args.upper_model == "Linear":
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
    elif args.upper_model == "CNN":
        model = BertCnn.from_pretrained(args.bert_model,
                                        num_labels=num_labels,
                                        seq_len=args.max_seq_length)
    elif args.upper_model == "Capsule":
        model = BertCapsule.from_pretrained(args.bert_model,
                                            capsule_config=args,
                                            task_config=args)
    else:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)
        # model = DataParallelModel(model)  # todo how to add load balancing

    # Prepare optimizer: weight decay on everything except biases/LayerNorm.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        # Dev set is featurised up-front so the periodic mid-training
        # evaluation below can reuse it.
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                # model.train() re-asserted each step because the periodic
                # evaluation below flips the model into eval() mode.
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                # Model returns logits and a sequence embedding; the loss is
                # computed by the model's own loss() (capsule-aware).
                out_digits, seq_emb = model(input_ids, segment_ids, input_mask,
                                            label_ids)
                loss = model.loss(seq_emb, out_digits, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                    # do eval: every eval_freq optimizer steps, report dev loss.
                    if global_step % args.eval_freq == 0 and global_step > 0:
                        logger.info("***** Running evaluation *****")
                        logger.info(" Num examples = %d", len(eval_examples))
                        logger.info(" Batch size = %d", args.eval_batch_size)
                        all_input_ids = torch.tensor(
                            [f.input_ids for f in eval_features],
                            dtype=torch.long)
                        all_input_mask = torch.tensor(
                            [f.input_mask for f in eval_features],
                            dtype=torch.long)
                        all_segment_ids = torch.tensor(
                            [f.segment_ids for f in eval_features],
                            dtype=torch.long)
                        all_label_ids = torch.tensor(
                            [f.label_id for f in eval_features],
                            dtype=torch.long)
                        eval_data = TensorDataset(all_input_ids,
                                                  all_input_mask,
                                                  all_segment_ids,
                                                  all_label_ids)
                        # Run prediction for full data
                        eval_sampler = SequentialSampler(eval_data)
                        eval_dataloader = DataLoader(
                            eval_data,
                            sampler=eval_sampler,
                            batch_size=args.eval_batch_size)
                        model.eval()
                        eval_loss, eval_accuracy = 0, 0
                        nb_eval_steps, nb_eval_examples = 0, 0
                        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                                eval_dataloader, desc="Evaluating"):
                            input_ids = input_ids.to(device)
                            input_mask = input_mask.to(device)
                            segment_ids = segment_ids.to(device)
                            label_ids = label_ids.to(device)
                            with torch.no_grad():
                                out_digits, seq_emb = model(
                                    input_ids, segment_ids, input_mask,
                                    label_ids)
                                # compute loss
                                loss = model.loss(seq_emb, out_digits,
                                                  label_ids)
                                if n_gpu > 1:
                                    loss = loss.mean(
                                    )  # mean() to average on multi-gpu.
                                if args.gradient_accumulation_steps > 1:
                                    loss = loss / args.gradient_accumulation_steps
                                tmp_eval_loss = loss
                            eval_loss += tmp_eval_loss.mean().item()
                            # todo accuracy test
                            nb_eval_examples += input_ids.size(0)
                            nb_eval_steps += 1
                        eval_loss = eval_loss / nb_eval_steps
                        loss = tr_loss / nb_tr_steps if args.do_train else None
                        result = {
                            'eval_loss': eval_loss,
                            'global_step': global_step,
                            'loss': loss
                        }
                        logger.info("***** Eval results *****")
                        for key in sorted(result.keys()):
                            logger.info(" %s = %s", key, str(result[key]))

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

    if args.do_infer:
        infer_examples = processor.get_infer_examples(args.data_dir)
        infer_features = convert_examples_to_features(infer_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running Inference *****")
        logger.info(" Num examples = %d", len(infer_examples))
        logger.info(" Batch size = %d", args.infer_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in infer_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in infer_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in infer_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in infer_features],
                                     dtype=torch.long)
        infer_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        # Run prediction for full data
        infer_sampler = SequentialSampler(infer_data)
        infer_dataloader = DataLoader(infer_data,
                                      sampler=infer_sampler,
                                      batch_size=args.infer_batch_size)
        model.eval()
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                infer_dataloader, desc="Inference"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                infer_preds = model(input_ids, segment_ids, input_mask,
                                    label_ids)
                # Normalise per-device outputs to (-1, num_labels) and
                # concatenate along the batch dimension before argmax.
                for i in range(len(infer_preds)):
                    infer_preds[i] = infer_preds[i].view(-1, num_labels)
                infer_preds = torch.cat(
                    infer_preds)  # shape: [batch_size, num_labels]
                logits = infer_preds.detach().cpu().numpy()
                outputs = np.argmax(logits, axis=1)
                print(outputs)
        logger.info("***** Infer finished *****")
class Trainer(object):
    """Single-node, multi-GPU distributed trainer (one process per GPU).

    Builds the model and loss function after ``dist.init_process_group`` (see
    the note in ``__init__``), wraps them with apex AMP + apex DDP, and drives
    the train / eval / checkpoint loop via :meth:`run`.

    NOTE(review): relies on module-level names not visible in this chunk
    (``dist``, ``copy``, ``onmt``, ``amp``, ``DDP``, ``build_model``,
    ``optimize_model``, ``init_model_parameters``, ``zero_tensor``,
    ``prepare_sample``, ``generate_data_iterator``, ``normalize_gradients``,
    ``all_reduce_and_rescale_tensors``, ``checkpoint_paths``, loss classes) —
    presumably imported at the top of the file; verify.
    """

    def __init__(self, device, train_data, valid_data, dicts, opt, setup_optimizer=True):
        """
        :param device: int (GPU id); doubles as the process rank for
            single-node distributed training.
        :param train_data: training dataset (or list of datasets).
        :param valid_data: validation dataset.
        :param dicts: vocabulary dictionaries (at least 'tgt'; optionally 'langs').
        :param opt: parsed training options namespace; mutated here
            (``node_rank``/``nodes`` are forced to single-node values).
        :param setup_optimizer: when True, also create the optimizer, run
            amp.initialize and wrap the model in DDP.
        """
        # self.model = model
        # self.model = model
        # self.loss_function = loss_function
        self.device = device
        # Force single-node topology; world size equals the number of GPUs.
        opt.node_rank = 0
        opt.nodes = 1
        self.world_size = len(opt.gpus)

        # in the case of single node distributed, it should equal self.device
        self.rank = self.device

        # make a group to later use with dist.all_reduce
        self.group = dist.group.WORLD

        self.print("[INFO] Training Options:", opt)
        dist.init_process_group(backend='nccl', init_method='env://',
                                world_size=self.world_size, rank=self.rank)

        self.model = None

        # Rank 0 keeps the original dataset objects; other ranks deep-copy so
        # per-process iterator state does not alias across processes.
        if self.rank == 0:
            self.train_data = train_data
            self.valid_data = valid_data
        else:
            self.train_data = copy.deepcopy(train_data)
            self.valid_data = copy.deepcopy(valid_data)

        self.dicts = dicts
        self.opt = opt
        self.cuda = (len(opt.gpus) >= 1 and opt.gpus[0] >= 0)
        assert self.cuda, "[ERROR] Training is only available on GPUs."

        self.start_time = 0

        # setting up models and others
        if opt.lfv_multilingual:
            from onmt.models.speech_recognizer.lid_loss import CrossEntropyLIDLoss
            lid_loss = CrossEntropyLIDLoss(opt.n_languages, opt.label_smoothing, opt.fast_xentropy)
            # NOTE(review): self.loss_function is only assigned further below
            # (inside the `not opt.fusion` branch) — if lfv_multilingual is
            # enabled this line raises AttributeError; confirm intended order.
            self.loss_function.add_loss_function(lid_loss, 'lid_loss')

        torch.manual_seed(self.opt.seed)

        # note: we must start creating models after ccreating the processes
        # for some reason passing a pre-created model to a process creates a "pickle" error
        if not opt.fusion:
            if self.is_main():
                print("BUILDING MODEL .... ", flush=True)

            model = build_model(opt, dicts)

            """ Building the loss function """
            if opt.ctc_loss != 0:
                loss_function = NMTAndCTCLossFunc(dicts['tgt'].size(),
                                                  label_smoothing=opt.label_smoothing,
                                                  ctc_weight=opt.ctc_loss)
            elif opt.nce:
                from onmt.modules.nce.nce_loss import NCELoss
                # logz=9 is a fixed normalization constant for the NCE loss;
                # presumably tuned for this codebase — confirm.
                loss_function = NCELoss(opt.model_size, dicts['tgt'].size(),
                                        noise_ratio=opt.nce_noise,
                                        logz=9, label_smoothing=opt.label_smoothing)
            else:
                loss_function = NMTLossFunc(opt.model_size, dicts['tgt'].size(),
                                            label_smoothing=opt.label_smoothing,
                                            mirror=opt.mirror_loss,
                                            fast_xentropy=opt.fast_xentropy)

            # This function replaces modules with the more optimized counterparts so that it can run faster
            # Currently exp with LayerNorm
            if not opt.memory_profiling:
                # distributed is required to convert BatchNorm to SyncBatchNorm for DDP
                optimize_model(model, distributed=(self.world_size > 1))
                # optimize_model(model)

            init_model_parameters(model, opt)
            self.model = model
            self.loss_function = loss_function

        if self.cuda:
            torch.cuda.set_device(self.device)
            self.loss_function = self.loss_function.cuda(device=self.device)
            self.model = self.model.cuda(device=self.device)

        # Ensure that the distributed copies have the same initial parameters
        # Manual seed may not work the same for different GPU models.
        if self.world_size > 1:
            params = [p for p in self.model.parameters()]
            with torch.no_grad():
                if not self.is_main():
                    # Non-main ranks zero their parameters ...
                    for p in params:
                        p.zero_()
                else:
                    # ... main rank leaves its values intact (add 0 is a no-op,
                    # kept so every rank performs an op on every parameter).
                    for p in params:
                        p.add_(0)

        if self.world_size > 1:
            params = [p for p in self.model.parameters()]
            # Sum-reduce with rescale factor 1: every rank ends up with the
            # main rank's weights (all other contributions are zero).
            all_reduce_and_rescale_tensors(params, 1)

        if setup_optimizer:
            self.optim = onmt.Optim(opt)
            self.optim.set_parameters(self.model.parameters())

            if self.is_main():
                print("[INFO] Optimizer: ", self.optim.optimizer)

            # Map the fp16 options onto apex AMP opt levels.
            if not self.opt.fp16:
                opt_level = "O0"
                keep_batchnorm_fp32 = False
            elif self.opt.fp16_mixed:
                opt_level = "O1"
                keep_batchnorm_fp32 = None
            else:
                opt_level = "O2"
                keep_batchnorm_fp32 = False

            if self.cuda:
                self.model, self.optim.optimizer = amp.initialize(self.model,
                                                                  self.optim.optimizer,
                                                                  opt_level=opt_level,
                                                                  keep_batchnorm_fp32=keep_batchnorm_fp32,
                                                                  loss_scale="dynamic",
                                                                  verbosity=1 if self.opt.verbose else 0)

            # wrap the model into DDP after initializing by amp
            if self.world_size > 1:
                """
                delay_allreduce is required to avoid allreduce error during backward pass
                """
                self.model = DDP(self.model, delay_allreduce=True, gradient_average=False)

                # torch DDP is more likely to work with the official amp autocast
                # self.model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[self.rank],
                #                                                        output_device=self.rank,
                #                                                        find_unused_parameters=True)

        print("[INFO] Process %d ready." % self.rank, flush=True)

    def is_main(self):
        """Return True on the main (rank-0) process."""
        return self.rank == 0

    def print(self, *content, flush=False):
        """ A helper function to print only on the main process
        (intentionally shadows the builtin within this class).
        :param flush: passed through to builtin print.
        :param content: positional items to print.
        :return: None
        """
        if self.is_main():
            print(*content, flush=flush)
        else:
            return

    def load_encoder_weight(self, checkpoint_file):
        """Load encoder weights from a pretrained checkpoint, preserving this
        model's own encoder language embedding (sizes may differ).

        NOTE(review): uses bare ``print`` (every rank prints), unlike
        ``load_decoder_weight`` which uses ``self.print`` — confirm intended.
        """
        print("Loading pretrained models from %s" % checkpoint_file)
        checkpoint = torch.load(checkpoint_file, map_location=lambda storage, loc: storage)

        pretrained_model = build_model(checkpoint['opt'], checkpoint['dicts'])
        pretrained_model.load_state_dict(checkpoint['model'])

        print("Loading pretrained encoder weights ...")
        # Detach language embeddings on both sides so state_dict shapes match.
        pretrained_model.encoder.language_embedding = None
        enc_language_embedding = self.model.encoder.language_embedding
        self.model.encoder.language_embedding = None
        encoder_state_dict = pretrained_model.encoder.state_dict()

        self.model.encoder.load_state_dict(encoder_state_dict)
        self.model.encoder.language_embedding = enc_language_embedding
        return

    def load_decoder_weight(self, checkpoint_file):
        """Load decoder weights from a pretrained checkpoint, then copy word
        and language embeddings token-by-token for the vocabulary overlap."""
        self.print("Loading pretrained models from %s" % checkpoint_file)
        checkpoint = torch.load(checkpoint_file, map_location=lambda storage, loc: storage)
        chkpoint_dict = checkpoint['dicts']

        pretrained_model = build_model(checkpoint['opt'], chkpoint_dict)
        pretrained_model.load_state_dict(checkpoint['model'])

        self.print("Loading pretrained decoder weights ...")
        # first we have to remove the embeddings which probably have difference size ...
        pretrained_word_emb = pretrained_model.decoder.word_lut
        pretrained_model.decoder.word_lut = None
        pretrained_lang_emb = pretrained_model.decoder.language_embeddings
        pretrained_model.decoder.language_embeddings = None

        # actually we assume that two decoders have the same language embeddings...
        untrained_word_emb = self.model.decoder.word_lut
        self.model.decoder.word_lut = None
        untrained_lang_emb = self.model.decoder.language_embeddings
        self.model.decoder.language_embeddings = None

        decoder_state_dict = pretrained_model.decoder.state_dict()
        self.model.decoder.load_state_dict(decoder_state_dict)

        # now we load the embeddings ....
        n_copies = 0
        for token in self.dicts['tgt'].labelToIdx:
            untrained_id = self.dicts['tgt'].labelToIdx[token]
            if token in chkpoint_dict['tgt'].labelToIdx:
                pretrained_id = chkpoint_dict['tgt'].labelToIdx[token]
                untrained_word_emb.weight.data[untrained_id].copy_(
                    pretrained_word_emb.weight.data[pretrained_id])
                # keep the generator bias in sync with the copied embedding row
                self.model.generator[0].linear.bias.data[untrained_id].copy_(
                    pretrained_model.generator[0].linear.bias.data[pretrained_id])
                n_copies += 1

        self.print("Copied embedding for %d words" % n_copies)
        self.model.decoder.word_lut = untrained_word_emb

        # now we load the language embeddings ...
        if pretrained_lang_emb and untrained_lang_emb and 'langs' in chkpoint_dict:
            for lang in self.dicts['langs']:
                untrained_id = self.dicts['langs'][lang]
                if lang in chkpoint_dict['langs']:
                    pretrained_id = chkpoint_dict['langs'][lang]
                    untrained_lang_emb.weight.data[untrained_id].copy_(
                        pretrained_lang_emb.weight.data[pretrained_id])

        self.model.decoder.language_embeddings = untrained_lang_emb

    def warm_up(self):
        """
        Warmup the memory allocator, by attempting to fit the largest batch
        (one forward + backward pass, then gradients are discarded).
        :return: None
        """
        # if self.opt.memory_profiling:
        #     from pytorch_memlab import MemReporter
        #     reporter = MemReporter()
        #
        batch = self.train_data[0].get_largest_batch() if isinstance(self.train_data, list) \
            else self.train_data.get_largest_batch()
        opt = self.opt

        if self.cuda:
            batch.cuda(fp16=self.opt.fp16 and not self.opt.fp16_mixed)

        self.model.train()
        self.loss_function.train()
        self.model.zero_grad()
        oom = False

        if self.opt.memory_profiling:
            self.print("Input size: ")
            self.print(batch.size, batch.src_size, batch.tgt_size)

        if opt.streaming:
            streaming_state = self.model.init_stream()
        else:
            streaming_state = None

        try:
            targets = batch.get('target_output')
            tgt_mask = None
            outputs = self.model(batch, streaming=opt.streaming, target_mask=tgt_mask,
                                 zero_encoder=opt.zero_encoder,
                                 mirror=opt.mirror_loss, streaming_state=streaming_state,
                                 nce=opt.nce)

            outputs['tgt_mask'] = tgt_mask
            loss_dict = self.loss_function(outputs, targets, model=self.model)
            loss_data = loss_dict['data']
            loss = loss_dict['loss']  # a little trick to avoid gradient overflow with fp16
            full_loss = loss

            if opt.mirror_loss:
                rev_loss = loss_dict['rev_loss']
                mirror_loss = loss_dict['mirror_loss']
                full_loss = full_loss + rev_loss + mirror_loss

            # reconstruction loss
            if opt.reconstruct:
                rec_loss = loss_dict['rec_loss']
                rec_loss = rec_loss
                full_loss = full_loss + rec_loss

            if opt.lfv_multilingual:
                lid_logits = outputs['lid_logits']
                lid_labels = batch.get('target_lang')
                lid_loss_function = self.loss_function.get_loss_function('lid_loss')
                lid_loss = lid_loss_function(lid_logits, lid_labels)
                full_loss = full_loss + lid_loss

            optimizer = self.optim.optimizer

            if self.opt.memory_profiling:
                # NOTE(review): `reporter` is created only in the commented-out
                # block above — this path would raise NameError if enabled; confirm.
                reporter.report(verbose=True)

                # (condensed) previously: a commented-out gc.get_objects() sweep
                # printing every live tensor's type and size for leak hunting,
                # then "Memory profiling complete.", torch.cuda.memory_summary()
                # and exit().

            if self.cuda:
                with amp.scale_loss(full_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.div_(batch.tgt_size).backward()

            if self.opt.memory_profiling:
                print('========= after backward =========')
                reporter.report(verbose=True)

            # Discard the warm-up gradients — this pass only exercised the allocator.
            self.model.zero_grad()
            self.optim.zero_grad()
            # self.optim.step()
            # self.optim.reset()

        except RuntimeError as e:
            if 'out of memory' in str(e):
                oom = True
            else:
                raise e

        if oom:
            print("* Warning: out-of-memory in warming up. This is due to the largest batch is too big for the GPU.",
                  flush=True)
        else:
            print("* Warming up successfully.", flush=True)

        if self.opt.memory_profiling:
            if hasattr(torch.cuda, 'memory_summary'):
                print(torch.cuda.memory_summary())
            exit()

        pass

    def save(self, epoch, valid_ppl, itr=None):
        """Write a checkpoint (model, dicts, opt, optimizer, amp, iterator
        state) and prune old checkpoints beyond ``opt.keep_save_files``.

        :param epoch: epoch number (may be fractional for mid-epoch saves).
        :param valid_ppl: validation perplexity, embedded in the file name.
        :param itr: optional data iterator whose state is checkpointed.
        """
        opt = self.opt
        model = self.model
        dicts = self.dicts

        model_state_dict = self.model.state_dict()
        optim_state_dict = self.optim.state_dict()

        if itr:
            itr_state_dict = itr.state_dict()
        else:
            itr_state_dict = None

        # drop a checkpoint
        checkpoint = {
            'model': model_state_dict,
            'dicts': dicts,
            'opt': opt,
            'epoch': epoch,
            'itr': itr_state_dict,
            'optim': optim_state_dict,
            'amp': amp.state_dict()
        }

        file_name = '%s_ppl_%.6f_e%.2f.pt' % (opt.save_model, valid_ppl, epoch)
        print('Writing to %s' % file_name)
        torch.save(checkpoint, file_name)

        # check the save directory here
        checkpoint_dir = os.path.dirname(opt.save_model)
        existed_save_files = checkpoint_paths(checkpoint_dir)
        for save_file in existed_save_files[opt.keep_save_files:]:
            print(" * Deleting old save file %s ...." % save_file)
            os.remove(save_file)

    def eval(self, data):
        """Evaluate on ``data`` and return the per-word loss as a tensor
        (total loss / total target words), all-reduced across ranks.

        Temporarily switches model and loss to eval mode; restores train mode
        before returning.
        """
        opt = self.opt
        rank = self.device
        world_size = self.world_size

        # the data iterator creates an epoch iterator
        data_iterator = generate_data_iterator(data, rank, world_size, seed=self.opt.seed,
                                               num_workers=opt.num_workers,
                                               epoch=1, buffer_size=opt.buffer_size)
        epoch_iterator = data_iterator.next_epoch_itr(False, pin_memory=False)

        data_size = len(epoch_iterator)
        i = 0

        self.model.eval()
        self.loss_function.eval()
        # self.model.module.reset_states()

        total_loss = zero_tensor()
        total_words = zero_tensor()

        if opt.streaming:
            streaming_state = self.model.init_stream()
        else:
            streaming_state = None

        with torch.no_grad():
            while not data_iterator.end_of_epoch():
                samples = next(epoch_iterator)

                if samples:
                    batch = prepare_sample(samples, device=self.device,
                                           fp16=self.opt.fp16 and not self.opt.fp16_mixed)
                    targets = batch.get('target_output')
                    tgt_mask = targets.ne(onmt.constants.PAD)
                    outputs = self.model(batch, streaming=opt.streaming, target_mask=tgt_mask,
                                         mirror=opt.mirror_loss, streaming_state=streaming_state,
                                         nce=opt.nce)

                    outputs['tgt_mask'] = tgt_mask
                    loss_dict = self.loss_function(outputs, targets, model=self.model, eval=True)
                    loss_data = loss_dict['data']

                    total_loss.add_(loss_data)
                    total_words.add_(batch.tgt_size)
                    i = i + 1

        # allreduce the total loss and total words from other processes
        dist.all_reduce(total_loss, op=dist.ReduceOp.SUM, group=self.group)
        dist.all_reduce(total_words, op=dist.ReduceOp.SUM, group=self.group)

        self.model.train()
        self.loss_function.train()

        return total_loss / total_words

    def train_epoch(self, epoch, resume=False, itr_progress=None):
        """Train for one epoch with gradient accumulation
        (``opt.update_frequency`` mini-batches per optimizer step).

        :param epoch: 1-based epoch index (also seeds the data iterator).
        :param resume: when True, restore iterator state from itr_progress.
        :param itr_progress: saved iterator state dict (or None).
        :return: per-word training loss as a tensor (total loss / total words).
        """
        global rec_ppl
        opt = self.opt
        train_data = self.train_data
        streaming = opt.streaming

        # Clear the gradients of the model
        self.model.zero_grad()
        # self.model.module.reset_states()

        dataset = train_data
        data_iterator = generate_data_iterator(dataset, self.rank, self.world_size,
                                               seed=self.opt.seed, num_workers=opt.num_workers,
                                               epoch=epoch, buffer_size=opt.buffer_size)

        # TODO: fix resume which is currently buggy
        if resume:
            data_iterator.load_state_dict(itr_progress)

        epoch_iterator = data_iterator.next_epoch_itr(not streaming, pin_memory=opt.pin_memory)

        total_tokens, total_loss, total_words = zero_tensor(), zero_tensor(), zero_tensor()
        total_non_pads = zero_tensor()
        report_loss, report_tgt_words = zero_tensor(), zero_tensor()
        report_src_words = zero_tensor()
        report_rec_loss, report_rev_loss, report_mirror_loss = zero_tensor(), zero_tensor(), zero_tensor()
        start = time.time()
        n_samples = len(data_iterator)

        counter = 0
        num_accumulated_words = zero_tensor()
        num_accumulated_sents = zero_tensor()
        grad_scaler = 1

        nan = False
        nan_counter = zero_tensor()

        if opt.streaming:
            streaming_state = self.model.init_stream()
        else:
            streaming_state = None

        # Resume position within the epoch; scaled by world size because each
        # rank consumes a different shard of the same iterator.
        i = data_iterator.iterations_in_epoch if not isinstance(train_data, list) else epoch_iterator.n_yielded
        i = i * self.world_size

        while not data_iterator.end_of_epoch():
            # NOTE(review): `curriculum` is computed but never used below.
            curriculum = (epoch < opt.curriculum)

            # this batch generator is not very clean atm
            # TODO: move everything to the multiGPU trainer
            samples = next(epoch_iterator)

            batch = prepare_sample(samples, device=self.device,
                                   fp16=self.opt.fp16 and not self.opt.fp16_mixed)

            if opt.streaming:
                if train_data.is_new_stream():
                    streaming_state = self.model.init_stream()
            else:
                streaming_state = None

            # TODO: dealing with oom during distributed training
            oom = False

            try:
                # outputs is a dictionary containing keys/values necessary for loss function
                # can be flexibly controlled within models for easier extensibility
                counter = counter + 1
                # reduction_disabled = False if counter >= opt.update_frequency or i == (n_samples-1) else True
                # self.model.require_backward_grad_sync = not reduction_disabled

                targets = batch.get('target_output')
                tgt_mask = targets.ne(onmt.constants.PAD)
                outputs = self.model(batch, streaming=opt.streaming, target_mask=tgt_mask,
                                     zero_encoder=opt.zero_encoder,
                                     mirror=opt.mirror_loss, streaming_state=streaming_state,
                                     nce=opt.nce)

                batch_size = batch.size

                outputs['tgt_mask'] = tgt_mask

                loss_dict = self.loss_function(outputs, targets, model=self.model)
                loss_data = loss_dict['data']
                loss = loss_dict['loss']  # a little trick to avoid gradient overflow with fp16
                full_loss = loss

                if opt.mirror_loss:
                    rev_loss = loss_dict['rev_loss']
                    rev_loss_data = loss_dict['rev_loss_data']
                    mirror_loss = loss_dict['mirror_loss']
                    full_loss = full_loss + rev_loss + mirror_loss
                    mirror_loss_data = loss_dict['mirror_loss'].item()
                else:
                    rev_loss_data = None
                    mirror_loss_data = 0

                # reconstruction loss
                if opt.reconstruct:
                    rec_loss = loss_dict['rec_loss']
                    rec_loss = rec_loss
                    full_loss = full_loss + rec_loss
                    rec_loss_data = loss_dict['rec_loss_data']
                else:
                    rec_loss_data = None

                if opt.lfv_multilingual:
                    lid_logits = outputs['lid_logits']
                    lid_labels = batch.get('target_lang')
                    lid_loss_function = self.loss_function.get_loss_function('lid_loss')
                    lid_loss = lid_loss_function(lid_logits, lid_labels)
                    full_loss = full_loss + lid_loss

                optimizer = self.optim.optimizer

                # When the batch size is large, each gradient step is very easy to explode on fp16
                # Normalizing the loss to grad scaler ensures this will not happen
                full_loss.div_(grad_scaler)

                # reduction_disabled = False
                with amp.scale_loss(full_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()

                del outputs

            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('[WARNING]: ran out of memory on GPU %d' % self.rank, flush=True)
                    oom = True
                    torch.cuda.empty_cache()
                    loss = 0
                    if opt.streaming:  # reset stream in this case ...
                        streaming_state = self.model.init_stream()
                    # NOTE(review): re-raised even after the OOM cleanup above,
                    # so the recovery code is effectively unreachable — confirm.
                    raise e
                else:
                    raise e

            batch_size = batch.size
            src_size = batch.src_size
            tgt_size = batch.tgt_size

            num_accumulated_words.add_(tgt_size)
            num_accumulated_sents.add_(batch_size)

            # We only update the parameters after getting gradients from n mini-batches
            update_flag = False
            if counter >= opt.update_frequency:
                update_flag = True
            elif i == n_samples - 1:  # update for the last minibatch
                update_flag = True

            if update_flag:
                # accumulated gradient case, in this case the update frequency
                dist.all_reduce(num_accumulated_words, op=dist.ReduceOp.SUM, group=self.group)
                # if (counter == 1 and self.opt.update_frequency != 1) or counter > 1:
                grad_denom = 1 / grad_scaler
                if self.opt.normalize_gradient:
                    grad_denom = num_accumulated_words.item() * grad_denom
                else:
                    grad_denom = 1
                # When we accumulate the gradients, each gradient is already normalized by a constant grad_scaler
                normalize_gradients(amp.master_params(optimizer), grad_denom)
                # Update the parameters.
                self.optim.step()
                self.optim.zero_grad()
                self.model.zero_grad()
                counter = 0
                num_accumulated_words.zero_()
                num_accumulated_sents.zero_()
                num_updates = self.optim._step

                # Periodic mid-epoch checkpoint; the modular trick fires on the
                # step just before each multiple of save_every.
                if opt.save_every > 0 and num_updates % opt.save_every == -1 % opt.save_every:
                    valid_loss = self.eval(self.valid_data)
                    valid_ppl = math.exp(min(valid_loss, 100))

                    if self.is_main():
                        print('Validation perplexity: %g' % valid_ppl)

                    ep = float(epoch) - 1. + ((float(i) + 1.) / n_samples)
                    self.save(ep, valid_ppl, itr=data_iterator)

            num_words = tgt_size
            report_loss.add_(loss_data)
            report_tgt_words.add_(num_words)
            report_src_words.add_(src_size)
            total_loss.add_(loss_data)
            total_words.add_(num_words)
            # total_tokens += batch.get('target_output').nelement()
            # total_non_pads += batch.get('target_output').ne(onmt.constants.PAD).sum().item()
            # batch_efficiency = total_non_pads / total_tokens

            if opt.reconstruct:
                report_rec_loss.add_(rec_loss_data)

            if opt.mirror_loss:
                report_rev_loss.add_(rev_loss_data)
                report_mirror_loss.add_(mirror_loss_data)

            # control the index a little bit to ensure the log is always printed
            if i == 0 or ((i + 1) % opt.log_interval < self.world_size):
                dist.all_reduce(report_loss, op=dist.ReduceOp.SUM, group=self.group)
                dist.all_reduce(report_tgt_words, op=dist.ReduceOp.SUM, group=self.group)
                dist.all_reduce(report_src_words, op=dist.ReduceOp.SUM, group=self.group)

                if self.is_main():
                    log_string = ("Epoch %2d, %5d/%5d; ; ppl: %6.2f ; " %
                                  (epoch, i + 1, len(data_iterator),
                                   math.exp(report_loss.item() / report_tgt_words.item())))

                    if opt.reconstruct:
                        # NOTE(review): this all_reduce runs only on the main
                        # rank (inside is_main) — with world_size > 1 the other
                        # ranks never join it; confirm this cannot deadlock.
                        dist.all_reduce(report_rec_loss, op=dist.ReduceOp.SUM, group=self.group)
                        rec_ppl = math.exp(report_rec_loss.item() / report_src_words.item())
                        log_string += (" rec_ppl: %6.2f ; " % rec_ppl)

                    if opt.mirror_loss:
                        # NOTE(review): same rank-conditional all_reduce concern
                        # as the reconstruct branch above.
                        dist.all_reduce(report_rev_loss, op=dist.ReduceOp.SUM, group=self.group)
                        rev_ppl = math.exp(report_rev_loss.item() / report_tgt_words.item())
                        log_string += (" rev_ppl: %6.2f ; " % rev_ppl)
                        log_string += (" mir_loss: %6.2f ; " % (report_mirror_loss / report_tgt_words))

                    log_string += ("lr: %.7f ; updates: %7d; " %
                                   (self.optim.getLearningRate(), self.optim._step))
                    log_string += ("%5.0f src tok/s; %5.0f tgt tok/s; " %
                                   (report_src_words.item() / (time.time() - start),
                                    report_tgt_words.item() / (time.time() - start)))
                    log_string += ("%s elapsed" %
                                   str(datetime.timedelta(seconds=int(time.time() - self.start_time))))

                    self.print(log_string, flush=True)

                report_loss.zero_()
                report_tgt_words.zero_()
                report_src_words.zero_()
                report_rec_loss.zero_()
                report_rev_loss.zero_()
                report_mirror_loss.zero_()
                start = time.time()

            # increase i by world size
            i = i + self.world_size

        return total_loss / total_words

    # def run(self, save_file=None):
    def run(self, checkpoint=None):
        """Top-level training driver: optional weight loading, allocator
        warm-up, initial validation, then the epoch loop with a checkpoint
        after every epoch.

        :param checkpoint: resuming from a checkpoint is not implemented for
            the distributed trainer — passing one raises NotImplementedError.
        """
        opt = self.opt

        if checkpoint is not None:
            raise NotImplementedError  # TODO: have loading checkpoints for each process
            # Dead code below (kept from the single-GPU trainer): restores
            # model/optimizer/amp/iterator state from the checkpoint.
            self.model.load_state_dict(checkpoint['model'])
            prec_opt = checkpoint['opt'] if 'opt' in checkpoint else None

            opt.reset_optim = True

            if not opt.reset_optim:
                if self.is_main():
                    print("* Loading optimizer states ... ")
                self.optim.load_state_dict(checkpoint['optim'])
                if prec_opt is not None and hasattr(prec_opt, "fp16_mixed"):
                    # Only load amp information if the mode is the same
                    # Maybe its better to change between optimization mode?
                    if opt.fp16_mixed == prec_opt.fp16_mixed and opt.fp16 == prec_opt.fp16:
                        if 'amp' in checkpoint:
                            try:
                                amp.load_state_dict(checkpoint['amp'])
                            except Exception:
                                # loading the amp state can fail
                                pass

                # Only load the progress when we use the same optimizer
                if 'itr' in checkpoint:
                    itr_progress = checkpoint['itr']
                else:
                    itr_progress = None

                resume = True
                start_epoch = checkpoint['epoch'] if 'epoch' in checkpoint else 1
                if start_epoch is None:
                    start_epoch = 1
            else:
                itr_progress = None
                resume = False
                start_epoch = 1

            del checkpoint['model']
            optim_state_dict = checkpoint['optim']
            # del checkpoint['optim']
            del checkpoint
        else:
            itr_progress = None
            resume = False
            start_epoch = 1

        if opt.load_encoder_from:
            self.load_encoder_weight(opt.load_encoder_from)
        #
        if opt.load_decoder_from:
            self.load_decoder_weight(opt.load_decoder_from)

        # if we are on a GPU: warm up the memory allocator
        if self.cuda:
            self.warm_up()

        valid_loss = self.eval(self.valid_data)
        valid_ppl = math.exp(min(valid_loss, 100))

        if self.is_main():
            print('[INFO] Validation perplexity: %g' % valid_ppl, flush=True)

        self.start_time = time.time()

        for epoch in range(start_epoch, start_epoch + opt.epochs):
            self.print('')

            # (1) train for one epoch on the training set
            train_loss = self.train_epoch(epoch, resume=resume, itr_progress=itr_progress)
            train_ppl = math.exp(min(train_loss, 100))
            self.print('[INFO] Train perplexity: %g' % train_ppl)

            # (2) evaluate on the validation set
            valid_loss = self.eval(self.valid_data)
            valid_ppl = math.exp(min(valid_loss, 100))

            if self.is_main():
                print('[INFO] Validation perplexity: %g' % valid_ppl)

            self.save(epoch, valid_ppl)
            itr_progress = None
            resume = False
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( '--classifier', default='guoday', type=str, required=True, help='classifier type, guoday or MLP or GRU_MLP or ...') parser.add_argument('--optimizer', default='RAdam', type=str, required=True, help='optimizer we use, RAdam or ...') parser.add_argument("--do_label_smoothing", default='yes', type=str, required=True, help="Whether to do label smoothing. 
yes or no.") parser.add_argument('--draw_loss_steps', default=1, type=int, required=True, help='training steps to draw loss') parser.add_argument('--label_name', default='label', type=str, required=True, help='label name in original train set index') ## Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", default='yes', type=str, required=True, help="Whether to run training. yes or no.") parser.add_argument("--do_test", default='yes', type=str, required=True, help="Whether to run training. yes or no.") parser.add_argument("--do_eval", default='yes', type=str, required=True, help="Whether to run eval on the dev set. yes or no.") parser.add_argument( "--evaluate_during_training", action='store_true', help="Rul evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." ) parser.add_argument("--eval_steps", default=200, type=int, help="") parser.add_argument("--lstm_hidden_size", default=300, type=int, help="") parser.add_argument("--lstm_layers", default=2, type=int, help="") parser.add_argument("--dropout", default=0.5, type=float, help="") parser.add_argument("--train_steps", default=-1, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=3, type=int, help="text split") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") 
parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) try: os.makedirs(args.output_dir) except: pass tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) # tensorboard_log_dir = args.output_dir # loss_now = tf.placeholder(dtype=tf.float32, name='loss_now') # loss_mean = tf.placeholder(dtype=tf.float32, name='loss_mean') # loss_now_variable = loss_now # 
loss_mean_variable = loss_mean # train_loss = tf.summary.scalar('train_loss', loss_now_variable) # dev_loss_mean = tf.summary.scalar('dev_loss_mean', loss_mean_variable) # merged = tf.summary.merge([train_loss, dev_loss_mean]) config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3) config.hidden_dropout_prob = args.dropout # Prepare model if args.do_train == 'yes': model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.do_train == 'yes': print( '________________________now training______________________________' ) # Prepare data loader train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True, label_name=args.label_name) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, args.split_num, True) # print('train_feature_size=', train_features.__sizeof__()) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # print('train_data=',train_data[0]) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = 
DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps) num_train_optimization_steps = args.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.optimizer == 'RAdam': optimizer = RAdam(optimizer_grouped_parameters, lr=args.learning_rate) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 model.train() tr_loss = 0 loss_batch = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps) train_dataloader = cycle(train_dataloader) # with tf.Session() as sess: # summary_writer = tf.summary.FileWriter(tensorboard_log_dir, sess.graph) # sess.run(tf.global_variables_initializer()) list_loss_mean = [] bx = [] eval_F1 = [] ax = [] for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if args.n_gpu > 1: loss = loss.mean() # mean() to average on 
multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() loss_batch += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: # optimizer.backward(loss) loss.backward() else: loss.backward() # draw loss every n docs if (step + 1) % int(args.draw_loss_steps / (args.train_batch_size / args.gradient_accumulation_steps)) == 0: list_loss_mean.append(round(loss_batch, 4)) bx.append(step + 1) plt.plot(bx, list_loss_mean, label='loss_mean', linewidth=1, color='b', marker='o', markerfacecolor='green', markersize=2) plt.savefig(args.output_dir + '/labeled.jpg') loss_batch = 0 # paras update every batch data. if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 # report results every 200 real batch. if step % (args.eval_steps * args.gradient_accumulation_steps) == 0 and step > 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) # do evaluation totally 10 times during training stage. 
if args.do_eval == 'yes' and (step + 1) % int( num_train_optimization_steps / 10) == 0 and step > 450: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True, label_name=args.label_name) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 
gold_labels = np.concatenate(gold_labels, 0) inference_labels = np.concatenate(inference_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() ############################################### num_gold_0 = np.sum(gold_labels == 0) num_gold_1 = np.sum(gold_labels == 1) num_gold_2 = np.sum(gold_labels == 2) right_0 = 0 right_1 = 0 right_2 = 0 error_0 = 0 error_1 = 0 error_2 = 0 for gold_label, inference_label in zip( gold_labels, inference_labels): if gold_label == inference_label: if gold_label == 0: right_0 += 1 elif gold_label == 1: right_1 += 1 else: right_2 += 1 elif inference_label == 0: error_0 += 1 elif inference_label == 1: error_1 += 1 else: error_2 += 1 recall_0 = right_0 / (num_gold_0 + 1e-5) recall_1 = right_1 / (num_gold_1 + 1e-5) recall_2 = right_2 / (num_gold_2 + 1e-5) precision_0 = right_0 / (error_0 + right_0 + 1e-5) precision_1 = right_1 / (error_1 + right_1 + 1e-5) precision_2 = right_2 / (error_2 + right_2 + 1e-5) f10 = 2 * precision_0 * recall_0 / (precision_0 + recall_0 + 1e-5) f11 = 2 * precision_1 * recall_1 / (precision_1 + recall_1 + 1e-5) f12 = 2 * precision_2 * recall_2 / (precision_2 + recall_2 + 1e-5) output_dev_result_file = os.path.join( args.output_dir, "dev_results.txt") with open(output_dev_result_file, 'a', encoding='utf-8') as f: f.write('precision:' + str(precision_0) + ' ' + str(precision_1) + ' ' + str(precision_2) + '\n') f.write('recall:' + str(recall_0) + ' ' + str(recall_1) + ' ' + str(recall_2) + '\n') f.write('f1:' + str(f10) + ' ' + str(f11) + ' ' + str(f12) + '\n' + '\n') eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) # draw loss. 
eval_F1.append(round(eval_accuracy, 4)) ax.append(step) plt.plot(ax, eval_F1, label='eval_F1', linewidth=1, color='r', marker='o', markerfacecolor='blue', markersize=2) for a, b in zip(ax, eval_F1): plt.text(a, b, b, ha='center', va='bottom', fontsize=8) plt.savefig(args.output_dir + '/labeled.jpg') result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("more accurate model arises, now best F1 = ", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model, only save the model it-self model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) ''' if (step+1) / int(num_train_optimization_steps/10) > 9.5: print("=" * 80) print("End of training. 
Saving Model......") # Save a trained model, only save the model it-self model_to_save = model.module if hasattr(model, 'module') else model output_model_file = os.path.join(args.output_dir, "pytorch_model_final_step.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) ''' if args.do_test == 'yes': start_time = time.time() print( '___________________now testing for best eval f1 model_________________________' ) try: del model except: pass gc.collect() args.do_train = 'no' model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) model.half() for layer in model.modules(): if isinstance(layer, torch.nn.modules.batchnorm._BatchNorm): layer.float() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from " "https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('test.csv', 'test')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False, label_name=args.label_name) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) 
model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() # print('test_logits=', logits) label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) if flag == 'dev': print(flag, accuracy(logits, gold_labels)) elif flag == 'test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False) # df[['id', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False) else: raise ValueError('flag not in [dev, test]') print('inference time usd = {}s'.format(time.time() - start_time)) '''
class Agent():
    """Rainbow-style DQN agent.

    Wraps an online and a target network (project-local ``DQN``), an Adam
    optimizer, NVIDIA apex ``amp`` mixed precision, and (optionally) apex
    ``DistributedDataParallel``. Supports both a categorical (C51) head and a
    plain scalar-Q head, selected by ``args.categorical``.

    NOTE(review): source indentation was lost; statement grouping below was
    reconstructed to match the standard Rainbow reference implementation —
    confirm against the original file.
    """

    def __init__(self, args, action_space):
        # Hyper-parameters read off the CLI args object.
        self.action_space = action_space
        self.n = args.multi_step            # n-step return horizon
        self.discount = args.discount       # per-step discount γ
        self.target_update = args.target_update
        self.categorical = args.categorical
        self.noisy_linear = args.noisy_linear
        self.double_q = args.double_q
        self.max_grad_norm = args.max_grad_norm
        self.device = torch.device('cuda', args.gpu)
        self.num_param_updates = 0
        if args.categorical:
            # Distributional (C51) parameters: fixed support z of `atoms`
            # equally spaced values in [v_min, v_max].
            self.atoms = args.atoms
            self.v_min = args.v_min
            self.v_max = args.v_max
            self.support = torch.linspace(
                self.v_min, args.v_max,
                self.atoms).to(device=self.device)  # Support (range) of z
            self.delta_z = (args.v_max - self.v_min) / (self.atoms - 1)
        self.online_net = DQN(args, self.action_space.n).to(device=self.device)
        if args.model and os.path.isfile(args.model):
            # Always load tensors onto CPU by default, will shift to GPU if necessary
            self.online_net.load_state_dict(
                torch.load(args.model, map_location='cpu'))
        self.online_net.train()
        self.target_net = DQN(args, self.action_space.n).to(device=self.device)
        self.update_target_net()
        # Target net is a frozen copy: eval mode, no gradients.
        self.target_net.eval()
        for param in self.target_net.parameters():
            param.requires_grad = False
        self.optimizer = optim.Adam(self.online_net.parameters(),
                                    lr=args.lr,
                                    eps=args.adam_eps,
                                    amsgrad=True)
        # amp.initialize accepts a list of models; both nets are cast together
        # so target-net copies stay dtype-compatible with the online net.
        [self.online_net, self.target_net
         ], self.optimizer = amp.initialize([self.online_net, self.target_net],
                                            self.optimizer,
                                            opt_level=args.opt_level,
                                            loss_scale=args.loss_scale)
        if args.distributed:
            self.online_net = DDP(self.online_net, delay_allreduce=True)
            self.target_net = DDP(self.target_net, delay_allreduce=True)

    # Resets noisy weights in all linear layers (of online net only)
    def reset_noise(self):
        # After DDP wrapping the real module lives at `.module`.
        if isinstance(self.online_net, DQN):
            self.online_net.reset_noise()
        else:
            self.online_net.module.reset_noise()

    # Acts based on single state (no batch)
    def act(self, state):
        """Greedy action(s) for `state`, returned on the state's device."""
        with torch.no_grad():
            probs = self.online_net(state.to(self.device))
            if self.categorical:
                # Expected value per action: sum_z z * p(s, a, z).
                probs = self.support.expand_as(probs) * probs
            actions = probs.sum(-1).argmax(-1).to(state.device)
            return actions

    # Acts with an ε-greedy policy (used for evaluation only)
    def act_e_greedy(
            self,
            state,
            epsilon=0.001):  # High ε can reduce evaluation scores drastically
        actions = self.act(state)
        # Replace a random ε-fraction of greedy actions with uniform ones.
        mask = torch.rand(
            state.size(0), device=state.device,
            dtype=torch.float32) < epsilon
        masked = mask.sum().item()
        if masked > 0:
            actions[mask] = torch.randint(0,
                                          self.action_space.n, (masked, ),
                                          device=state.device,
                                          dtype=torch.long)
        return actions

    def learn(self, states, actions, returns, next_states, nonterminals,
              weights):
        """One gradient step on a replay batch; returns per-sample loss.

        `weights` are importance-sampling weights from prioritized replay;
        the detached per-sample loss is returned so the caller can update
        priorities.
        """
        tactions = actions.unsqueeze(-1).unsqueeze(-1)
        if self.categorical:
            tactions = tactions.expand(-1, -1, self.atoms)
        # Calculate current state probabilities (online network noise already sampled)
        nvtx.range_push('agent:online (state) probs')
        ps = self.online_net(
            states, log=True)  # Log probabilities log p(s_t, ·; θonline)
        ps_a = ps.gather(1, tactions)  # log p(s_t, a_t; θonline)
        nvtx.range_pop()
        with torch.no_grad():
            if isinstance(self.target_net, DQN):
                self.target_net.reset_noise()
            else:
                self.target_net.module.reset_noise(
                )  # Sample new target net noise
            nvtx.range_push('agent:target (next state) probs')
            tns = self.target_net(
                next_states)  # Probabilities p(s_t+n, ·; θtarget)
            nvtx.range_pop()
            if self.double_q:
                # Calculate nth next state probabilities
                nvtx.range_push('agent:online (next state) probs')
                pns = self.online_net(
                    next_states)  # Probabilities p(s_t+n, ·; θonline)
                nvtx.range_pop()
            else:
                pns = tns
            if self.categorical:
                pns = self.support.expand_as(
                    pns
                ) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))
            # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
            argmax_indices_ns = pns.sum(-1).argmax(-1).unsqueeze(-1).unsqueeze(
                -1)
            if self.categorical:
                argmax_indices_ns = argmax_indices_ns.expand(
                    -1, -1, self.atoms)
            pns_a = tns.gather(
                1, argmax_indices_ns
            )  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)
            if self.categorical:
                # Compute Tz (Bellman operator T applied to z)
                # Tz = R^n + (γ^n)z (accounting for terminal states)
                Tz = returns.unsqueeze(-1) + nonterminals.float().unsqueeze(
                    -1) * (self.discount**self.n) * self.support.unsqueeze(0)
                Tz = Tz.clamp(min=self.v_min,
                              max=self.v_max)  # Clamp between supported values
                # Compute L2 projection of Tz onto fixed support z
                b = (Tz - self.v_min) / self.delta_z  # b = (Tz - Vmin) / Δz
                l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
                # Fix disappearing probability mass when l = b = u (b is int)
                l[(u > 0) * (l == u)] -= 1
                u[(l < (self.atoms - 1)) * (l == u)] += 1
                # Distribute probability of Tz
                batch_size = states.size(0)
                m = states.new_zeros(batch_size, self.atoms)
                # Flat-index offsets so index_add_ can scatter the whole batch
                # into the flattened (batch * atoms) view at once.
                offset = torch.linspace(0, ((batch_size - 1) * self.atoms),
                                        batch_size).unsqueeze(1).expand(
                                            batch_size, self.atoms).to(actions)
                m.view(-1).index_add_(
                    0, (l + offset).view(-1),
                    (pns_a.squeeze(1) * (u.float() - b)
                     ).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
                m.view(-1).index_add_(
                    0, (u + offset).view(-1),
                    (pns_a.squeeze(1) * (b - l.float())
                     ).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)
            else:
                # Scalar n-step TD target for the non-distributional head.
                Tz = returns + nonterminals.float() * (
                    self.discount**self.n) * pns_a.squeeze(-1).squeeze(-1)
        if self.categorical:
            loss = -torch.sum(
                m * ps_a.squeeze(1),
                1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
            weights = weights.unsqueeze(-1)
        else:
            loss = F.mse_loss(ps_a.squeeze(-1).squeeze(-1),
                              Tz,
                              reduction='none')
        nvtx.range_push('agent:loss + step')
        self.optimizer.zero_grad()
        weighted_loss = (weights * loss).mean()
        # amp scales the loss to avoid fp16 gradient underflow; clip on the
        # fp32 master params, not the (possibly fp16) model params.
        with amp.scale_loss(weighted_loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer),
                                       self.max_grad_norm)
        self.optimizer.step()
        nvtx.range_pop()
        return loss.detach()

    def update_target_net(self):
        """Hard-copy online-net weights into the target net."""
        self.target_net.load_state_dict(self.online_net.state_dict())

    # Save model parameters on current device (don't move model between devices)
    def save(self, path):
        torch.save(self.online_net.state_dict(),
                   os.path.join(path, 'model.pth'))

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        with torch.no_grad():
            q = self.online_net(state.unsqueeze(0).to(self.device))
            if self.categorical:
                q *= self.support
            return q.sum(-1).max(-1)[0].item()

    def train(self):
        # Only the online net toggles modes; target net stays in eval().
        self.online_net.train()

    def eval(self):
        self.online_net.eval()

    def __str__(self):
        return self.online_net.__str__()
def test(args):
    """Evaluate a saved word-segmentation model (project ``WMSeg``).

    Loads the checkpoint named by ``args.eval_model``, runs sequential
    batched inference over ``args.eval_data_path``, converts predicted tag
    ids back to labels, and prints word-level P/R/F plus OOV recall.

    NOTE(review): indentation was reconstructed from a whitespace-mangled
    source; statement grouping should be confirmed against the original.
    """
    # Device / distributed setup mirrors the usual BERT example scripts.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))
    # Rebuild the model from the serialized spec + weights.
    seg_model_checkpoint = torch.load(args.eval_model)
    seg_model = WMSeg.from_spec(seg_model_checkpoint['spec'],
                                seg_model_checkpoint['state_dict'], args)
    eval_examples = seg_model.load_data(args.eval_data_path)
    # Bind model helpers locally for the batching loop below.
    convert_examples_to_features = seg_model.convert_examples_to_features
    feature2input = seg_model.feature2input
    num_labels = seg_model.num_labels
    word2id = seg_model.word2id
    # Invert label->id map so predicted ids can be turned back into labels.
    label_map = {v: k for k, v in seg_model.labelmap.items()}
    if args.fp16:
        seg_model.half()
    seg_model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        seg_model = DDP(seg_model)
    elif n_gpu > 1:
        seg_model = torch.nn.DataParallel(seg_model)
    seg_model.to(device)
    seg_model.eval()
    eval_loss, eval_accuracy = 0, 0  # accumulated but not reported here
    nb_eval_steps, nb_eval_examples = 0, 0
    y_true = []  # per-sentence gold label sequences
    y_pred = []  # per-sentence predicted label sequences
    # Manual sequential batching over the raw example list.
    for start_index in tqdm(range(0, len(eval_examples), args.eval_batch_size)):
        eval_batch_examples = eval_examples[start_index: min(start_index + args.eval_batch_size,
                                                             len(eval_examples))]
        eval_features = convert_examples_to_features(eval_batch_examples)
        input_ids, input_mask, l_mask, label_ids, matching_matrix, ngram_ids, ngram_positions, \
            segment_ids, valid_ids, word_ids, word_mask = feature2input(device, eval_features)
        with torch.no_grad():
            _, tag_seq = seg_model(input_ids, segment_ids, input_mask,
                                   labels=label_ids,
                                   valid_ids=valid_ids,
                                   attention_mask_label=l_mask,
                                   word_seq=word_ids,
                                   label_value_matrix=matching_matrix,
                                   word_mask=word_mask,
                                   input_ngram_ids=ngram_ids,
                                   ngram_position_matrix=ngram_positions)
        # logits = torch.argmax(F.log_softmax(logits, dim=2),dim=2)
        # logits = logits.detach().cpu().numpy()
        logits = tag_seq.to('cpu').numpy()
        label_ids = label_ids.to('cpu').numpy()
        input_mask = input_mask.to('cpu').numpy()
        # Collect labels per sentence; position 0 is skipped (presumably the
        # [CLS] slot — confirm) and id num_labels-1 marks end-of-sequence.
        for i, label in enumerate(label_ids):
            temp_1 = []
            temp_2 = []
            for j, m in enumerate(label):
                if j == 0:
                    continue
                elif label_ids[i][j] == num_labels - 1:
                    y_true.append(temp_1)
                    y_pred.append(temp_2)
                    break
                else:
                    temp_1.append(label_map[label_ids[i][j]])
                    temp_2.append(label_map[logits[i][j]])
    # Flatten per-sentence sequences for word-level P/R/F.
    y_true_all = []
    y_pred_all = []
    sentence_all = []
    for y_true_item in y_true:
        y_true_all += y_true_item
    for y_pred_item in y_pred:
        y_pred_all += y_pred_item
    # Re-align source tokens with (possibly truncated) gold sequences.
    for example, y_true_item in zip(eval_examples, y_true):
        sen = example.text_a
        sen = sen.strip()
        sen = sen.split(' ')
        if len(y_true_item) != len(sen):
            sen = sen[:len(y_true_item)]
        sentence_all.append(sen)
    p, r, f = cws_evaluate_word_PRF(y_pred_all, y_true_all)
    oov = cws_evaluate_OOV(y_pred, y_true, sentence_all, word2id)
    print(args.eval_data_path)
    print("\nP: %f, R: %f, F: %f, OOV: %f" % (p, r, f, oov))
def main():
    """Training driver for reflectnet/resnet on imagenet / mnist variants.

    Parses CLI args, fills in per-dataset/per-optimizer defaults, builds the
    model + optimizer, initializes apex amp (and DDP when distributed),
    optionally resumes from a checkpoint, then runs the train/validate loop,
    checkpointing per epoch on rank 0.

    NOTE(review): indentation was reconstructed from a whitespace-mangled
    source; statement grouping should be confirmed against the original.
    """
    global best_prec1, args
    args = parse_args()
    if not args.amp:
        args.opt_level = 'O0'  # amp O0 == pure fp32 (amp effectively disabled)
        print0("fp32 only")
    print0("opt_level = {}".format(args.opt_level))
    print0("keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32))
    print0("loss_scale = {}".format(args.loss_scale))
    # on MNIST, adam performs better than ranger
    default_optimizer = {
        'imagenet': 'sgd',
        'tiny-imagenet': 'sgd',
        'mnist': 'adam',
        'mnist-rot': 'ranger'
    }
    # Defaults keyed by the *canonical* dataset (args.dataset2), then optimizer.
    default_lr = {
        'imagenet': {
            'sgd': 0.1,
            'ranger': 0.001,
            'adam': 0.001
        },
        'mnist': {
            'sgd': 0.001,
            'ranger': 0.0001,
            'adam': 0.0001
        }
    }
    default_gamma = {
        'imagenet': {
            'sgd': 0.1,
            'ranger': 0.2,
            'adam': 0.2
        },
        'mnist': {
            'sgd': 0.4,
            'ranger': 0.4,
            'adam': 0.4
        }
    }
    default_gamma_epoch = {'imagenet': 20, 'mnist': 3}
    default_weight_decay = {
        'sgd': {
            'reflectnet': 0.001,
            'resnet': 0.0001
        },
        'ranger': {
            'reflectnet': 0.1,
            'resnet': 0.01
        },
        'adam': {
            'reflectnet': 0.1,
            'resnet': 0.01
        }
    }
    # dataset2 maps dataset variants onto the base dataset used for defaults.
    if args.dataset == 'mnist-rot':
        args.dataset2 = 'mnist'
    elif args.dataset == 'tiny-imagenet':
        args.dataset2 = 'imagenet'
    else:
        args.dataset2 = args.dataset
    # -1 / None act as "not set on the command line" sentinels.
    if args.optimizer is None:
        args.optimizer = default_optimizer[args.dataset]
    if args.lr == -1:
        args.lr = default_lr[args.dataset2][args.optimizer]
    if args.gamma == -1:
        args.gamma = default_gamma[args.dataset2][args.optimizer]
    args.gamma_epoch = default_gamma_epoch[args.dataset2]
    cudnn.benchmark = True
    best_prec1 = 0
    if args.deterministic:
        # Reproducibility: disable autotuner, fix the seed per rank.
        cudnn.benchmark = False
        cudnn.deterministic = True
        torch.manual_seed(args.local_rank)
        torch.set_printoptions(precision=10)
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        # Launched via torch.distributed: world size comes from the launcher.
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.distributed = args.world_size > 1
    else:
        args.gpu = 0
        args.world_size = 1
    # Per-process batch = global batch / world size.
    args.train_batch_size //= args.world_size
    args.test_batch_size = int(args.train_batch_size * 1.6)
    use_reflectnet = False
    use_resnet = False
    if re.match('^reflect', args.arch):
        use_reflectnet = True
    elif re.match('^resnet', args.arch):
        use_resnet = True
    else:
        print0("Only reflectnet and resnet are supported")
        exit(0)
    if args.weight_decay == -1:
        if use_resnet:
            args.weight_decay = default_weight_decay[args.optimizer]['resnet']
        if use_reflectnet:
            args.weight_decay = default_weight_decay[
                args.optimizer]['reflectnet']
    if args.dataset2 == 'mnist':
        # MNIST overrides whatever batch sizes were derived above.
        args.train_batch_size = 128
        args.test_batch_size = 1000
        args.do_pool1 = False
        args.num_classes = 10
        args.warmup_epochs = 0
        args.epochs = 10
        args.normal_batch_size = args.train_batch_size
    elif args.dataset2 == 'imagenet':
        args.do_pool1 = True
        args.normal_batch_size = 256.  # reference batch size for LR scaling
        if args.dataset == 'imagenet':
            args.num_classes = 1000
            args.warmup_epochs = 5
        else:
            args.num_classes = 200
            args.warmup_epochs = 2
    print0("Weight decay: %.4f. LR decay: %.4f every %d epochs" %( \
        args.weight_decay, args.gamma, args.gamma_epoch))
    n_gpu = torch.cuda.device_count()
    if args.distributed:
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
    print0("n_gpu: {}, world size: {}, rank: {}, amp: {}".format(
        n_gpu, args.world_size, args.local_rank, args.amp))
    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."
    # create model
    if args.pretrained:
        print0("=> using pre-trained model '{}', {} classes".format(
            args.arch, args.num_classes))
        # NOTE(review): only resnet has a pretrained path here — with
        # `--pretrained` and a reflectnet arch, `model` is never assigned
        # and the later `model.cuda()` raises NameError.
        if use_resnet:
            model = resnet.__dict__[args.arch](pretrained=True,
                                               num_classes=args.num_classes,
                                               do_pool1=args.do_pool1)
    else:
        if use_resnet:
            model = resnet.__dict__[args.arch](num_classes=args.num_classes,
                                               do_pool1=args.do_pool1)
            print0("=> creating model '{}', {} classes".format(
                args.arch, args.num_classes))
        elif use_reflectnet:
            model = reflectnet.__dict__[args.arch](
                num_cycles=args.cycles,
                num_classes=args.num_classes,
                cycle_trans=args.cycle_trans)
            print0("=> creating model '{}', cycles: {}, {} classes".format(
                args.arch, args.cycles, args.num_classes))
    if use_reflectnet:
        # Allow running a trained model with a different cycle count.
        if args.runcycles == -1:
            args.runcycles = args.cycles
        if args.runcycles != args.cycles:
            print0("Runtime cycles %d != model cycles %d" %
                   (args.runcycles, args.cycles))
            print0("Set all BasicBlock/Bottleneck num_cycles to %d" %
                   args.runcycles)
            model.apply(set_cycles(args.runcycles))
    if args.sync_bn:
        import apex
        print0("using apex synced BN")
        model = apex.parallel.convert_syncbn_model(model)
    model = model.cuda()
    # Scale learning rate based on global batch size
    args.lr = args.lr * float(
        args.train_batch_size * args.world_size) / args.normal_batch_size
    if args.freezebone:
        # Freeze everything except the final fc layer.
        grad_names, nograd_names = {}, {}
        for name, value in model.named_parameters():
            if re.match(r"^fc\.", name):
                grad_names[name] = 1
                continue
            value.requires_grad = False
            nograd_names[name] = 1
    # All three optimizers are constructed eagerly; one is selected below.
    sgd_optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    adam_optimizer = torch.optim.Adam(model.parameters(),
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    ranger_optimizer = Ranger(model.parameters(),
                              lr=args.lr,
                              weight_decay=args.weight_decay)
    if args.optimizer == 'sgd':
        optimizer = sgd_optimizer
    elif args.optimizer == 'adam':
        optimizer = adam_optimizer
    elif args.optimizer == 'ranger':
        optimizer = ranger_optimizer
    if not args.eval_only:
        print0("Optimizer =", optimizer)
    print0("Initial LR: %.4f" % args.lr)
    # Initialize Amp. Amp accepts either values or strings for the optional override arguments,
    # for convenient interoperation with argparse.
    model, optimizer = amp.initialize(
        model,
        optimizer,
        opt_level=args.opt_level,
        keep_batchnorm_fp32=args.keep_batchnorm_fp32,
        loss_scale=args.loss_scale)
    # For distributed training, wrap the model with apex.parallel.DistributedDataParallel.
    # This must be done AFTER the call to amp.initialize. If model = DDP(model) is called
    # before model, ... = amp.initialize(model, ...), the call to amp.initialize may alter
    # the types of model's parameters in a way that disrupts or destroys DDP's allreduce hooks.
    if args.distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps communication with
        # computation in the backward pass.
        # model = DDP(model)
        # delay_allreduce delays all communication to the end of the backward pass.
        model = DDP(model, delay_allreduce=True)
        real_model = model.module
    else:
        real_model = model
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    # Optionally resume from a checkpoint
    if args.resume_cp:
        # Use a local scope to avoid dangling references
        def resume():
            if os.path.isfile(args.resume_cp):
                print0("=> loading checkpoint '{}'... ".format(args.resume_cp))
                # Map storages directly onto this rank's GPU.
                checkpoint = torch.load(
                    args.resume_cp,
                    map_location=lambda storage, loc: storage.cuda(args.gpu))
                if args.start_epoch == -1:
                    args.start_epoch = checkpoint['epoch']
                # NOTE(review): this assignment creates a *local* best_prec1
                # inside resume() — the module-level best_prec1 is not updated
                # (no `global best_prec1` declaration in this nested function).
                best_prec1 = checkpoint['best_prec1']
                # Drop the classifier head if the class counts differ.
                cp_num_classes = checkpoint['state_dict']['fc.weight'].shape[0]
                if cp_num_classes != args.num_classes:
                    del checkpoint['state_dict']['fc.weight']
                    del checkpoint['state_dict']['fc.bias']
                    print0("checkpoint classes %d != current classes %d, drop FC" %( \
                        cp_num_classes, args.num_classes))
                real_model.load_state_dict(checkpoint['state_dict'],
                                           strict=False)
                if checkpoint['opt_name'] == args.optimizer:
                    optimizer.load_state_dict(checkpoint['opt_state'])
                else:
                    print0("cp opt '%s' != current opt '%s'. Skip loading optimizier states" % \
                        (checkpoint['opt_name'], args.optimizer))
                print0("loaded.")
            else:
                print0("!!! no checkpoint found at '{}'".format(
                    args.resume_cp))
                exit(0)
        resume()
    else:
        if args.start_epoch == -1:
            args.start_epoch = 0
    if args.dataset == 'imagenet':
        train_loader, val_loader = create_imagenet_data_loader()
    elif args.dataset == 'tiny-imagenet':
        train_loader, val_loader = create_tiny_imagenet_data_loader()
    elif args.dataset == 'mnist':
        train_loader, val_loader = create_mnist_data_loader(
            normalize_mean_std=False, use_mnist_rot=False)
    elif args.dataset == 'mnist-rot':
        train_loader, val_loader = create_mnist_data_loader(
            normalize_mean_std=False, use_mnist_rot=True)
    # for debugging input that causes nan only
    if args.input_tensor:
        input_tensor = torch.load(args.input_tensor)
        model.eval()
        output = model(input_tensor)
    if args.eval_only:
        cp_filename = os.path.basename(args.resume_cp)
        validate(val_loader, model, criterion, cp_filename)
        return
    # Checkpoint directory signature: dataset (+ '-geo' when geometric
    # augmentation is on) + arch + date.
    args.cp_sig = args.dataset
    if args.do_geo_aug:
        args.cp_sig += '-geo'
    today = date.today()
    today_str = today.strftime("%m%d")
    args.save_dir = os.path.join(
        os.environ['IMAGENET'], 'reflectnet',
        "%s-%s-%s" % (args.arch, args.cp_sig, today_str))
    if args.local_rank == 0:
        # Only rank 0 creates the directory and writes checkpoints.
        if not os.path.isdir(args.save_dir):
            os.makedirs(args.save_dir)
        print("Models will be saved to '%s'" % args.save_dir)
    global board
    # start a new board
    board = SummaryWriter()
    for epoch in range(args.start_epoch, args.epochs):
        #if args.distributed:
        #    train_sampler.set_epoch(epoch)
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)
        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion, "Epoch %d" % epoch,
                         epoch)
        # remember best prec@1 and save checkpoint
        if args.local_rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': real_model.state_dict(),
                    'best_prec1': best_prec1,
                    'opt_state': optimizer.state_dict(),
                    'opt_name': args.optimizer
                },
                is_best,
                filename='%d.pth' % epoch)
            # NOTE(review): grouping of maxweight under the rank-0 branch is
            # inferred from the mangled source — confirm intended scope.
            maxweight(real_model)
class Solver(object): def __init__(self, train_data, validation_data, test_data, model, optimizer, args): self.train_data = train_data self.validation_data = validation_data self.test_data = test_data self.args = args self.amp = amp self.ae_loss = nn.CrossEntropyLoss() self.print = False if (self.args.distributed and self.args.local_rank == 0) or not self.args.distributed: self.print = True if self.args.use_tensorboard: self.writer = SummaryWriter('logs/%s/tensorboard/' % args.log_name) self.model, self.optimizer = self.amp.initialize( model, optimizer, opt_level=args.opt_level, patch_torch_functions=args.patch_torch_functions) if self.args.distributed: self.model = DDP(self.model) self._reset() def _reset(self): self.halving = False if self.args.continue_from: checkpoint = torch.load('logs/%s/model_dict.pt' % self.args.continue_from, map_location='cpu') self.model.load_state_dict(checkpoint['model']) self.optimizer.load_state_dict(checkpoint['optimizer']) self.amp.load_state_dict(checkpoint['amp']) self.start_epoch = checkpoint['epoch'] self.prev_val_loss = checkpoint['prev_val_loss'] self.best_val_loss = checkpoint['best_val_loss'] self.val_no_impv = checkpoint['val_no_impv'] if self.print: print("Resume training from epoch: {}".format( self.start_epoch)) else: self.prev_val_loss = float("inf") self.best_val_loss = float("inf") self.val_no_impv = 0 self.start_epoch = 1 if self.print: print('Start new training') def train(self): for epoch in range(self.start_epoch, self.args.epochs + 1): self.joint_loss_weight = epoch if self.args.distributed: self.args.train_sampler.set_epoch(epoch) # Train self.model.train() start = time.time() tr_loss, tr_loss_speaker = self._run_one_epoch( data_loader=self.train_data, state='train') reduced_tr_loss = self._reduce_tensor(tr_loss) reduced_tr_loss_speaker = self._reduce_tensor(tr_loss_speaker) if self.print: print('Train Summary | End of Epoch {0} | Time {1:.2f}s | ' 'Train Loss {2:.3f}'.format(epoch, time.time() - start, 
reduced_tr_loss)) # Validation self.model.eval() start = time.time() with torch.no_grad(): val_loss, val_loss_speaker = self._run_one_epoch( data_loader=self.validation_data, state='val') reduced_val_loss = self._reduce_tensor(val_loss) reduced_val_loss_speaker = self._reduce_tensor( val_loss_speaker) if self.print: print('Valid Summary | End of Epoch {0} | Time {1:.2f}s | ' 'Valid Loss {2:.3f}'.format(epoch, time.time() - start, reduced_val_loss)) # test self.model.eval() start = time.time() with torch.no_grad(): test_loss, _ = self._run_one_epoch(data_loader=self.test_data, state='test') reduced_test_loss = self._reduce_tensor(test_loss) if self.print: print('Test Summary | End of Epoch {0} | Time {1:.2f}s | ' 'Test Loss {2:.3f}'.format(epoch, time.time() - start, reduced_test_loss)) # Check whether to adjust learning rate and early stop if reduced_val_loss >= self.prev_val_loss: self.val_no_impv += 1 if self.val_no_impv >= 3: self.halving = True if self.val_no_impv >= 6: if self.print: print("No imporvement for 6 epochs, early stopping.") break else: self.val_no_impv = 0 # Halfing the learning rate if self.halving: optim_state = self.optimizer.state_dict() optim_state['param_groups'][0][ 'lr'] = optim_state['param_groups'][0]['lr'] / 2 self.optimizer.load_state_dict(optim_state) if self.print: print('Learning rate adjusted to: {lr:.6f}'.format( lr=optim_state['param_groups'][0]['lr'])) self.halving = False self.prev_val_loss = reduced_val_loss if self.print: # Tensorboard logging if self.args.use_tensorboard: self.writer.add_scalar('Train loss', reduced_tr_loss, epoch) self.writer.add_scalar('Validation loss', reduced_val_loss, epoch) self.writer.add_scalar('Test loss', reduced_test_loss, epoch) self.writer.add_scalar('Validation loss speaker', reduced_val_loss_speaker, epoch) self.writer.add_scalar('Train loss speaker', reduced_tr_loss_speaker, epoch) # Save model if reduced_val_loss < self.best_val_loss: self.best_val_loss = reduced_val_loss checkpoint = { 
'model': self.model.state_dict(), 'optimizer': self.optimizer.state_dict(), 'amp': self.amp.state_dict(), 'epoch': epoch + 1, 'prev_val_loss': self.prev_val_loss, 'best_val_loss': self.best_val_loss, 'val_no_impv': self.val_no_impv } torch.save(checkpoint, "logs/" + self.args.log_name + "/model_dict.pt") print("Fund new best model, dict saved") def _run_one_epoch(self, data_loader, state): total_loss = 0 total_loss_speaker = 0 # total_acc_0=0 # total_acc_1=0 # total_acc_2=0 # total_acc_3=0 speaker_loss = 0 for i, (a_mix, a_tgt, v_tgt, speaker) in enumerate(data_loader): a_mix = a_mix.cuda().squeeze().float() a_tgt = a_tgt.cuda().squeeze().float() v_tgt = v_tgt.cuda().squeeze().float() speaker = speaker.cuda().squeeze() est_speaker, est_a_tgt = self.model(a_mix, v_tgt) max_snr = cal_SISNR(a_tgt, est_a_tgt) if state != 'test': sisnr_loss = 0 - torch.mean(max_snr) speaker_loss = self.ae_loss(est_speaker[0], speaker) + \ self.ae_loss(est_speaker[1], speaker) + \ self.ae_loss(est_speaker[2], speaker) + \ self.ae_loss(est_speaker[3], speaker) loss = sisnr_loss + 0.1 * speaker_loss #*np.power(0.96,self.joint_loss_weight-1) # total_acc_0 += self.cal_acc(est_speaker[0],speaker) # total_acc_1 += self.cal_acc(est_speaker[1],speaker) # total_acc_2 += self.cal_acc(est_speaker[2],speaker) # total_acc_3 += self.cal_acc(est_speaker[3],speaker) if state == 'train': self.optimizer.zero_grad() with self.amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( self.amp.master_params(self.optimizer), self.args.max_norm) self.optimizer.step() # if state == 'val': # loss = sisnr_loss else: loss = 0 - torch.mean(max_snr[::self.args.C]) total_loss += loss.data total_loss_speaker += speaker_loss # print("speaker recognition acc: %s"%str(total_acc_0/(i+1)*100)) # print("speaker recognition acc: %s"%str(total_acc_1/(i+1)*100)) # print("speaker recognition acc: %s"%str(total_acc_2/(i+1)*100)) # print("speaker recognition acc: 
%s"%str(total_acc_3/(i+1)*100)) return total_loss / (i + 1), total_loss_speaker / (i + 1) def _reduce_tensor(self, tensor): if not self.args.distributed: return tensor rt = tensor.clone() dist.all_reduce(rt, op=dist.ReduceOp.SUM) rt /= self.args.world_size return rt def cal_acc(self, output, target): pred = output.argmax(dim=1, keepdim=False) correct = 0 total = 0 for i in range(target.shape[0]): total += 1 if (pred[i] == target[i]): correct += 1 return correct / total
def main():
    """Multi-task ViLBERT + GPT2 training entry point.

    Parses CLI arguments, loads per-task configs from vlbert_tasks.yml,
    builds the (possibly distributed) model, sets up per-parameter-group
    optimization and an LR schedule, then runs the train/validation loop,
    saving a checkpoint per epoch on the rank-0 process.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="save",
        type=str,
        help="The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        help="The config file which specified the model details.",
    )
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--num_train_epochs",
        default=20,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help="Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="random seed for initialization")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumualte before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument("--num_workers",
                        type=int,
                        default=16,
                        help="Number of workers in the dataloader.")
    parser.add_argument(
        "--save_name",
        default='',
        type=str,
        help="save name for training.",
    )
    parser.add_argument("--use_chunk",
                        default=0,
                        type=float,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--in_memory",
                        default=False,
                        type=bool,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--optimizer",
                        default='BertAdam',
                        type=str,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--tasks",
                        default='',
                        type=str,
                        help="1-2-3... training task separate by -")
    parser.add_argument(
        "--freeze",
        default=-1,
        type=int,
        help="till which layer of textual stream of vilbert need to fixed.")
    parser.add_argument("--vision_scratch",
                        action="store_true",
                        help="whether pre-trained the image or not.")
    parser.add_argument("--evaluation_interval",
                        default=1,
                        type=int,
                        help="evaluate very n epoch.")
    parser.add_argument("--lr_scheduler",
                        default='mannul',
                        type=str,
                        help="whether use learning rate scheduler.")
    parser.add_argument("--baseline",
                        action="store_true",
                        help="whether use single stream baseline.")
    parser.add_argument("--compact",
                        action="store_true",
                        help="whether use compact vilbert model.")
    parser.add_argument("--debug",
                        action="store_true",
                        help="whether in debug mode.")
    parser.add_argument(
        "--tensorboard_dir",
        default="tensorboard_log",
        type=str,
        help="The output directory where tensorboard log will be written.",
    )
    parser.add_argument(
        "--batch_size",
        default=-1,
        type=int,
        help="Custom Batch size for task.",
    )
    parser.add_argument(
        "--data_root",
        default="",
        type=str,
        help="The data root of the task.",
    )
    args = parser.parse_args()

    # Per-task configuration (names, lrs, dataset paths, batch sizes).
    # NOTE(review): yaml.load without an explicit Loader is deprecated and
    # unsafe on untrusted files -- consider yaml.safe_load.
    with open('vlbert_tasks.yml', 'r') as f:
        task_cfg = edict(yaml.load(f))

    # random.seed(args.seed)
    # np.random.seed(args.seed)
    # torch.manual_seed(args.seed)

    # Select the model family; the chosen module provides BertConfig plus
    # the task heads used below.
    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from vilbert.basebert import BaseBertForVLTasks
    elif args.compact:
        from vilbert.vilbert_compact import BertConfig
        from vilbert.vilbert_compact import VILBertForVLTasks
    else:
        from vilbert.vilbert import BertConfig
        from vilbert.vilbert import VILBertForVLTasks

    # Collect the selected tasks' names and learning rates.
    task_names = []
    task_lr = []
    for i, task_id in enumerate(args.tasks.split('-')):
        task = 'TASK' + task_id
        name = task_cfg[task]['name']
        task_names.append(name)
        task_lr.append(task_cfg[task]['lr'])

    # Per-task loss scaling relative to the smallest task lr.
    base_lr = min(task_lr)
    loss_scale = {}
    for i, task_id in enumerate(args.tasks.split('-')):
        task = 'TASK' + task_id
        loss_scale[task] = task_lr[i] / base_lr

    if args.save_name:
        prefix = '-' + args.save_name
    else:
        prefix = ''
    # Run identifier: task names + config-file stem + optional save-name.
    timeStamp = '-'.join(task_names) + '_' + args.config_file.split(
        '/')[1].split('.')[0] + prefix
    savePath = os.path.join(args.output_dir, timeStamp)
    logPath = os.path.join(args.tensorboard_dir, timeStamp)
    # removes everything in that directory
    if os.path.isdir(logPath):
        logger.error('Tensorboard Log path exists. Overwriting.')

    bert_weight_name = json.load(
        open("config/" + args.bert_model + "_weight_name.json", "r"))

    # Device setup: single-process (CPU or all visible GPUs) vs one GPU per
    # process with NCCL for distributed runs.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    # default_gpu: True only on rank 0 (or in non-distributed runs); gates
    # filesystem writes, logging and checkpointing.
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        if not os.path.exists(savePath):
            os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)
    if default_gpu:
        # save all the hidden parameters.
        with open(os.path.join(savePath, 'command.txt'), 'w') as f:
            print(args, file=f)  # Python 3.x
            print('\n', file=f)
            print(config, file=f)

    # CLI overrides applied to every selected task's config.
    if args.batch_size != -1:
        for i, task_id in enumerate(args.tasks.split('-')):
            task = 'TASK' + task_id
            task_cfg[task]['batch_size'] = args.batch_size

    if args.data_root != "":
        # Re-root all dataset paths under args.data_root (keeps basenames).
        for i, task_id in enumerate(args.tasks.split('-')):
            data_root = args.data_root
            task = 'TASK' + task_id
            task_cfg[task]['dataroot'] = data_root
            task_cfg[task]['features_h5path1'] = os.path.join(
                data_root, task_cfg[task]['features_h5path1'].split('/')[-1])
            task_cfg[task]['features_h5path2'] = os.path.join(
                data_root, task_cfg[task]['features_h5path2'].split('/')[-1])
            task_cfg[task]['train_annotations_jsonpath'] = os.path.join(
                data_root,
                task_cfg[task]['train_annotations_jsonpath'].split('/')[-1])
            task_cfg[task]['val_annotations_jsonpath'] = os.path.join(
                data_root,
                task_cfg[task]['val_annotations_jsonpath'].split('/')[-1])

    # Done it for VCR Dataset only, need to put this train_100.jsonl for other datasets
    if args.debug:
        # Debug mode: tiny 100-example splits and batch size 4.
        for i, task_id in enumerate(args.tasks.split('-')):
            task = 'TASK' + task_id
            task_cfg[task]['train_annotations_jsonpath'] = '/'.join(
                task_cfg[task]['train_annotations_jsonpath'].split('/')[:-1] +
                ['train_100.jsonl'])
            task_cfg[task]['val_annotations_jsonpath'] = '/'.join(
                task_cfg[task]['val_annotations_jsonpath'].split('/')[:-1] +
                ['val_100.jsonl'])
            task_cfg[task]['batch_size'] = 4

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Barrier to make sure only the first process in distributed training download model & vocab

    gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)

    # Have added args.debug to only VCR Datasets (vcr_dataset.py) will need to add it to other dataset too.
    task_batch_size, task_num_iters, task_ids, task_datasets_train, task_datasets_val, \
        task_dataloader_train, task_dataloader_val = LoadDatasets(
            args, task_cfg, gpt2_tokenizer, args.tasks.split('-'), args.debug)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # End of barrier to make sure only the first process in distributed training download model & vocab

    tbLogger = utils.tbLogger(logPath, savePath, task_names, task_ids,
                              task_num_iters,
                              args.gradient_accumulation_steps)

    # if n_gpu > 0:
    # torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Total optimizer steps, sized by the largest task's iteration count.
    num_train_optimization_steps = max(task_num_iters.values(
    )) * args.num_train_epochs // args.gradient_accumulation_steps
    num_labels = max(
        [dataset.num_labels for dataset in task_datasets_train.values()])

    # Per-task start iteration / interval within the global schedule.
    # NOTE(review): inside this loop `task` is NOT recomputed from task_id --
    # it still holds the last value assigned in the loops above, so every
    # task reads the same `task_cfg[task]['num_epoch']`. Looks like a bug;
    # confirm intended behavior before relying on multi-task schedules.
    task_start_iter = {}
    task_interval = {}
    for task_id, num_iter in task_num_iters.items():
        task_start_iter[task_id] = num_train_optimization_steps - (
            task_cfg[task]['num_epoch'] * num_iter //
            args.gradient_accumulation_steps)
        task_interval[task_id] = num_train_optimization_steps // (
            task_cfg[task]['num_epoch'] * num_iter //
            args.gradient_accumulation_steps)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Barrier to make sure only the first process in distributed training download model & vocab

    if args.baseline:
        vil_model = BaseBertForVLTasks.from_pretrained(
            args.from_pretrained,
            config,
            num_labels=num_labels,
            default_gpu=default_gpu)
    else:
        vil_model = VILBertForVLTasks.from_pretrained(
            args.from_pretrained,
            config,
            num_labels=num_labels,
            default_gpu=default_gpu)

    model = ViLBertGPT2(vil_model,
                        gpt2_tokenizer,
                        gpt2_embed_dim=768,
                        config=config)
    model.to(device)
    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # End of barrier to make sure only the first process in distributed training download model & vocab

    task_losses = LoadLosses(args, task_cfg, args.tasks.split('-'))

    # Wrap for multi-GPU: apex DDP for distributed, DataParallel otherwise.
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    if args.freeze != -1:
        # Freeze BERT embeddings and encoder layers up to args.freeze.
        # key[12:] strips the wrapper prefix to match the names in the
        # weight-name json -- presumably "model.module." (TODO confirm).
        bert_weight_name_filtered = []
        for name in bert_weight_name:
            if 'embeddings' in name:
                bert_weight_name_filtered.append(name)
            elif 'encoder' in name:
                layer_num = name.split('.')[2]
                if int(layer_num) <= args.freeze:
                    bert_weight_name_filtered.append(name)
        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False
        if default_gpu:
            print("filtered weight")
            print(bert_weight_name_filtered)

    # One optimizer group per trainable parameter: task heads (and
    # from-scratch vision weights) get lr 1e-4, the rest args.learning_rate.
    optimizer_grouped_parameters = []
    lr = args.learning_rate
    for key, value in dict(model.named_parameters()).items():
        if value.requires_grad:
            if 'vil_prediction' in key:
                # if args.learning_rate <= 2e-5:
                lr = 1e-4
            else:
                if args.vision_scratch:
                    if key[12:] in bert_weight_name:
                        lr = args.learning_rate
                    else:
                        lr = 1e-4
                else:
                    lr = args.learning_rate
            # NOTE(review): this assigns weight_decay 0.01 to the no_decay
            # names (biases, LayerNorm) and 0.0 to everything else -- the
            # opposite of the usual BERT recipe. Confirm intent.
            if any(nd in key for nd in no_decay):
                optimizer_grouped_parameters += [{
                    "params": [value],
                    "lr": lr,
                    "weight_decay": 0.01
                }]
            if not any(nd in key for nd in no_decay):
                optimizer_grouped_parameters += [{
                    "params": [value],
                    "lr": lr,
                    "weight_decay": 0.0
                }]
    if default_gpu:
        print(len(list(model.named_parameters())),
              len(optimizer_grouped_parameters))

    max_num_iter = max(task_num_iters.values())
    max_batch_size = max(task_batch_size.values())

    if args.optimizer == 'BertAdam':
        optimizer = BertAdam(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps,
            schedule='warmup_constant',
        )
    elif args.optimizer == 'Adam':
        optimizer = Adam(
            optimizer_grouped_parameters,
            lr=base_lr,
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps,
            schedule='warmup_constant',
        )
    elif args.optimizer == 'Adamax':
        optimizer = Adamax(
            optimizer_grouped_parameters,
            lr=base_lr,
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps,
            schedule='warmup_constant',
        )

    if args.lr_scheduler == 'automatic':
        # Plateau scheduler driven by validation score (mode='max').
        lr_scheduler = ReduceLROnPlateau(optimizer, \
                        mode='max',
                        factor=0.2,
                        patience=1,
                        cooldown=1,
                        threshold=0.001)
    elif args.lr_scheduler == 'mannul':
        # Manual step decay: x0.1 at epochs 12 and 16.
        lr_reduce_list = np.array([12, 16])
        # lr_reduce_list = np.array([6, 8, 10])

        def lr_lambda_fun(epoch):
            return pow(0.1, np.sum(lr_reduce_list <= epoch))

        lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda_fun)

    if default_gpu:
        print("***** Running training *****")
        print(" Num Iters: ", task_num_iters)
        print(" Batch size: ", task_batch_size)
        print(" Num steps: %d" % num_train_optimization_steps)

    startIterID = 0
    # TODO
    # initialize the data iteration.
    task_iter_train = {name: None for name in task_ids}
    task_count = {name: 0 for name in task_ids}
    for epochId in tqdm(range(args.num_train_epochs), desc="Epoch"):
        model.train()
        freeze = -1
        for step in range(max_num_iter):
            iterId = startIterID + step + (epochId * max_num_iter)
            for task_id in task_ids:
                # A task only participates once the schedule reaches its
                # start iteration.
                if iterId >= task_start_iter[task_id]:
                    # if iterId % task_interval[task_id] == 0:
                    loss_vl, gpt2_loss, score = ForwardModelsTrain(
                        args,
                        task_cfg,
                        device,
                        task_id,
                        task_count,
                        task_iter_train,
                        task_dataloader_train,
                        model,
                        task_losses,
                        task_start_iter,
                        freeze=freeze)
                    loss = loss_vl + gpt2_loss
                    loss = loss * loss_scale[task_id]
                    loss_vl = loss_vl * loss_scale[task_id]
                    gpt2_loss = gpt2_loss * loss_scale[task_id]
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                        loss_vl = loss_vl / args.gradient_accumulation_steps
                        gpt2_loss = gpt2_loss / args.gradient_accumulation_steps
                    loss.backward()
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        optimizer.step()
                        model.zero_grad()
                        if default_gpu:
                            tbLogger.step_train(epochId, iterId, float(loss),
                                                float(loss_vl),
                                                float(gpt2_loss), float(score),
                                                optimizer.show_lr(), task_id,
                                                'train')
            # Alternate the freeze flag (-1/+1) every step.
            freeze = freeze * -1
            if step % (20 * args.gradient_accumulation_steps
                       ) == 0 and step != 0 and default_gpu:
                tbLogger.showLossTrain()

        model.eval()
        # when run evaluate, we run each task sequentially.
        for task_id in task_ids:
            # Generate (and BLEU-score) on ~10% of validation batches.
            num_batch_10 = int(0.1 * len(task_dataloader_val[task_id]))
            if args.debug:
                num_batch_10 = 1
            for i, batch in enumerate(task_dataloader_val[task_id]):
                # generate
                if i % num_batch_10 == 0:
                    generate = True
                    loss_vl, gpt2_loss, score, batch_size, bleu_score = ForwardModelsVal(
                        args,
                        task_cfg,
                        device,
                        task_id,
                        batch,
                        model,
                        task_losses,
                        generate=generate)
                else:
                    generate = False
                    # NOTE(review): this branch does not update bleu_score,
                    # so step_val below logs the stale value from the last
                    # generate=True batch. Confirm intended.
                    loss_vl, gpt2_loss, score, batch_size = ForwardModelsVal(
                        args,
                        task_cfg,
                        device,
                        task_id,
                        batch,
                        model,
                        task_losses,
                        generate=generate)
                loss = loss_vl + gpt2_loss
                tbLogger.step_val(epochId, float(loss), float(loss_vl),
                                  float(gpt2_loss), float(score), bleu_score,
                                  task_id, batch_size, 'val')
                if default_gpu:
                    sys.stdout.write('%d/%d\r' %
                                     (i, len(task_dataloader_val[task_id])))
                    sys.stdout.flush()

        ave_score = tbLogger.showLossVal()
        if args.lr_scheduler == 'automatic':
            lr_scheduler.step(ave_score)
            logger.info("best average score is %3f" % lr_scheduler.best)
        else:
            lr_scheduler.step()

        if default_gpu:
            # Save a trained model
            logger.info("** ** * Saving fine - tuned model on " + logPath +
                        "** ** * ")
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Only save the model it-self
            if not os.path.exists(savePath):
                os.makedirs(savePath)
            output_model_file = os.path.join(
                savePath, "pytorch_model_" + str(epochId) + ".bin")
            torch.save(model_to_save.state_dict(), output_model_file)

    tbLogger.txt_close()
class DDPTrainer():
    """Main class for data parallel training.

    This class supports data parallel training, where multiple workers each
    have a full model replica and gradients are accumulated synchronously via
    torch.distributed.all_reduce.
    """

    def __init__(self, args, model):
        if not torch.cuda.is_available():
            raise NotImplementedError('Training on CPU is not supported')
        self.args = args
        self.model = model.cuda()
        # Loss is looked up by name from the registry and moved to GPU.
        self.criterion = CRITERION_REGISTRY[args.criterion](args).cuda()
        self.optimizer = optim.build_optimizer(self.args,
                                               self.model.parameters())
        self.lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        # AMP grad scaler; a no-op when args.amp is False.
        self.scaler = amp.GradScaler(enabled=self.args.amp, init_scale=2**15)
        if self.args.distributed_world_size > 1:
            self.model = DDP(model)
        # Per-key lists of stats buffered between parameter updates
        # (supports gradient accumulation across train_step calls).
        self._buffered_stats = defaultdict(lambda: [])
        self._num_updates = 0
        self._optim_history = None
        self.throughput_meter = TimeMeter()
        self.avg_loss_meter = AverageMeter()

    def save_checkpoint(self, filename, extra_state):
        """Save all training state in a checkpoint file."""
        if distributed_utils.is_master(self.args):  # only save one checkpoint
            utils.save_state(
                filename,
                self.args,
                self.get_model(),
                self.criterion,
                self.optimizer,
                self.lr_scheduler,
                self._num_updates,
                self._optim_history,
                extra_state,
            )

    def load_checkpoint(self, filename, load_optim=True):
        """Load all training state from a checkpoint file.

        Optimizer/scheduler state is restored only when load_optim is True
        and the checkpointed criterion and optimizer classes match the
        current ones. Returns the checkpoint's extra_state (or None).
        """
        extra_state, optim_history, last_optim_state = \
            utils.load_model_state(filename, self.get_model())

        if last_optim_state is not None:
            # rebuild optimizer after loading model, since params may have changed
            #self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
            self.lr_scheduler = lr_scheduler.build_lr_scheduler(
                self.args, self.optimizer)

            if load_optim:
                self._optim_history = optim_history
                # only reload optimizer and lr_scheduler if they match
                last_optim = self._optim_history[-1]
                if last_optim[
                        'criterion_name'] == self.criterion.__class__.__name__:
                    self.lr_scheduler.load_state_dict(
                        last_optim['lr_scheduler_state'])
                    if last_optim[
                            'optimizer_name'] == self.optimizer.__class__.__name__:
                        self.optimizer.load_state_dict(last_optim_state)

                self._num_updates = last_optim['num_updates']

        return extra_state

    def train_step(self, sample, update_params=True, last_step=False):
        """Do forward, backward and parameter update."""
        # Set seed based on args.seed and the update number so that we get
        # reproducible results when resuming from checkpoints
        seed = self.args.seed + self.get_num_updates()
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.model.train()
        if isinstance(self.model, DDP):
            if last_step:
                # Skip allreduce on the final (possibly uneven) step --
                # some workers may have no batch.
                self.model.disable_allreduce()
            else:
                self.model.enable_allreduce()

        # forward and backward pass
        sample = self._prepare_sample(sample)
        loss, oom_fwd = self._forward(sample)

        # If this is a last batch forward pass is skipped on some workers
        # Batch with sample_size 0 is not accounted for in weighted loss
        logging_output = {
            'ntokens': sample['ntokens'] if sample is not None else 0,
            'nsentences': sample['target'].size(0) if sample is not None else 0,
            'loss': utils.item(loss.data) if loss is not None else 0,
        }
        sample_size = sample['ntokens'] if sample is not None else 0
        oom_bwd = self._backward(loss)

        # buffer stats and logging outputs
        self._buffered_stats['sample_sizes'].append(sample_size)
        self._buffered_stats['logging_outputs'].append(logging_output)
        self._buffered_stats['ooms_fwd'].append(oom_fwd)
        self._buffered_stats['ooms_bwd'].append(oom_bwd)

        # update parameters
        if update_params and not last_step:
            # gather logging outputs from all replicas
            sample_sizes = self._buffered_stats['sample_sizes']
            logging_outputs = self._buffered_stats['logging_outputs']
            ooms_fwd = self._buffered_stats['ooms_fwd']
            ooms_bwd = self._buffered_stats['ooms_bwd']
            if self.args.distributed_world_size > 1:
                # Flatten the per-worker buffered lists into single lists.
                sample_sizes, logging_outputs, ooms_fwd, ooms_bwd = map(
                    lambda l: list(chain.from_iterable(l)),
                    zip(*distributed_utils.all_gather_list((sample_sizes,
                                                            logging_outputs,
                                                            ooms_fwd,
                                                            ooms_bwd))))
            ooms_fwd = sum(ooms_fwd)
            ooms_bwd = sum(ooms_bwd)
            ooms = ooms_fwd + ooms_bwd  # this is always <= distributed_world_size

            if ooms == self.args.distributed_world_size:
                print('| WARNING: OOM in all workers, skipping batch')
                self.zero_grad()
                return

            # aggregate stats and logging outputs
            grad_denom = sum(sample_sizes)
            # Normalize accumulated gradients by the total token count.
            for p in self.model.parameters():
                if p.requires_grad and p.grad is not None:
                    p.grad /= grad_denom

            self._opt()

            # Handle logging
            ntokens = sum(log.get('ntokens', 0) for log in logging_outputs)
            self.throughput_meter.update(ntokens)
            info_log_data = {
                'tokens/s':
                self.throughput_meter.avg,
                'tokens':
                ntokens,
                'loss':
                sum(log.get('loss', 0)
                    for log in logging_outputs) / ntokens / math.log(2)
            }
            self.avg_loss_meter.update(info_log_data['loss'])
            debug_log_data = {
                'batch_size':
                sum(log.get('nsentences', 0) for log in logging_outputs),
                'lr':
                self.get_lr(),
                'grad_denom':
                grad_denom,
                'updates':
                1
            }

            DLLogger.log(step=self._num_updates,
                         data=info_log_data,
                         verbosity=0)
            DLLogger.log(step=self._num_updates,
                         data=debug_log_data,
                         verbosity=1)

            self.clear_buffered_stats()

    def _forward(self, sample):
        """Run the model forward; returns (loss or None, oom flag 0/1)."""
        loss = None
        oom = 0
        try:
            if sample is not None:
                with amp.autocast(enabled=self.args.amp):
                    # calculate loss and sample size
                    logits, _ = self.model(**sample['net_input'])
                    target = sample['target']
                    probs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
                    loss = self.criterion(probs, target)
        except RuntimeError as e:
            if 'out of memory' in str(e):
                # NOTE(review): force= is not a builtin print kwarg --
                # presumably a fairseq-style print override; confirm.
                print(
                    '| WARNING: ran out of memory in worker {}, skipping batch'
                    .format(self.args.distributed_rank),
                    force=True)
                oom = 1
                loss = None
            else:
                raise e
        return loss, oom

    def _backward(self, loss):
        """Backward through the scaled loss; returns oom flag 0/1."""
        oom = 0
        if loss is not None:
            try:
                self.scaler.scale(loss).backward()
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print(
                        '| WARNING: ran out of memory in worker {}, skipping batch'
                        .format(self.args.distributed_rank),
                        force=True)
                    oom = 1
                    self.zero_grad()
                else:
                    raise e
        return oom

    def _opt(self):
        # take an optimization step
        self.scaler.step(self.optimizer.optimizer)
        self.scaler.update()
        self.zero_grad()
        self._num_updates += 1

        # update learning rate
        self.lr_scheduler.step_update(self._num_updates)

    def valid_step(self, sample):
        """Do forward pass in evaluation mode."""
        self.model.eval()
        # forward pass
        sample = self._prepare_sample(sample)
        with torch.no_grad():
            loss, oom_fwd = self._forward(sample)
        logging_output = {
            'ntokens': sample['ntokens'] if sample is not None else 0,
            'nsentences': sample['target'].size(0) if sample is not None else 0,
        }
        loss = loss.item() if loss is not None else 0
        assert not oom_fwd, 'Ran out of memory during validation'

        # gather logging outputs from all GPUs
        if self.args.distributed_world_size > 1:
            losses, logging_outputs = zip(
                *distributed_utils.all_gather_list((loss, logging_output)))
        else:
            losses = [loss]
            logging_outputs = [logging_output]

        # Token-weighted loss in bits (divide by log 2).
        weight = sum(log.get('ntokens', 0) for log in logging_outputs)
        scaled_loss = sum(losses) / weight / math.log(2)

        return scaled_loss

    def dummy_train_step(self, dummy_batch):
        """Dummy training step for warming caching allocator."""
        self.train_step(dummy_batch, update_params=False)
        self.zero_grad()
        self.clear_buffered_stats()

    def zero_grad(self):
        """Clear gradients on all model parameters."""
        self.optimizer.zero_grad()

    def clear_buffered_stats(self):
        """Drop all stats buffered since the last parameter update."""
        self._buffered_stats.clear()

    def lr_step(self, epoch, val_loss=None):
        """Adjust the learning rate based on the validation loss."""
        return self.lr_scheduler.step(epoch, val_loss)

    def lr_step_update(self, num_updates):
        """Update the learning rate after each update."""
        return self.lr_scheduler.step_update(num_updates)

    def get_lr(self):
        """Get the current learning rate."""
        return self.optimizer.get_lr()

    def get_throughput_meter(self):
        """Get the throughput meter"""
        return self.throughput_meter

    def get_model(self):
        """Get the model replica."""
        return self.model.module if isinstance(self.model, DDP) else self.model

    def get_num_updates(self):
        """Get the number of parameters updates."""
        return self._num_updates

    def _prepare_sample(self, sample):
        """Move a batch to GPU; returns None for an empty/missing batch."""
        if not sample:
            return None
        return utils.move_to_cuda(sample)
def main():
    """Fine-tune and/or evaluate a BERT sequence classifier on a GLUE-style task.

    Command-line driven. Parses arguments, sets up single-GPU, multi-GPU
    (DataParallel) or distributed (apex DDP) execution, optionally fp16,
    trains with BertAdam (or apex FusedAdam + FP16_Optimizer), saves the
    fine-tuned model/config into --output_dir, then evaluates on the dev set
    and writes eval_results.txt. For MNLI it additionally evaluates the
    mismatched dev set into <output_dir>-MM.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Task registry: processor builds examples, output_mode selects the loss.
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mnli-mm": MnliMismatchedProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
        "adlhw2": MyTaskProcessor
    }

    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        # FIX: "mnli-mm" was missing even though it is a registered processor,
        # so running with --task_name mnli-mm raised a KeyError below.
        "mnli-mm": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
        "adlhw2": "classification"
    }

    # Device setup: plain CUDA/CPU for single-process runs; one GPU per
    # process (plus NCCL process group) when --local_rank is set.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The per-step micro-batch; the effective batch stays at the user's value.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        # FIX: keep the optimizer step count an int. num_train_epochs is a
        # float, so the original expression produced a float that was then
        # passed as BertAdam's t_total and logged with %d.
        num_train_optimization_steps = int(
            int(
                len(train_examples) / args.train_batch_size /
                args.gradient_accumulation_steps) * args.num_train_epochs)
        if args.local_rank != -1:
            # Each worker only performs its share of the updates.
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: no weight decay on biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        # BertAdam handles the linear warmup schedule internally.
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)

        # Labels are class indices for classification, floats for regression.
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model(input_ids, segment_ids, input_mask, labels=None)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels),
                                    label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        config = BertConfig(output_config_file)
        model = BertForSequenceClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    else:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
    model.to(device)

    # Only rank 0 (or single-process runs) performs evaluation.
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer, output_mode)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask, labels=None)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                         label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            # Accumulate logits across batches into a single array.
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
            else:
                preds[0] = np.append(preds[0],
                                     logits.detach().cpu().numpy(),
                                     axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, all_label_ids.numpy())
        loss = tr_loss / nb_tr_steps if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # hack for MNLI-MM: re-run eval on the mismatched dev set.
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir +
                              '-MM') and os.listdir(args.output_dir +
                                                    '-MM') and args.do_train:
                # FIX: report the directory that was actually checked
                # (<output_dir>-MM), not the base output_dir.
                raise ValueError(
                    "Output directory ({}) already exists and is not empty.".
                    format(args.output_dir + '-MM'))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer,
                output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []

            for input_ids, input_mask, segment_ids, label_ids in tqdm(
                    eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids,
                                   segment_ids,
                                   input_mask,
                                   labels=None)

                # MNLI-MM is always classification.
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                         label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(preds[0],
                                         logits.detach().cpu().numpy(),
                                         axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, all_label_ids.numpy())
            loss = tr_loss / nb_tr_steps if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM',
                                            "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default='../absa_data/twitter', type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default='twitter', type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=64, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--max_entity_length", default=16, type=int, help= "The maximum entity input sequence length after WordPiece tokenization. 
\n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=16, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=8.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--fine_tune_cnn', action='store_true', help='fine tune pre-trained CNN if True') parser.add_argument('--resnet_root', default='./resnet', help='path the pre-trained cnn models') parser.add_argument('--crop_size', type=int, default=224, help='crop size of image') parser.add_argument( '--path_image', default='../pytorch-pretrained-BERT/twitter_subimages/', help='path to images') parser.add_argument( '--mm_model', default='TomBert', help='model name' ) # TomBert, TomBertNoPooling, MBert, MBertNoPooling, ResBert parser.add_argument('--pooling', default='first', help='pooling method') # first, cls, concat parser.add_argument('--bertlayer', action='store_true', help='whether to add another bert layer') parser.add_argument('--tfn', action='store_true', help='whether to use TFN') args = parser.parse_args() print("**************current model: " + args.mm_model + "******************") if args.mm_model == "ResBert" and args.bertlayer: print("add another bert layer") if args.mm_model == "ResBert" and args.tfn: print("add another tfn layer") elif args.mm_model == "TomBert" or args.mm_model == "MBert": print("pooling method: " + args.pooling) print("*" * 50) if args.task_name == "twitter": # this refers to twitter-2017 dataset args.path_image = "../IJCAI2019_data/twitter2017_images/" elif args.task_name == "twitter2015": # this refers to twitter-2015 dataset args.path_image = "../IJCAI2019_data/twitter2015_images/" else: print("The task name is not right!") processors = { "twitter2015": AbmsaProcessor, # our twitter-2015 dataset "twitter": AbmsaProcessor # our twitter-2017 dataset } num_labels_task = { "twitter2015": 3, # our twitter-2015 dataset "twitter": 3 # our twitter-2017 dataset } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = 
torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model if args.mm_model == 'ResBert': model = ResBertForMMSequenceClassification.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_labels=num_labels, bert_layer=args.bertlayer, tfn=args.tfn) 
elif args.mm_model == 'MBert': model = MBertForMMSequenceClassification.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_labels=num_labels, pooling=args.pooling) elif args.mm_model == 'MBertNoPooling': model = MBertNoPoolingForMMSequenceClassification.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_labels=num_labels) elif args.mm_model == 'TomBertNoPooling': model = TomBertNoPoolingForMMSequenceClassification.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_labels=num_labels) else: # TomBert by default model = TomBertForMMSequenceClassification.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_labels=num_labels, pooling=args.pooling) net = getattr(resnet, 'resnet152')() net.load_state_dict( torch.load(os.path.join(args.resnet_root, 'resnet152.pth'))) encoder = myResnet(net, args.fine_tune_cnn, device) if args.fp16: model.half() encoder.half() model.to(device) encoder.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) encoder = DDP(encoder) elif n_gpu > 1: model = torch.nn.DataParallel(model) encoder = torch.nn.DataParallel(encoder) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 nb_tr_steps = 0 tr_loss = 0 output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") output_encoder_file = os.path.join(args.output_dir, "pytorch_encoder.bin") if args.do_train: train_features = convert_mm_examples_to_features( train_examples, label_list, args.max_seq_length, args.max_entity_length, tokenizer, args.crop_size, args.path_image) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_added_input_mask = torch.tensor( [f.added_input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], 
dtype=torch.long) all_s2_input_ids = torch.tensor( [f.s2_input_ids for f in train_features], dtype=torch.long) all_s2_input_mask = torch.tensor( [f.s2_input_mask for f in train_features], dtype=torch.long) all_s2_segment_ids = torch.tensor( [f.s2_segment_ids for f in train_features], dtype=torch.long) all_img_feats = torch.stack([f.img_feat for f in train_features]) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_added_input_mask, all_segment_ids,\ all_s2_input_ids, all_s2_input_mask, all_s2_segment_ids, all_img_feats, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) #''' eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_mm_examples_to_features( eval_examples, label_list, args.max_seq_length, args.max_entity_length, tokenizer, args.crop_size, args.path_image) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_added_input_mask = torch.tensor( [f.added_input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_s2_input_ids = torch.tensor( [f.s2_input_ids for f in eval_features], dtype=torch.long) all_s2_input_mask = torch.tensor( [f.s2_input_mask for f in eval_features], dtype=torch.long) all_s2_segment_ids = torch.tensor( [f.s2_segment_ids for f in eval_features], dtype=torch.long) all_img_feats = torch.stack([f.img_feat for f in eval_features]) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_added_input_mask, all_segment_ids, \ all_s2_input_ids, 
all_s2_input_mask, all_s2_segment_ids,\ all_img_feats, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) max_acc = 0.0 #''' logger.info("*************** Running training ***************") for train_idx in trange(int(args.num_train_epochs), desc="Epoch"): logger.info("********** Epoch: " + str(train_idx) + " **********") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) model.train() encoder.train() encoder.zero_grad() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, added_input_mask, segment_ids, s2_input_ids, s2_input_mask, s2_segment_ids, \ img_feats, label_ids = batch with torch.no_grad(): imgs_f, img_mean, img_att = encoder(img_feats) if train_idx == 0 and step == 0: loss = model(input_ids, s2_input_ids, img_att, segment_ids, s2_segment_ids, input_mask, s2_input_mask, \ added_input_mask, label_ids, True) else: loss = model(input_ids, s2_input_ids, img_att, segment_ids, s2_segment_ids, input_mask, s2_input_mask, \ added_input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 logger.info("***** Running evaluation on Dev Set*****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) model.eval() encoder.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 true_label_list = [] pred_label_list = [] for input_ids, input_mask, added_input_mask, segment_ids, s2_input_ids, s2_input_mask, s2_segment_ids, \ img_feats, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) added_input_mask = added_input_mask.to(device) segment_ids = segment_ids.to(device) s2_input_ids = s2_input_ids.to(device) s2_input_mask = s2_input_mask.to(device) s2_segment_ids = s2_segment_ids.to(device) img_feats = img_feats.to(device) label_ids = label_ids.to(device) with torch.no_grad(): imgs_f, img_mean, img_att = encoder(img_feats) tmp_eval_loss = model(input_ids, s2_input_ids, img_att, segment_ids, s2_segment_ids, input_mask, s2_input_mask, added_input_mask, label_ids) logits = model(input_ids, s2_input_ids, img_att, segment_ids, s2_segment_ids, input_mask, s2_input_mask, added_input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() true_label_list.append(label_ids) pred_label_list.append(logits) tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples 
+= input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None true_label = np.concatenate(true_label_list) pred_outputs = np.concatenate(pred_label_list) precision, recall, F_score = macro_f1(true_label, pred_outputs) result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'f_score': F_score, 'global_step': global_step, 'loss': loss } logger.info("***** Dev Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) if eval_accuracy >= max_acc: # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self encoder_to_save = encoder.module if hasattr( encoder, 'module') else encoder # Only save the model it-self if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) torch.save(encoder_to_save.state_dict(), output_encoder_file) max_acc = eval_accuracy # Load a trained model that you have fine-tuned model_state_dict = torch.load(output_model_file) if args.mm_model == 'ResBert': model = ResBertForMMSequenceClassification.from_pretrained( args.bert_model, state_dict=model_state_dict, num_labels=num_labels, bert_layer=args.bertlayer, tfn=args.tfn) elif args.mm_model == 'MBert': model = MBertForMMSequenceClassification.from_pretrained( args.bert_model, state_dict=model_state_dict, num_labels=num_labels, pooling=args.pooling) elif args.mm_model == 'MBertNoPooling': model = MBertNoPoolingForMMSequenceClassification.from_pretrained( args.bert_model, state_dict=model_state_dict, num_labels=num_labels) elif args.mm_model == 'TomBertNoPooling': model = TomBertNoPoolingForMMSequenceClassification.from_pretrained( args.bert_model, state_dict=model_state_dict, num_labels=num_labels) else: # TomBert by default model = TomBertForMMSequenceClassification.from_pretrained( args.bert_model, state_dict=model_state_dict, 
num_labels=num_labels, pooling=args.pooling) model.to(device) encoder_state_dict = torch.load(output_encoder_file) encoder.load_state_dict(encoder_state_dict) encoder.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_test_examples(args.data_dir) eval_features = convert_mm_examples_to_features( eval_examples, label_list, args.max_seq_length, args.max_entity_length, tokenizer, args.crop_size, args.path_image) logger.info("***** Running evaluation on Test Set*****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_added_input_mask = torch.tensor( [f.added_input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_s2_input_ids = torch.tensor( [f.s2_input_ids for f in eval_features], dtype=torch.long) all_s2_input_mask = torch.tensor( [f.s2_input_mask for f in eval_features], dtype=torch.long) all_s2_segment_ids = torch.tensor( [f.s2_segment_ids for f in eval_features], dtype=torch.long) all_img_feats = torch.stack([f.img_feat for f in eval_features]) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_added_input_mask, all_segment_ids, \ all_s2_input_ids, all_s2_input_mask, all_s2_segment_ids, all_img_feats, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() encoder.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 true_label_list = [] pred_label_list = [] for input_ids, input_mask, added_input_mask, segment_ids, 
s2_input_ids, s2_input_mask, s2_segment_ids, \ img_feats, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) added_input_mask = added_input_mask.to(device) segment_ids = segment_ids.to(device) s2_input_ids = s2_input_ids.to(device) s2_input_mask = s2_input_mask.to(device) s2_segment_ids = s2_segment_ids.to(device) img_feats = img_feats.to(device) label_ids = label_ids.to(device) with torch.no_grad(): imgs_f, img_mean, img_att = encoder(img_feats) tmp_eval_loss = model(input_ids, s2_input_ids, img_att, segment_ids, s2_segment_ids, input_mask, s2_input_mask, added_input_mask, label_ids) logits = model(input_ids, s2_input_ids, img_att, segment_ids, s2_segment_ids, input_mask, s2_input_mask, added_input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() true_label_list.append(label_ids) pred_label_list.append(logits) tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None true_label = np.concatenate(true_label_list) pred_outputs = np.concatenate(pred_label_list) precision, recall, F_score = macro_f1(true_label, pred_outputs) result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'precision': precision, 'recall': recall, 'f_score': F_score, 'global_step': global_step, 'loss': loss } pred_label = np.argmax(pred_outputs, axis=-1) fout_p = open(os.path.join(args.output_dir, "pred.txt"), 'w') fout_t = open(os.path.join(args.output_dir, "true.txt"), 'w') for i in range(len(pred_label)): attstr = str(pred_label[i]) fout_p.write(attstr + '\n') for i in range(len(true_label)): attstr = str(true_label[i]) fout_t.write(attstr + '\n') fout_p.close() fout_t.close() output_eval_file = 
os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Test Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    """Fine-tune and evaluate a BERT sequence classifier from command-line flags.

    Pipeline: parse args -> configure (optionally distributed) CUDA ->
    optionally train on ``train.csv`` with periodic evaluation on ``dev.csv``
    (best checkpoint saved to ``pytorch_model.bin``) -> optionally reload the
    best checkpoint and predict on ``dev.csv``/``test.csv``, writing class
    probabilities to ``test_pb.csv``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--meta_path",
        default=None,
        type=str,
        required=False,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    # NOTE(review): help text below is a copy-paste of --do_train's; the flag
    # actually gates the prediction block at the bottom of this function.
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--eval_steps", default=-1, type=int, help="")
    parser.add_argument("--lstm_hidden_size", default=300, type=int, help="")
    parser.add_argument("--lstm_layers", default=2, type=int, help="")
    parser.add_argument("--lstm_dropout", default=0.5, type=float, help="")
    parser.add_argument("--train_steps", default=-1, type=int, help="")
    parser.add_argument("--report_steps", default=-1, type=int, help="")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--split_num", default=3, type=int, help="text split")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--device_id',
                        type=str,
                        default='0',
                        help="The CUDA device is for training.")

    args = parser.parse_args()

    # Restrict visible GPUs before any CUDA context is created.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_id

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging (non-master ranks log only warnings to avoid duplicates).
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # NOTE(review): bare except silently swallows any makedirs failure
    # (including permission errors), not just "already exists".
    try:
        os.makedirs(args.output_dir)
    except:
        pass

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=args.do_lower_case)
    config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=2)

    # Prepare model
    # NOTE(review): `args` is passed positionally to from_pretrained —
    # presumably a project-local subclass accepts it; confirm against the
    # model definition.
    model = BertForSequenceClassification.from_pretrained(
        args.model_name_or_path, args, config=config)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Effective batch sizes scale with the number of visible GPUs.
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    if args.do_train:
        # Prepare data loader
        train_examples = read_examples(os.path.join(args.data_dir,
                                                    'train.csv'),
                                       is_training=True)
        train_features = convert_examples_to_features(train_examples,
                                                      tokenizer,
                                                      args.max_seq_length,
                                                      args.split_num, True)
        all_input_ids = torch.tensor(select_field(train_features,
                                                  'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features,
                                                   'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features,
                                                    'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features],
                                 dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        # Loader batch is divided by accumulation steps so the *effective*
        # batch per optimizer update stays at args.train_batch_size.
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size //
                                      args.gradient_accumulation_steps)

        # Training length is step-driven (--train_steps), not epoch-driven.
        num_train_optimization_steps = args.train_steps

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # thus it produce None grad that break apex
        param_optimizer = [n for n in param_optimizer]

        # No weight decay on biases and LayerNorm parameters.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=args.train_steps)

        global_step = 0
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps),
                   total=num_train_optimization_steps)
        # cycle() lets the step-driven loop run past one pass over the data.
        train_dataloader = cycle(train_dataloader)

        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids=input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         labels=label_ids)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            # NOTE(review): args.loss_scale, args.warmup_proportion and
            # warmup_linear are referenced in the fp16 paths below but are not
            # defined by this parser — presumably the fp16 path was never
            # exercised with this script; confirm before enabling --fp16.
            if args.fp16 and args.loss_scale != 1.0:
                loss = loss * args.loss_scale
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            # Running mean of the un-scaled loss since the last report reset.
            train_loss = round(
                tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1),
                4)
            bar.set_description("loss {}".format(train_loss))
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used that handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear.get_lr(
                        global_step, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                # NOTE(review): scheduler.step() is called before
                # optimizer.step(); the conventional order is the reverse —
                # confirm this matches the scheduler version in use.
                scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            # Periodic loss reporting; counters reset each report window.
            if (step + 1) % (args.eval_steps *
                             args.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info(" %s = %s", 'global_step', str(global_step))
                logger.info(" %s = %s", 'train loss', str(train_loss))

            # Periodic evaluation on the dev set; saves the best checkpoint.
            if args.do_eval and (step + 1) % (
                    args.eval_steps * args.gradient_accumulation_steps) == 0:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []
                    eval_examples = read_examples(os.path.join(
                        args.data_dir, file),
                                                  is_training=True)
                    eval_features = convert_examples_to_features(
                        eval_examples, tokenizer, args.max_seq_length,
                        args.split_num, False)
                    all_input_ids = torch.tensor(select_field(
                        eval_features, 'input_ids'),
                                                 dtype=torch.long)
                    all_input_mask = torch.tensor(select_field(
                        eval_features, 'input_mask'),
                                                  dtype=torch.long)
                    all_segment_ids = torch.tensor(select_field(
                        eval_features, 'segment_ids'),
                                                   dtype=torch.long)
                    all_label = torch.tensor([f.label for f in eval_features],
                                             dtype=torch.long)

                    eval_data = TensorDataset(all_input_ids, all_input_mask,
                                              all_segment_ids, all_label)

                    logger.info("***** Running evaluation *****")
                    logger.info(" Num examples = %d", len(eval_examples))
                    logger.info(" Batch size = %d", args.eval_batch_size)

                    # Run prediction for full data
                    eval_sampler = SequentialSampler(eval_data)
                    eval_dataloader = DataLoader(
                        eval_data,
                        sampler=eval_sampler,
                        batch_size=args.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss = model(input_ids=input_ids,
                                                  token_type_ids=segment_ids,
                                                  attention_mask=input_mask,
                                                  labels=label_ids)
                            logits = model(input_ids=input_ids,
                                           token_type_ids=segment_ids,
                                           attention_mask=input_mask)

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    # Switch back to training mode before resuming the loop.
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = accuracy(inference_logits, gold_labels)

                    result = {
                        'eval_loss': eval_loss,
                        'eval_F1': eval_accuracy,
                        'global_step': global_step,
                        'loss': train_loss
                    }

                    # Append this round's metrics to the shared results file.
                    output_eval_file = os.path.join(args.output_dir,
                                                    "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info(" %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')

                    # Keep only the best-scoring checkpoint on disk.
                    if eval_accuracy > best_acc and 'dev' in file:
                        print("=" * 80)
                        print("Best F1", eval_accuracy)
                        print("Saving Model......")
                        best_acc = eval_accuracy
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model,
                            'module') else model  # Only save the model it-self
                        output_model_file = os.path.join(
                            args.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)

    if args.do_test:
        # Free the training model before reloading the best checkpoint.
        del model
        gc.collect()
        args.do_train = False
        model = BertForSequenceClassification.from_pretrained(os.path.join(
            args.output_dir, "pytorch_model.bin"),
                                                              args,
                                                              config=config)
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )
            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=False)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num,
                False)
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(
                        input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask).detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)

            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            print(flag, accuracy(logits, gold_labels))
            if flag == 'test':
                # Attach per-class scores to the raw test rows and persist.
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                #df['label_2']=logits[:,2]
                df[['id', 'entity', 'label_0',
                    'label_1']].to_csv(os.path.join(args.output_dir,
                                                    "test_pb.csv"),
                                       index=False)
def main():
    """Train and evaluate a BERT multiple-choice model (SWAG-style driver).

    Parses command-line flags, configures (optionally distributed) CUDA and
    seeding, optionally fine-tunes ``BertForMultipleChoice`` on ``train.csv``,
    saves/reloads the checkpoint, and optionally evaluates on ``val.csv``,
    writing metrics to ``eval_results.txt`` in the output directory.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .csv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    # Setup CUDA / distributed backend.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # Per-forward-pass batch; the effective update batch stays at the
    # user-specified --train_batch_size via accumulation.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_swag_examples(os.path.join(
            args.data_dir, 'train.csv'),
                                            is_training=True)
        # Optimizer steps = batches / accumulation, over all epochs.
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model (4 answer choices per question, SWAG-style).
    model = BertForMultipleChoice.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank),
        num_choices=4)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    # No weight decay on biases and LayerNorm parameters.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = num_train_steps
    if args.local_rank != -1:
        # Each distributed worker takes a share of the total steps.
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        # BertAdam handles LR warmup internally (see fp16 branch in the loop).
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      tokenizer,
                                                      args.max_seq_length,
                                                      True)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor(select_field(train_features,
                                                  'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features,
                                                   'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features,
                                                    'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features],
                                 dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    # NOTE(review): warmup_linear is assumed to be a
                    # module-level helper — confirm it exists in this file's
                    # imports; BertAdam already warms up internally, so this
                    # manual LR write is presumably intended for the fp16 path.
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model it-self
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    model = BertForMultipleChoice.from_pretrained(args.bert_model,
                                                  state_dict=model_state_dict,
                                                  num_choices=4)
    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = read_swag_examples(os.path.join(
            args.data_dir, 'val.csv'),
                                           is_training=True)
        eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                     args.max_seq_length, True)
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features,
                                                   'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features,
                                                    'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features],
                                 dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask,
                                      label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        # NOTE(review): tr_loss / nb_tr_steps are only assigned inside the
        # --do_train block; running with --do_eval alone would raise a
        # NameError here — confirm the intended usage requires both flags.
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': tr_loss / nb_tr_steps
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
class Trainer:
    """Semantic-segmentation trainer: builds data loaders, model, optimizer and
    apex/Amp wrappers, then runs per-epoch training/validation with
    TensorBoard logging and checkpointing.

    NOTE(review): relies on project helpers (Saver, TensorboardSummary,
    make_data_loader, get_model, get_optimizer, metric, Bar, AverageMeter,
    decode_segmap, make_sure_path_exists) and on apex/amp/DDP being importable.
    """

    def __init__(self, args):
        self.args = args
        self.start_epoch = 1
        self.epochs = self.args.epochs
        self.best_pred = 0
        # Lightweight bundle of the three metrics tracked per split.
        self.Metric = namedtuple('Metric', 'pixacc miou kappa')

        # Define Saver (checkpoints + experiment config snapshot).
        self.saver = Saver(args)
        self.saver.save_experiment_config()

        # Define Tensorboard Summary.
        self.summary = TensorboardSummary(self.saver.experiment_dir)

        # Define Dataloader.
        train_set, val_set, self.num_classes = make_data_loader(
            dataset_name=self.args.dataset,
            base_size=self.args.base_size,
            crop_size=self.args.crop_size,
        )
        # Channel count and normalization stats come from the dataset itself.
        self.in_c = train_set.in_c
        self.mean = train_set.mean
        self.std = train_set.std

        train_sampler = None
        val_sampler = None
        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_set)
            val_sampler = torch.utils.data.distributed.DistributedSampler(
                val_set)
        # shuffle must be False whenever a DistributedSampler is supplied.
        self.train_loader = DataLoader(train_set,
                                       batch_size=self.args.batch_size,
                                       shuffle=(train_sampler is None),
                                       pin_memory=True,
                                       num_workers=self.args.num_workers,
                                       sampler=train_sampler)
        self.val_loader = DataLoader(val_set,
                                     batch_size=self.args.val_batch_size,
                                     shuffle=False,
                                     pin_memory=True,
                                     num_workers=self.args.num_workers,
                                     sampler=val_sampler)

        # Define network.
        print(f"=> creating model '{self.args.model}'", end=": ")
        self.model = get_model(model_name=self.args.model,
                               backbone=self.args.backbone,
                               num_classes=self.num_classes,
                               in_c=self.in_c)
        print('Total params: %.2fM' %
              (sum(p.numel() for p in self.model.parameters()) / 1000000.0))

        print(f"=> creating optimizer '{self.args.optim}'")
        self.optimizer = get_optimizer(optim_name=self.args.optim,
                                       parameters=self.model.parameters(),
                                       lr=self.args.lr)

        if self.args.check_point_id is not None:
            print(
                f"=> reload parameter from experiment_{self.args.check_point_id}"
            )
            self.best_pred, self.start_epoch, model_state_dict, optimizer_state_dict = self.saver.load_checkpoint(
            )
            # strict=False tolerates head/backbone mismatches when resuming.
            self.model.load_state_dict(model_state_dict, strict=False)
            self.optimizer.load_state_dict(optimizer_state_dict)

        # self.criterion = loss.CrossEntropyLossWithOHEM(0.7)
        print(f"=> creating criterion 'CrossEntropyLoss'")
        # 255 is the conventional "ignore" label for unannotated pixels.
        self.criterion = torch.nn.CrossEntropyLoss(ignore_index=255)

        print(f"=> creating scheduler 'ReduceLROnPlateau'")
        # 'max' mode: the monitored quantity (mIoU) should increase.
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, 'max', factor=0.8, patience=3, verbose=True)

        if args.sync_bn:
            print("=> using apex synced BN")
            self.model = apex.parallel.convert_syncbn_model(self.model)

        print("\n=> apex\n")
        self.model = self.model.cuda()
        self.criterion.cuda()
        # Initialize Amp. Amp accepts either values or strings for the optional
        # override arguments, for convenient interoperation with argparse.
        self.model, self.optimizer = amp.initialize(
            self.model,
            self.optimizer,
            opt_level=self.args.opt_level,
            keep_batchnorm_fp32=self.args.keep_batchnorm_fp32,
            loss_scale=self.args.loss_scale)

        # For distributed training, wrap the model with
        # apex.parallel.DistributedDataParallel. This must be done AFTER the
        # call to amp.initialize: wrapping first would let amp.initialize alter
        # parameter types in a way that breaks DDP's allreduce hooks.
        if args.distributed:
            # delay_allreduce delays all communication to the end of the
            # backward pass instead of overlapping it with computation.
            self.model = DDP(self.model, delay_allreduce=True)

        self.train_metric = self.Metric(miou=metric.MeanIoU(self.num_classes),
                                        pixacc=metric.PixelAccuracy(),
                                        kappa=metric.Kappa(self.num_classes))
        self.valid_metric = self.Metric(miou=metric.MeanIoU(self.num_classes),
                                        pixacc=metric.PixelAccuracy(),
                                        kappa=metric.Kappa(self.num_classes))

    def training(self, epoch):
        """Run one training epoch; logs per-iteration loss and epoch metrics."""
        self.train_metric.miou.reset()
        self.train_metric.kappa.reset()
        self.train_metric.pixacc.reset()
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        end = time.time()
        num_img_tr = len(self.train_loader)
        bar = Bar('Processing', max=num_img_tr)
        self.model.train()
        for batch_idx, sample in enumerate(self.train_loader):
            image, target = sample['image'], sample['label']
            data_time.update(time.time() - end)
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.optimizer.zero_grad()
            output = self.model(image)
            loss = self.criterion(output, target)
            self.train_metric.miou.update(output, target)
            self.train_metric.kappa.update(output, target)
            self.train_metric.pixacc.update(output, target)
            if self.args.apex:
                from apex import amp
                # Amp scales the loss to keep fp16 gradients in range.
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            # if hasattr(torch.cuda, 'empty_cache'):
            #     torch.cuda.empty_cache()
            self.optimizer.step()
            losses.update(loss.item())
            self.summary.writer.add_scalar('total_loss_iter', losses.avg,
                                           batch_idx + num_img_tr * epoch)
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            # plot progress
            bar.suffix = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | Acc: {Acc: .4f} | mIoU: {mIoU: .4f}'.format(
                batch=batch_idx + 1,
                size=len(self.train_loader),
                data=data_time.avg,
                bt=batch_time.avg,
                total=bar.elapsed_td,
                eta=bar.eta_td,
                loss=losses.avg,
                mIoU=self.train_metric.miou.get(),
                Acc=self.train_metric.pixacc.get(),
            )
            bar.next()
        bar.finish()
        self.summary.writer.add_scalar("learning_rate",
                                       self.optimizer.param_groups[0]['lr'],
                                       epoch)
        self.summary.writer.add_scalars('metric/loss_epoch',
                                        {"train": losses.avg}, epoch)
        self.summary.writer.add_scalars(
            'metric/mIoU', {"train": self.train_metric.miou.get()}, epoch)
        self.summary.writer.add_scalars(
            'metric/Acc', {"train": self.train_metric.pixacc.get()}, epoch)
        self.summary.writer.add_scalars(
            'metric/kappa', {"train": self.train_metric.kappa.get()}, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, num_img_tr * self.args.batch_size))
        print('Train Loss: %.3f' % losses.avg)

    def validation(self, epoch):
        """Run one validation epoch, save a checkpoint (best by mIoU) and
        return the new mIoU."""
        self.valid_metric.miou.reset()
        self.valid_metric.kappa.reset()
        self.valid_metric.pixacc.reset()
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        end = time.time()
        num_img_tr = len(self.val_loader)
        bar = Bar('Processing', max=num_img_tr)
        self.model.eval()
        for batch_idx, sample in enumerate(self.val_loader):
            image, target = sample['image'], sample['label']
            data_time.update(time.time() - end)
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            losses.update(loss.item())
            self.valid_metric.miou.update(output, target)
            self.valid_metric.kappa.update(output, target)
            self.valid_metric.pixacc.update(output, target)
            self.visualize_batch_image(image, target, output, epoch, batch_idx)
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            # plot progress
            # BUGFIX: report the validation loader's size here, not the
            # training loader's (copy-paste from training()).
            bar.suffix = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | Acc: {Acc: .4f} | mIoU: {mIoU: .4f}'.format(
                batch=batch_idx + 1,
                size=len(self.val_loader),
                data=data_time.avg,
                bt=batch_time.avg,
                total=bar.elapsed_td,
                eta=bar.eta_td,
                loss=losses.avg,
                mIoU=self.valid_metric.miou.get(),
                Acc=self.valid_metric.pixacc.get(),
            )
            bar.next()
        bar.finish()
        # Fast test during the training
        new_pred = self.valid_metric.miou.get()
        metric_str = "Acc:{:.4f}, mIoU:{:.4f}, kappa: {:.4f}".format(
            self.valid_metric.pixacc.get(), new_pred,
            self.valid_metric.kappa.get())
        self.summary.writer.add_scalars('metric/loss_epoch',
                                        {"valid": losses.avg}, epoch)
        self.summary.writer.add_scalars('metric/mIoU', {"valid": new_pred},
                                        epoch)
        self.summary.writer.add_scalars(
            'metric/Acc', {"valid": self.valid_metric.pixacc.get()}, epoch)
        self.summary.writer.add_scalars(
            'metric/kappa', {"valid": self.valid_metric.kappa.get()}, epoch)
        print('Validation:')
        print(
            f"[Epoch: {epoch}, numImages: {num_img_tr * self.args.batch_size}]"
        )
        print(f'Valid Loss: {losses.avg:.4f}')
        is_best = new_pred > self.best_pred
        self.best_pred = max(new_pred, self.best_pred)
        train_metric_str = "Acc:{:.4f}, mIoU:{:.4f}, kappa: {:.4f}".format(
            self.train_metric.pixacc.get(), self.train_metric.miou.get(),
            self.train_metric.kappa.get())
        save_message = f"train\t\t{train_metric_str}\t\t val\t\t{metric_str}"
        # BUGFIX: `is 0` compared identity with an int literal; use equality.
        # Only rank 0 writes checkpoints in distributed runs.
        if self.args.local_rank == 0:
            self.saver.save_checkpoint(
                {
                    'epoch': epoch,
                    'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best, save_message)
        return new_pred

    def auto_reset_learning_rate(self):
        """Restore the base LR once the scheduler has decayed it below 1e-4."""
        if self.optimizer.param_groups[0]['lr'] <= 1e-4:
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.args.lr

    def get_lr(self):
        """Return the current learning rate of the first parameter group."""
        return self.optimizer.param_groups[0]['lr']

    def visualize_batch_image(self, image, target, output, epoch,
                              batch_index):
        """Save side-by-side (input | ground truth | prediction) figures for up
        to the first 3 images of a validation batch."""
        # image (B,C,H,W) To (B,H,W,C)
        image_np = image.cpu().numpy()
        image_np = np.transpose(image_np, axes=[0, 2, 3, 1])
        # Undo dataset normalization back to 0-255 pixel values.
        image_np *= self.std
        image_np += self.mean
        image_np *= 255.0
        image_np = image_np.astype(np.uint8)
        # target (B,H,W)
        target = target.cpu().numpy()
        # output (B,C,H,W) to (B,H,W)
        output = torch.argmax(output, dim=1).cpu().numpy()
        for i in range(min(3, image_np.shape[0])):
            img_tmp = image_np[i]
            img_rgb_tmp = np.array(
                Image.fromarray(img_tmp).convert("RGB")).astype(np.uint8)
            target_rgb_tmp = decode_segmap(target[i],
                                           self.num_classes).astype(np.uint8)
            output_rgb_tmp = decode_segmap(output[i],
                                           self.num_classes).astype(np.uint8)
            plt.figure()
            plt.title('display')
            plt.subplot(131)
            plt.imshow(img_rgb_tmp, vmin=0, vmax=255)
            plt.subplot(132)
            plt.imshow(target_rgb_tmp, vmin=0, vmax=255)
            plt.subplot(133)
            plt.imshow(output_rgb_tmp, vmin=0, vmax=255)
            path = os.path.join(self.saver.experiment_dir, "vis_image",
                                f'epoch_{epoch}')
            make_sure_path_exists(path)
            plt.savefig(f"{path}/{batch_index}-{i}.jpg")
            plt.close('all')
def main():
    """CLI entry point for the BERT CoQA yes/no pipeline.

    Depending on flags it (a) trains the model with per-epoch evaluation and
    best-checkpoint saving, (b) writes yes/no predictions for the dev set, and
    (c) labels evidence sentence ids on the training set.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--vocab_file", default='bert-base-uncased-vocab.txt', type=str, required=True)
    parser.add_argument("--model_file", default='bert-base-uncased.tar.gz', type=str, required=True)
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")
    parser.add_argument("--predict_dir", default=None, type=str, required=True,
                        help="The output directory where the predictions will be written.")
    parser.add_argument('--predict_output_file', default='predictions.json', type=str)

    # Other parameters
    parser.add_argument("--train_file", default=None, type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                        "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                        "be truncated to this length.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=2.0, type=float,
                        help="Total number of training epochs to perform.")
    # BUGFIX: argparse interpolates help strings with the %-operator, so a
    # bare "%" crashed `--help`; it must be escaped as "%%".
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
                        "of training.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                        "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                        "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")

    # Base setting
    parser.add_argument('--pretrain', type=str, default=None)
    parser.add_argument('--max_ctx', type=int, default=2)
    parser.add_argument('--task_name', type=str, default='coqa_yesno')
    parser.add_argument('--bert_name', type=str, default='baseline')
    parser.add_argument('--reader_name', type=str, default='coqa')

    # model parameters
    parser.add_argument('--evidence_lambda', type=float, default=0.8)

    # Parameters for running labeling model
    parser.add_argument('--do_label', default=False, action='store_true')
    # BUGFIX: `args.only_correct` is passed to predict_sentence_ids() below but
    # was never declared, crashing the --do_label path with AttributeError.
    parser.add_argument('--only_correct', default=False, action='store_true')
    parser.add_argument('--sentence_id_files', nargs='*')
    parser.add_argument('--weight_threshold', type=float, default=0.0)
    parser.add_argument('--label_threshold', type=float, default=0.0)

    args = parser.parse_args()

    logger = setting_logger(args.output_dir)
    logger.info('================== Program start. ========================')

    # model parameters
    model_params = prepare_model_params(args)
    read_params = prepare_read_params(args)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # The effective per-step batch size shrinks with accumulation.
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified.")

    if args.do_train:
        if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
            # BUGFIX: the original message had empty parentheses; include the
            # offending path so the error is actionable.
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".format(args.output_dir))
        os.makedirs(args.output_dir, exist_ok=True)

    if args.do_predict:
        os.makedirs(args.predict_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.vocab_file)

    data_reader = initialize_reader(args.reader_name)

    num_train_steps = None
    if args.do_train or args.do_label:
        train_examples = data_reader.read(input_file=args.train_file, **read_params)

        # Features are cached per (model, length/stride/ctx/task) combination.
        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}_{4}_{5}'.format(
            args.bert_model, str(args.max_seq_length), str(args.doc_stride),
            str(args.max_query_length), str(args.max_ctx), str(args.task_name))
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except FileNotFoundError:
            train_features = data_reader.convert_examples_to_features(examples=train_examples, tokenizer=tokenizer,
                                                                      max_seq_length=args.max_seq_length,
                                                                      doc_stride=args.doc_stride,
                                                                      max_query_length=args.max_query_length,
                                                                      is_training=True)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                # Only one process writes the cache in distributed runs.
                logger.info(" Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        num_train_steps = int(len(train_features) / args.train_batch_size /
                              args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    if args.pretrain is not None:
        logger.info('Load pretrained model from {}'.format(args.pretrain))
        model_state_dict = torch.load(args.pretrain, map_location='cuda:0')
        model = initialize_model(args.bert_name, args.model_file, state_dict=model_state_dict, **model_params)
    else:
        model = initialize_model(args.bert_name, args.model_file, **model_params)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    # Prepare data
    eval_examples = data_reader.read(input_file=args.predict_file, **read_params)
    eval_features = data_reader.convert_examples_to_features(examples=eval_examples, tokenizer=tokenizer,
                                                             max_seq_length=args.max_seq_length,
                                                             doc_stride=args.doc_stride,
                                                             max_query_length=args.max_query_length,
                                                             is_training=False)

    eval_tensors = data_reader.data_to_tensors(eval_features)
    eval_data = TensorDataset(*eval_tensors)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

    if args.do_train:
        if args.do_label:
            logger.info('Training in State Wise.')
            raise RuntimeError("Not Implement Error.")
        else:
            logger.info('Training in traditional way.')

        logger.info("Start training")
        train_loss = AverageMeter()
        best_acc = 0.0
        summary_writer = SummaryWriter(log_dir=args.output_dir)
        global_step = 0
        eval_loss = AverageMeter()

        train_tensors = data_reader.data_to_tensors(train_features)
        train_data = TensorDataset(*train_tensors)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            # Train
            model.train()
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = batch_to_device(batch, device)  # multi-gpu does scattering it-self
                inputs = data_reader.generate_inputs(batch, model_state=ModelState.Train)
                loss, _ = model(**inputs)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                    train_loss.update(loss.item(), args.train_batch_size)
                    summary_writer.add_scalar('train_loss', train_loss.avg, global_step)

            # Evaluation
            model.eval()
            all_results = []
            logger.info("Start evaluating")
            for eval_step, batch in enumerate(tqdm(eval_dataloader, desc="Evaluating")):
                if n_gpu == 1:
                    batch = batch_to_device(batch, device)  # multi-gpu does scattering it-self
                inputs = data_reader.generate_inputs(batch, model_state=ModelState.Evaluate)
                with torch.no_grad():
                    loss, batch_choice_logits = model(**inputs)
                    eval_loss.update(loss.item(), args.predict_batch_size)
                    summary_writer.add_scalar('eval_loss', eval_loss.avg,
                                              epoch * len(eval_dataloader) + eval_step)
                example_indices = batch[-1]
                for i, example_index in enumerate(example_indices):
                    choice_logits = batch_choice_logits[i].detach().cpu().tolist()
                    eval_feature = eval_features[example_index.item()]
                    unique_id = int(eval_feature.unique_id)
                    all_results.append(RawResultChoice(unique_id=unique_id, choice_logits=choice_logits))

            data_reader.write_predictions(eval_examples, eval_features, all_results, None,
                                          null_score_diff_threshold=0.0)
            yes_metric = data_reader.yesno_cate.f1_measure('yes', 'no')
            no_metric = data_reader.yesno_cate.f1_measure('no', 'yes')
            current_acc = yes_metric['accuracy']
            summary_writer.add_scalar('eval_yes_f1', yes_metric['f1'], epoch)
            summary_writer.add_scalar('eval_yes_recall', yes_metric['recall'], epoch)
            summary_writer.add_scalar('eval_yes_precision', yes_metric['precision'], epoch)
            summary_writer.add_scalar('eval_no_f1', no_metric['f1'], epoch)
            summary_writer.add_scalar('eval_no_recall', no_metric['recall'], epoch)
            summary_writer.add_scalar('eval_no_precision', no_metric['precision'], epoch)
            summary_writer.add_scalar('eval_yesno_acc', current_acc, epoch)
            torch.cuda.empty_cache()

            if current_acc > best_acc:
                best_acc = current_acc
                # Only save the model it-self (unwrap DataParallel/DDP).
                model_to_save = model.module if hasattr(model, 'module') else model
                output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
            logger.info('Epoch: %d, Accuracy: %f (Best Accuracy: %f)' % (epoch, current_acc, best_acc))
            data_reader.yesno_cate.reset()

        summary_writer.close()

    # Loading trained model.
    # NOTE(review): mangled source makes the indentation here ambiguous; this
    # is placed at function level so predict-only runs also load the best
    # checkpoint — confirm against the upstream script.
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    model_state_dict = torch.load(output_model_file, map_location='cuda:0')
    model = initialize_model(args.bert_name, args.model_file, state_dict=model_state_dict, **model_params)
    model.to(device)

    # Write Yes/No predictions
    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        test_examples = eval_examples
        test_features = eval_features

        test_tensors = data_reader.data_to_tensors(test_features)
        test_data = TensorDataset(*test_tensors)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.predict_batch_size)

        logger.info("***** Running predictions *****")
        logger.info(" Num orig examples = %d", len(test_examples))
        logger.info(" Num split examples = %d", len(test_features))
        logger.info(" Batch size = %d", args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start predicting yes/no on Dev set.")
        for batch in tqdm(test_dataloader, desc="Testing"):
            if n_gpu == 1:
                batch = batch_to_device(batch, device)  # multi-gpu does scattering it-self
            inputs = data_reader.generate_inputs(batch, test_features, do_label=args.do_label, is_training=False)
            with torch.no_grad():
                batch_choice_logits, _ = model(inputs['input_ids'], inputs['token_type_ids'],
                                               inputs['attention_mask'],
                                               sentence_span_list=inputs['sentence_span_list'])
            example_indices = batch[-1]
            for i, example_index in enumerate(example_indices):
                choice_logits = batch_choice_logits[i].detach().cpu().tolist()
                test_feature = test_features[example_index.item()]
                unique_id = int(test_feature.unique_id)
                all_results.append(RawResultChoice(unique_id=unique_id, choice_logits=choice_logits))

        output_prediction_file = os.path.join(args.predict_dir, 'predictions.json')
        data_reader.write_predictions(eval_examples, eval_features, all_results, output_prediction_file,
                                      null_score_diff_threshold=0.0)
        yes_metric = data_reader.yesno_cate.f1_measure('yes', 'no')
        no_metric = data_reader.yesno_cate.f1_measure('no', 'yes')
        logger.info('Yes Metrics: %s' % json.dumps(yes_metric, indent=2))
        logger.info('No Metrics: %s' % json.dumps(no_metric, indent=2))

    # Labeling sentence id.
    if args.do_label and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        test_examples = train_examples
        test_features = train_features

        test_tensors = data_reader.data_to_tensors(test_features)
        test_data = TensorDataset(*test_tensors)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.predict_batch_size)

        logger.info("***** Running labeling *****")
        logger.info(" Num orig examples = %d", len(test_examples))
        logger.info(" Num split examples = %d", len(test_features))
        logger.info(" Batch size = %d", args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start labeling.")
        for batch in tqdm(test_dataloader, desc="Testing"):
            if n_gpu == 1:
                batch = batch_to_device(batch, device)
            inputs = data_reader.generate_inputs(batch, test_features, do_label=args.do_label, is_training=False)
            with torch.no_grad():
                batch_choice_logits, batch_max_weight_indexes = model(inputs['input_ids'],
                                                                      inputs['token_type_ids'],
                                                                      inputs['attention_mask'],
                                                                      sentence_span_list=inputs['sentence_span_list'])
            example_indices = batch[-1]
            for i, example_index in enumerate(example_indices):
                choice_logits = batch_choice_logits[i].detach().cpu().tolist()
                max_weight_index = batch_max_weight_indexes[1][i].detach().cpu().tolist()
                max_weight = batch_max_weight_indexes[0][i].detach().cpu().tolist()
                test_feature = test_features[example_index.item()]
                unique_id = int(test_feature.unique_id)
                all_results.append(
                    WeightResultChoice(unique_id=unique_id, choice_logits=choice_logits,
                                       max_weight_index=max_weight_index, max_weight=max_weight))

        output_prediction_file = os.path.join(args.predict_dir, 'sentence_id_file.json')
        data_reader.predict_sentence_ids(test_examples, test_features, all_results, output_prediction_file,
                                         weight_threshold=args.weight_threshold,
                                         only_correct=args.only_correct,
                                         label_threshold=args.label_threshold)
def main():
    """Evaluate a (VIL)BERT model on caption-to-image retrieval tasks.

    Parses command-line arguments, loads the task configuration from
    ``vlbert_tasks.yml``, builds the evaluation dataloaders, restores the
    pretrained model, and scores each caption against its 1000 candidate
    images (processed in two chunks of 500), reporting R@1/R@5/R@10,
    median rank and mean rank. Top-20 ranked image indices per caption are
    dumped to ``<savePath>/<split>_result.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="results",
        type=str,
        help="The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        help="The config file which specified the model details.",
    )
    parser.add_argument(
        "--no_cuda", action="store_true", help="Whether not to use CUDA when available"
    )
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help="Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument(
        "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument(
        "--num_workers", type=int, default=16, help="Number of workers in the dataloader."
    )
    parser.add_argument(
        "--save_name",
        default='',
        type=str,
        help="save name for training.",
    )
    parser.add_argument(
        "--tasks", default='', type=str, help="1-2-3... training task separate by -"
    )
    parser.add_argument(
        "--in_memory", default=False, type=bool, help="whether use chunck for parallel training."
    )
    parser.add_argument(
        "--baseline", action="store_true", help="whether use single stream baseline."
    )
    parser.add_argument(
        "--zero_shot", action="store_true", help="whether use single stream baseline."
    )
    parser.add_argument(
        "--split", default="", type=str, help="which split to use."
    )
    parser.add_argument(
        "--batch_size", default=1, type=int, help="which split to use."
    )
    args = parser.parse_args()

    with open('vlbert_tasks.yml', 'r') as f:
        task_cfg = edict(yaml.safe_load(f))

    # Seed every RNG so the evaluation run is reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # "baseline" selects the single-stream BERT config over the two-stream ViLBERT one.
    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
    else:
        from vilbert.vilbert import BertConfig

    task_names = []
    for i, task_id in enumerate(args.tasks.split('-')):
        task = 'TASK' + task_id
        name = task_cfg[task]['name']
        task_names.append(name)

    # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0]
    if '/' in args.from_pretrained:
        timeStamp = args.from_pretrained.split('/')[1]
    else:
        timeStamp = args.from_pretrained
    savePath = os.path.join(args.output_dir, timeStamp)

    config = BertConfig.from_json_file(args.config_file)
    # Use a context manager so the handle is closed (the original leaked it).
    with open("config/" + args.bert_model + "_weight_name.json", "r") as f:
        bert_weight_name = json.load(f)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16
        )
    )

    # Only rank 0 (or the single non-distributed process) creates directories
    # and writes output files.
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu and not os.path.exists(savePath):
        os.makedirs(savePath)

    task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val \
        = LoadDatasetEval(args, task_cfg, args.tasks.split('-'))

    num_labels = max([dataset.num_labels for dataset in task_datasets_val.values()])

    config.fast_mode = True
    if args.zero_shot:
        model = BertForMultiModalPreTraining.from_pretrained(args.from_pretrained, config)
    else:
        model = VILBertForVLTasks.from_pretrained(
            args.from_pretrained, config, num_labels=num_labels, default_gpu=default_gpu
        )

    task_losses = LoadLosses(args, task_cfg, args.tasks.split('-'))
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        # BUG FIX: the keyword was misspelled "deay_allreduce", which raises a
        # TypeError as soon as distributed evaluation is requested. The apex
        # DDP argument is "delay_allreduce" (spelled correctly elsewhere in
        # this file).
        model = DDP(model, delay_allreduce=True)
    elif n_gpu > 1:
        model = nn.DataParallel(model)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    print(" Num Iters: ", task_num_iters)
    print(" Batch size: ", task_batch_size)

    model.eval()
    # when run evaluate, we run each task sequentially.
    for task_id in task_ids:
        results = []
        others = []
        # 5000 captions x 1000 candidate images; each caption's images arrive
        # in two chunks of 500 (image_idx 0 and 1) — presumably Flickr30k-style
        # retrieval, TODO confirm against the dataset loader.
        score_matrix = np.zeros((5000, 1000))
        target_matrix = np.zeros((5000, 1000))
        rank_matrix = np.ones((5000)) * 1000
        count = 0
        for i, batch in enumerate(task_dataloader_val[task_id]):
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch)
            features, spatials, image_mask, question, input_mask, segment_ids, target, caption_idx, image_idx = batch
            if task_id in ['TASK3']:
                batch_size = features.size(0)
                features = features.squeeze(0)
                spatials = spatials.squeeze(0)
                image_mask = image_mask.squeeze(0)
            with torch.no_grad():
                if args.zero_shot:
                    # Zero-shot: use the alignment head's softmax probability
                    # of the "aligned" class as the retrieval score.
                    _, _, vil_logit, _ = model(question, features, spatials, segment_ids, input_mask, image_mask)
                    score_matrix[caption_idx, image_idx * 500:(image_idx + 1) * 500] = \
                        torch.softmax(vil_logit, dim=1)[:, 0].view(-1).cpu().numpy()
                    target_matrix[caption_idx, image_idx * 500:(image_idx + 1) * 500] = \
                        target.view(-1).float().cpu().numpy()
                else:
                    # Fine-tuned: the raw vision-language logit is the score.
                    _, vil_logit, _, _, _, _, _ = model(question, features, spatials, segment_ids, input_mask, image_mask)
                    score_matrix[caption_idx, image_idx * 500:(image_idx + 1) * 500] = \
                        vil_logit.view(-1).cpu().numpy()
                    target_matrix[caption_idx, image_idx * 500:(image_idx + 1) * 500] = \
                        target.view(-1).float().cpu().numpy()

                # image_idx == 1 means both chunks for this caption have been
                # scored: compute the rank of the ground-truth image and the
                # running recall metrics.
                if image_idx.item() == 1:
                    rank = np.where((np.argsort(-score_matrix[caption_idx]) ==
                                     np.where(target_matrix[caption_idx] == 1)[0][0]) == 1)[0][0]
                    rank_matrix[caption_idx] = rank

                    rank_matrix_tmp = rank_matrix[:caption_idx + 1]
                    r1 = 100.0 * np.sum(rank_matrix_tmp < 1) / len(rank_matrix_tmp)
                    r5 = 100.0 * np.sum(rank_matrix_tmp < 5) / len(rank_matrix_tmp)
                    r10 = 100.0 * np.sum(rank_matrix_tmp < 10) / len(rank_matrix_tmp)
                    medr = np.floor(np.median(rank_matrix_tmp) + 1)
                    meanr = np.mean(rank_matrix_tmp) + 1
                    print("%d Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f"
                          % (count, r1, r5, r10, medr, meanr))

                    results.append(np.argsort(-score_matrix[caption_idx]).tolist()[:20])
                    count += 1

        # Final metrics over all captions for this task.
        r1 = 100.0 * np.sum(rank_matrix < 1) / len(rank_matrix)
        r5 = 100.0 * np.sum(rank_matrix < 5) / len(rank_matrix)
        r10 = 100.0 * np.sum(rank_matrix < 10) / len(rank_matrix)
        medr = np.floor(np.median(rank_matrix) + 1)
        meanr = np.mean(rank_matrix) + 1
        print("************************************************")
        print("Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" % (r1, r5, r10, medr, meanr))
        print("************************************************")

        if args.split:
            json_path = os.path.join(savePath, args.split)
        else:
            json_path = os.path.join(savePath, task_cfg[task_id]['val_split'])
        # Close the output files deterministically (bare open() calls leaked
        # the handles and risked losing buffered output).
        with open(json_path + '_result.json', 'w') as f:
            json.dump(results, f)
        with open(json_path + '_others.json', 'w') as f:
            json.dump(others, f)
def main():
    """Evaluate a pretrained (base-BERT or ViLBERT) model on the requested
    vision-and-language tasks.

    Builds the per-task validation dataloaders, restores the model weights
    from ``--from_pretrained``, runs ``EvaluatingModel`` over every batch
    while logging loss/score through ``tbLogger``, and dumps the collected
    predictions to ``<savePath>/<split>_result.json`` / ``_others.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="results",
        type=str,
        help="The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        help="The config file which specified the model details.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help="Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument("--num_workers",
                        type=int,
                        default=10,
                        help="Number of workers in the dataloader.")
    parser.add_argument(
        "--save_name",
        default='',
        type=str,
        help="save name for training.",
    )
    parser.add_argument("--batch_size",
                        default=1000,
                        type=int,
                        help="what is the batch size?")
    parser.add_argument("--tasks",
                        default='',
                        type=str,
                        help="1-2-3... training task separate by -")
    parser.add_argument("--in_memory",
                        default=False,
                        type=bool,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--baseline",
                        action="store_true",
                        help="whether use single stream baseline.")
    parser.add_argument("--split", default="", type=str, help="which split to use.")
    args = parser.parse_args()

    with open('vlbert_tasks.yml', 'r') as f:
        task_cfg = edict(yaml.safe_load(f))

    # Seed every RNG so the evaluation run is reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # "baseline" selects the single-stream BERT model over two-stream ViLBERT.
    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from vilbert.basebert import BaseBertForVLTasks
    else:
        from vilbert.vilbert import BertConfig
        from vilbert.vilbert import VILBertForVLTasks

    task_names = []
    for i, task_id in enumerate(args.tasks.split('-')):
        task = 'TASK' + task_id
        name = task_cfg[task]['name']
        task_names.append(name)

    # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0]
    # NOTE(review): assumes --from_pretrained contains a '/' — an IndexError
    # is raised for a bare model name; confirm intended usage.
    timeStamp = args.from_pretrained.split('/')[1] + '-' + args.save_name
    savePath = os.path.join(args.output_dir, timeStamp)

    config = BertConfig.from_json_file(args.config_file)
    # Use a context manager so the handle is closed (the original leaked it).
    with open("config/" + args.bert_model + "_weight_name.json", "r") as f:
        bert_weight_name = json.load(f)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    # Only rank 0 (or the single non-distributed process) writes outputs.
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu and not os.path.exists(savePath):
        os.makedirs(savePath)

    task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val \
        = LoadDatasetEval(args, task_cfg, args.tasks.split('-'))

    tbLogger = utils.tbLogger(timeStamp,
                              savePath,
                              task_names,
                              task_ids,
                              task_num_iters,
                              1,
                              save_logger=False,
                              txt_name='eval.txt')

    num_labels = max([dataset.num_labels for dataset in task_datasets_val.values()])

    if args.baseline:
        model = BaseBertForVLTasks.from_pretrained(args.from_pretrained,
                                                   config,
                                                   num_labels=num_labels,
                                                   default_gpu=default_gpu)
    else:
        model = VILBertForVLTasks.from_pretrained(args.from_pretrained,
                                                  config,
                                                  num_labels=num_labels,
                                                  default_gpu=default_gpu)

    task_losses = LoadLosses(args, task_cfg, args.tasks.split('-'))
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)
    elif n_gpu > 1:
        model = nn.DataParallel(model)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    print(" Num Iters: ", task_num_iters)
    print(" Batch size: ", task_batch_size)
    model.eval()

    for task_id in task_ids:
        results = []
        others = []
        for i, batch in tqdm(enumerate(task_dataloader_val[task_id])):
            # EvaluatingModel accumulates into (and returns) results/others.
            loss, score, batch_size, results, others = EvaluatingModel(
                args, task_cfg, device, task_id, batch, model,
                task_dataloader_val, task_losses, results, others)

            tbLogger.step_val(0, float(loss), float(score), task_id, batch_size, 'val')
            sys.stdout.write('%d/%d\r' % (i, len(task_dataloader_val[task_id])))
            sys.stdout.flush()

        # save the result or evaluate the result.
        ave_score = tbLogger.showLossVal()
        if args.split:
            json_path = os.path.join(savePath, args.split)
        else:
            json_path = os.path.join(savePath, task_cfg[task_id]['val_split'])
        # BUG FIX: the original passed bare open() handles to json.dump and
        # never closed them — buffered output could be lost. Use context
        # managers so the files are flushed and closed deterministically.
        with open(json_path + '_result.json', 'w') as f:
            json.dump(results, f)
        with open(json_path + '_others.json', 'w') as f:
            json.dump(others, f)
'Train loss: %.3f; agg mIoU: %.3f' % (train_loss / (i_batch + 1), np.mean(np.nan_to_num(score_train["iou"])))) pass score_train, score_train_global, score_train_local = trainer.get_scores() trainer.reset_metrics() # torch.cuda.empty_cache() #print("Epoch:", epoch, "training finished on rank:", torch.distributed.get_rank()) torch.distributed.barrier() #print("Will start next training/testing on rank :", torch.distributed.get_rank()) if (epoch + 1) % 5 == 0: with torch.no_grad(): model_ddp.eval() print("\n" + "evaluating after epoch: ", epoch) if test: tbar = tqdm(dataloader_test) else: tbar = tqdm(dataloader_val) for i_batch, sample_batched in enumerate(tbar): predictions, predictions_global, predictions_local = evaluator.eval_test( sample_batched, model_ddp, global_fixed) score_val, score_val_global, score_val_local = evaluator.get_scores( ) if mode == 1: tbar.set_description( 'global mIoU: %.3f' %
def main():
    """Fine-tune and/or evaluate a BERT-based semantic role labeling model.

    With ``--do_train`` the SRL model is trained on the data in
    ``--data_dir`` and the weights/config are saved under ``--output_dir``.
    With ``--do_eval`` (and without ``--do_train``) a previously saved
    checkpoint is loaded from ``--output_dir`` and evaluated on the dev set,
    writing a classification report and per-example predictions.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--percent", default=100, type=int,
                        help="The percentage of examples used in the training data.\n")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--pretrain', action='store_true',
                        help="Whether to load a pre-trained model for continuing training")
    parser.add_argument('--pretrained_model_file', type=str,
                        help="The path of the pretrained_model_file")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"srl": SrlProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # The per-step batch is the requested total divided across accumulation steps.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        # Optionally sub-sample the training set (--percent).
        train_examples = train_examples[:int(len(train_examples) * args.percent / 100)]
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    model = BertForSRLSingleV2.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.pretrain:
        # Load a pre-trained model, keeping only tensors whose name and shape
        # match the current architecture.
        print('load a pre-trained model from ' + args.pretrained_model_file)
        pretrained_state_dict = torch.load(args.pretrained_model_file)
        model_state_dict = model.state_dict()
        print('pretrained_state_dict', pretrained_state_dict.keys())
        print('model_state_dict', model_state_dict.keys())
        pretrained_state = {k: v for k, v in pretrained_state_dict.items()
                            if k in model_state_dict and v.size() == model_state_dict[k].size()}
        print('pretrained_state', pretrained_state.keys())
        model_state_dict.update(pretrained_state)
        print('updated_state_dict', model_state_dict.keys())
        model.load_state_dict(model_state_dict)
        model.to(device)
        # Freeze the shared encoder and auxiliary ESRL head so only the new
        # task head is fine-tuned.
        for param in model.bert.parameters():
            param.requires_grad = False
        for param in model.hidden_ESRL.parameters():
            param.requires_grad = False

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: no weight decay on biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        # Eval-only runs have no training schedule; BertAdam still needs t_total.
        if num_train_optimization_steps is None:
            num_train_optimization_steps = 0
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_predicate_ids = torch.tensor([f.predicate_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_predicate_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, predicate_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, predicate_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            print('train loss', tr_loss)

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
        label_map = {i: label for i, label in enumerate(label_list, 0)}
        model_config = {"bert_model": args.bert_model, "do_lower": args.do_lower_case,
                        "max_seq_length": args.max_seq_length, "num_labels": len(label_list) + 1,
                        "label_map": label_map}
        # Use a context manager so the handle is closed (the original leaked it).
        with open(os.path.join(args.output_dir, "model_config.json"), "w") as f:
            json.dump(model_config, f)
    else:
        # Load a trained model and config that you have fine-tuned.
        # BUG FIX: the checkpoint was previously read from disk twice
        # (torch.load inside from_pretrained and again for load_state_dict);
        # load it once and reuse the state dict for both calls.
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        model_state_dict = torch.load(output_model_file)
        model = BertForSRLSingleV2.from_pretrained(args.bert_model,
                                                   state_dict=model_state_dict,
                                                   num_labels=num_labels)
        model.load_state_dict(model_state_dict)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_predicate_ids = torch.tensor([f.predicate_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_predicate_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 0)}
        for input_ids, predicate_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader,
                                                                                 desc="Evaluating"):
            input_ids = input_ids.to(device)
            predicate_ids = predicate_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, predicate_ids, segment_ids, input_mask)

            logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            input_mask = input_mask.to('cpu').numpy()
            # Reassemble per-sentence gold/predicted label sequences, dropping
            # the leading [CLS], WordPiece continuation labels ("X") and the
            # trailing [SEP].
            for i, mask in enumerate(input_mask):
                temp_1 = []
                temp_2 = []
                for j, m in enumerate(mask):
                    if j == 0:
                        continue
                    if m:
                        if label_map[label_ids[i][j]] != "X":
                            temp_1.append(label_map[label_ids[i][j]])
                            temp_2.append(label_map[logits[i][j]])
                    else:
                        # First padding position: the last appended label is
                        # [SEP]; remove it and stop.
                        temp_1.pop()
                        temp_2.pop()
                        break
                # Sequence filled max_seq_length exactly: [SEP] still present.
                if temp_1[-1] == '[SEP]':
                    temp_1.pop()
                    temp_2.pop()
                y_true.append(temp_1)
                y_pred.append(temp_2)

        report = classification_report(y_true, y_pred, digits=4)
        prediction_file = os.path.join(args.output_dir, 'predictions.txt')
        write_predictions(eval_examples, y_true, y_pred, prediction_file)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
def main():
    """Command-line entry point: fine-tune and/or evaluate BERT on the NSP task.

    Parses the standard pytorch-pretrained-bert flags, selects the device
    (CPU, single/multi-GPU DataParallel, or NCCL distributed), loads a
    ``BertForSequenceClassification`` model, optionally trains it (BertAdam,
    or apex FusedAdam/FP16_Optimizer when ``--fp16``), saves the weights to
    ``<output_dir>/nsp_model.pt``, and optionally evaluates on the dev set,
    writing metrics to ``<output_dir>/eval_results.txt``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    # NOTE(review): --bert_original is parsed but never read in this function —
    # confirm whether it is dead or consumed elsewhere.
    parser.add_argument("--bert_original",
                        action='store_true',
                        help="To run for original BERT")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    # Task registry: this script only handles next-sentence-prediction (2 labels).
    processors = {
        "nsp": NSPProcessor,
    }

    num_labels_task = {
        "nsp": 2,
    }

    # Device selection: plain (possibly multi-GPU) mode, or one GPU per
    # process when launched with a distributed local_rank.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    # Effective per-step batch size; gradients are accumulated to reach the
    # requested total. NOTE(review): unlike the sibling main() below, this one
    # does not validate gradient_accumulation_steps >= 1.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=True)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        # NOTE(review): int() wraps only the division, so multiplying by the
        # float num_train_epochs makes this a float — confirm downstream
        # consumers (BertAdam t_total, the %d log below) tolerate that.
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            # Each distributed worker only sees its shard of the data.
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model: download/load pretrained weights into a per-rank cache.
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(
            args.local_rank))
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    print('BERT original model loaded')
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: no weight decay on biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        # BertAdam applies the warmup-linear LR schedule internally.
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        # Stack all features into one tensor dataset.
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                # With labels supplied, the model returns the loss directly.
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    # FP16_Optimizer scales the loss before backward.
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        # NOTE(review): WarmupLinearSchedule is invoked like a
                        # function here; in pytorch-pretrained-bert the callable
                        # helper is warmup_linear(...) — confirm this name
                        # actually resolves/behaves as intended under --fp16.
                        lr_this_step = args.learning_rate * WarmupLinearSchedule(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # save model
        # NOTE(review): this saves the wrapper's state_dict, so under
        # DataParallel/DDP the keys carry a "module." prefix (the sibling
        # main() below unwraps model.module first) — confirm the loader
        # expects that. Placement inside `if args.do_train:` reconstructed
        # from mangled formatting.
        torch.save(model.state_dict(),
                   os.path.join(args.output_dir, 'nsp_model.pt'))

    # Evaluate on the dev set (rank 0 only when distributed).
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                # Two forward passes: one with labels for the loss, one
                # without for the logits.
                tmp_eval_loss = model(input_ids, segment_ids, input_mask,
                                      label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        # Loss is averaged per step, accuracy per example.
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    """Command-line entry point: fine-tune BERT sequence classification on
    OBQA/GLUE-style tasks with per-epoch checkpointing and best-model tracking.

    Parses training flags, sets up the device (CPU, DataParallel, or NCCL
    distributed), builds ``BertForSequenceClassification``, trains with an
    explicit CrossEntropyLoss and manual warmup-linear LR schedule, and after
    every epoch saves ``pytorch_model.bin.<ep>``, evaluates on both the dev
    ("Val") and test ("Test") splits, and keeps the best-on-Val checkpoint as
    ``best_model.bin``. Accumulated metrics are written to
    ``<output_dir>/eval_results.txt``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=1e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=8.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    # Task registry: processor class and label count per supported task.
    processors = {
        "obqa": OBQAProcessor,
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "snli": SnliProcessor,
        "qnli": QnliProcessor,
        "bow": BoWProcessor
    }

    num_labels_task = {
        "obqa": 2,
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,
        "snli": 3,
        "qnli": 2,
        "bow": 2
    }

    # Device selection: plain (possibly multi-GPU) mode, or one GPU per
    # process when launched with a distributed local_rank.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # Effective per-step batch size; gradients are accumulated to reach the
    # requested total.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    # Refuse to clobber a non-empty output directory when training.
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)
        # Test/dev splits are only consumed inside the training loop below.
        # NOTE(review): placement inside `if args.do_train:` reconstructed
        # from mangled formatting — confirm against the original file.
        test_examples = processor.get_test_examples(args.data_dir)
        dev_examples = processor.get_dev_examples(args.data_dir)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank),
        num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: no weight decay on biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = num_train_steps
    if args.local_rank != -1:
        # Each distributed worker only performs its share of the steps.
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    # Metrics accumulated across epochs/splits, flushed to disk at the end.
    result = {}
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        test_features = convert_examples_to_features(test_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        dev_features = convert_examples_to_features(dev_examples, label_list,
                                                    args.max_seq_length,
                                                    tokenizer)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)
        # NOTE(review): all three loaders use train_batch_size; eval_batch_size
        # is parsed but unused here — confirm that is intentional.
        train_dataloader = get_dataloader(train_features,
                                          args.train_batch_size)
        test_dataloader = get_dataloader(test_features, args.train_batch_size)
        dev_dataloader = get_dataloader(dev_features, args.train_batch_size)
        ep = 0
        output_model_file = "dummy"
        loss_fct = CrossEntropyLoss()
        # Best dev accuracy seen so far; drives best_model.bin checkpointing.
        max_acc = -1
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            ep += 1
            tqi = tqdm(train_dataloader, desc="Iteration")
            acc = 0
            for step, batch in enumerate(tqi):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                # Loss is computed explicitly here (model called without
                # labels), unlike the loss-returning form used elsewhere.
                logits = model(input_ids, segment_ids, input_mask)
                loss = loss_fct(logits.view(-1, num_labels),
                                label_ids.view(-1))
                # Track running training accuracy for the progress bar.
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                tmp_accuracy = accuracy(logits, label_ids)
                acc += tmp_accuracy
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    # FP16_Optimizer scales the loss before backward.
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                tqi.set_description("Loss:" + str(tr_loss / nb_tr_steps) +
                                    ",Acc:" + str(acc / nb_tr_examples))

            # Save a checkpoint for this epoch (unwrap DataParallel/DDP first).
            model_to_save = model.module if hasattr(
                model, 'module') else model  # Only save the model it-self
            output_model_file = os.path.join(args.output_dir,
                                             "pytorch_model.bin." + str(ep))
            torch.save(model_to_save.state_dict(), output_model_file)

            # Evaluate on both dev ("Val") and test ("Test") after each epoch.
            model.eval()
            names = ["Val", "Test"]
            for index, eval_dataloader in enumerate(
                    [dev_dataloader, test_dataloader]):
                print("Validating! " + names[index])
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                tqe = tqdm(eval_dataloader, desc="Evaluating")
                for input_ids, input_mask, segment_ids, label_ids in tqe:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        # Two forward passes: one with labels for the loss,
                        # one without for the logits.
                        tmp_eval_loss = model(input_ids, segment_ids,
                                              input_mask, label_ids)
                        logits = model(input_ids, segment_ids, input_mask)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    tmp_eval_accuracy = accuracy(logits, label_ids)

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1
                    tqe.set_description(names[index] + " Loss:" +
                                        str(eval_loss / nb_eval_steps) +
                                        ",Acc:" +
                                        str(eval_accuracy / nb_eval_examples))

                # Loss is averaged per step, accuracy per example.
                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = eval_accuracy / nb_eval_examples
                loss = tr_loss / nb_tr_steps if args.do_train else None
                # Key metrics by epoch and split, e.g. "eval_accuracy_3_Val".
                eps = str(ep) + "_" + names[index]
                result['eval_loss_' + eps] = eval_loss
                result['eval_accuracy_' + eps] = eval_accuracy
                result['global_step_' + eps] = global_step
                result['loss_' + eps] = loss
                print(result)
                if names[index] == "Val":
                    # Keep the checkpoint with the best dev accuracy so far.
                    max_acc = max(max_acc, eval_accuracy)
                    if eval_accuracy == max_acc:
                        print("Storing Current Best Model")
                        model_to_save = model.module if hasattr(
                            model,
                            'module') else model  # Only save the model it-self
                        output_model_file = os.path.join(
                            args.output_dir, "best_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)

    # Flush all accumulated metrics to eval_results.txt.
    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))