def save_checkpoint(model, tokenizer, args, epoch, global_step):
    checkpoint_dir = op.join(args.output_dir,
                             'checkpoint-{}-{}'.format(epoch, global_step))
    mkdir(checkpoint_dir)
    model_to_save = model.module if hasattr(model, 'module') else model
    save_num = 0
    while save_num < 10:
        try:
            model_to_save.save_pretrained(checkpoint_dir)
            torch.save(args, op.join(checkpoint_dir, 'training_args.bin'))
            tokenizer.save_pretrained(checkpoint_dir)
            logger.info("Save checkpoint to {}".format(checkpoint_dir))
            break
        except Exception:
            save_num += 1
    if save_num == 10:
        logger.info("Failed to save checkpoint after 10 tries.")
    return checkpoint_dir
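# The retry loop above guards against transient I/O failures (e.g. on shared
# network storage). A minimal sketch of a call site, assuming the usual epoch
# loop variables from the surrounding training code (`train_one_epoch` is a
# hypothetical helper, not from this file):
#
#   for epoch in range(int(args.num_train_epochs)):
#       global_step = train_one_epoch(model, train_dataloader, optimizer, args)
#       save_checkpoint(model, tokenizer, args, epoch, global_step)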
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default='./datasets/coco_ir/', type=str, required=False,
                        help="The input data dir with all required files.")
    parser.add_argument("--img_feat_file", default='/disk2/11811112/Oscar/coco_ir/features.tsv',
                        type=str, required=False,
                        help="The absolute path of the image feature file.")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=False,
                        help="Path to pre-trained model or model type. Required for training.")
    parser.add_argument("--output_dir", default='output/', type=str, required=False,
                        help="The output directory to save checkpoints and test results.")
    parser.add_argument("--loss_type", default='sfmx', type=str,
                        help="Loss function type: supports kl, sfmx.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name.")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name.")
    parser.add_argument("--max_seq_length", default=70, type=int,
                        help="The maximum total input sequence length after tokenization. "
                        "Longer sequences are truncated, shorter ones padded. "
                        "This number was calculated on the COCO dataset; if object "
                        "detection labels are added, the suggested length is 70.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_test", action='store_true', help="Whether to run inference.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run performance evaluation. Do not set this when "
                        "running inference on a dataset without ground-truth labels.")
    parser.add_argument("--test_split", default='test', type=str, help="Data split name.")
    parser.add_argument("--eval_img_keys_file", default='', type=str,
                        help="Image key tsv to select a subset of images for evaluation. "
                        "This is useful in 5-fold evaluation; the topn index file is not "
                        "needed in this case.")
    parser.add_argument("--eval_caption_index_file", default='', type=str,
                        help="Index of a list of (img_key, cap_idx) for each image. Used to "
                        "re-rank with hard negative samples; useful on the validation set "
                        "to monitor performance during training.")
    parser.add_argument("--cross_image_eval", action='store_true',
                        help="Perform cross-image inference, i.e. pair each image with all "
                        "texts from the other images.")
    parser.add_argument("--add_od_labels", default=False, action='store_true',
                        help="Whether to add object detection labels or not.")
    parser.add_argument("--od_label_type", default='vg', type=str,
                        help="Label type: supports vg, gt, oid.")
    parser.add_argument("--att_mask_type", default='CLR', type=str,
                        help="Attention mask type: supports ['CL', 'CR', 'LR', 'CLR']. "
                        "C: caption, L: labels, R: image regions. CLR is full attention "
                        "by default; CL means attention between caption and labels only. "
                        "Note that CLR is also the default concat order.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--drop_out", default=0.1, type=float, help="Dropout in BERT.")
    parser.add_argument("--max_img_seq_length", default=50, type=int,
                        help="The maximum total input image sequence length.")
    parser.add_argument("--img_feature_dim", default=2054, type=int,
                        help="The image feature dimension.")
    parser.add_argument("--img_feature_type", default='frcnn', type=str,
                        help="Image feature type.")
    parser.add_argument("--use_img_layernorm", type=int, default=1,
                        help="Normalize image features with BertLayerNorm.")
    parser.add_argument("--img_layer_norm_eps", default=1e-12, type=float,
                        help="The eps in the image feature layernorm layer.")
    parser.add_argument("--per_gpu_train_batch_size", default=2, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=2, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--output_mode", default='classification', type=str,
                        help="Output mode: supports classification or regression.")
    parser.add_argument("--num_labels", default=2, type=int,
                        help="num_labels is 2 for classification and 1 for regression.")
    parser.add_argument("--num_captions_per_img_train", default=5, type=int,
                        help="Number of positive matched captions for each training image.")
    parser.add_argument("--num_captions_per_img_val", default=5, type=int,
                        help="Number of captions for each testing image.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before backward.")
    parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial lr.")
    parser.add_argument("--weight_decay", default=0.05, type=float, help="Weight decay.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup steps.")
    parser.add_argument("--scheduler", default='linear', type=str, help="constant or linear.")
    parser.add_argument("--num_workers", default=4, type=int, help="Workers in dataloader.")
    parser.add_argument("--num_train_epochs", default=20, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="Total number of training steps. Overrides num_train_epochs.")
    parser.add_argument('--logging_steps', type=int, default=20, help="Log every X steps.")
    parser.add_argument('--save_steps', type=int, default=-1,
                        help="Save checkpoint every X steps. Will also perform evaluation.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each save_steps.")
    parser.add_argument("--eval_model_dir", type=str, default='./output0320/checkpoint-29-66390/',
                        help="Model directory for evaluation.")
    parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA.")
    parser.add_argument('--seed', type=int, default=88, help="Random seed for initialization.")
    args = parser.parse_args()

    global logger
    mkdir(args.output_dir)
    logger = setup_logger("vlpretrain", args.output_dir, 0)
    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args.seed, args.n_gpu)
    logger.warning("Device: %s, n_gpu: %s", args.device, args.n_gpu)
    logger.info('output_mode: {}, #Labels: {}'.format(args.output_mode, args.num_labels))

    config_class, tokenizer_class = BertConfig, BertTokenizer
    model_class = ImageBertForSequenceClassification
    checkpoint = args.eval_model_dir
    assert op.isdir(checkpoint)
    config = config_class.from_pretrained(checkpoint)
    tokenizer = tokenizer_class.from_pretrained(checkpoint)
    model = model_class.from_pretrained(checkpoint, config=config)
    model.to(args.device)

    # inference and evaluation
    if args.do_test or args.do_eval:
        args = restore_training_settings(args)
        test_dataset = RetrievalDataset(tokenizer, args, args.test_split, is_train=False)
        logger.info("Evaluate the following checkpoint: %s", checkpoint)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        # get the intermediate data; unwrap DataParallel only when it was applied
        model_to_run = model.module if hasattr(model, 'module') else model
        result = get_intermediate_data(args, model_to_run, test_dataset)
        # test_result = test(args, model, test_dataset)
        mediate_file = "mediate_file.txt"
        torch.save(str(result), mediate_file)
        logger.info("Prediction results saved to {}.".format(mediate_file))
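# Example invocation for inference + evaluation (a sketch: the script filename
# `run_retrieval.py` and the paths are assumptions, not taken from this file):
#
#   python run_retrieval.py --do_test --do_eval \
#       --data_dir ./datasets/coco_ir/ \
#       --img_feat_file /disk2/11811112/Oscar/coco_ir/features.tsv \
#       --eval_model_dir ./output0320/checkpoint-29-66390/ \
#       --per_gpu_eval_batch_size 2 --add_od_labels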
def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=False,
                        help="The input data dir. Should contain the .yaml files for the task.")
    parser.add_argument("--dataset_file", default=None, type=str, required=True,
                        help="The training dataset yaml file.")
    parser.add_argument("--extra_dataset_file", default=None, type=str, required=False,
                        help="The extra training dataset yaml file.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-base-multilingual, "
                        "bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")
    # image chunks
    parser.add_argument("--chunk_start_id", default=-1, type=int, help="Image chunk start ID.")
    parser.add_argument("--chunk_end_id", default=-1, type=int, help="Image chunk end ID.")
    ## Image parameters
    parser.add_argument("--max_img_seq_length", default=50, type=int,
                        help="The maximum total input image sequence length.")
    parser.add_argument("--img_feature_dim", default=2054, type=int,
                        help="The image feature dimension.")
    parser.add_argument("--img_feature_type", default='faster_r-cnn', type=str,
                        help="faster_r-cnn or mask_r-cnn.")
    parser.add_argument("--use_layernorm", action='store_true', help="use_layernorm")
    parser.add_argument("--drop_out", default=0.1, type=float, help="Dropout for BERT.")
    parser.add_argument("--use_b", type=int, default=1, help="use_b")
    parser.add_argument("--textb_sample_mode", type=int, default=0,
                        help="0: sample from both texta&textb, "
                        "1: sample from textb, "
                        "2: sample from QA answers")
    parser.add_argument("--extra_textb_sample_mode", type=int, default=1)
    parser.add_argument("--texta_false_prob", type=float, default=0.0,
                        help="The probability of sampling a wrong texta; "
                        "should be in [0.0, 0.5].")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the "
                        "list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name.")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name.")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where to store the pre-trained models downloaded from s3.")
    parser.add_argument("--max_seq_length", default=35, type=int,
                        help="The maximum total input sequence length after WordPiece "
                        "tokenization. Longer sequences are truncated, shorter ones padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--max_iters", default=2000000, type=int,
                        help="Maximal number of training iterations.")
    parser.add_argument("--train_batch_size", default=1024, type=int,
                        help="Batch size for training.")
    parser.add_argument("--num_workers", default=6, type=int,
                        help="Number of workers for the dataset.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for the Adam optimizer.")
    parser.add_argument("--optim", default='adamw', type=str,
                        help="The optimizer used for Bert, in [adamw, lamb]; default: adamw.")
    parser.add_argument("--max_grad_norm", default=-1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA even when it is available.")
    parser.add_argument("--on_memory", action='store_true',
                        help="Whether to load train samples into memory or use the disk.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Whether to lower-case the input text. True for uncased "
                        "models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on GPUs.")
    parser.add_argument('--seed', type=int, default=42, help="Random seed for initialization.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a "
                        "backward/update pass.")
    parser.add_argument("--from_scratch", action='store_true', help="Train from scratch.")
    parser.add_argument("--use_img_layernorm", type=int, default=0,
                        help="Normalize image features with BertLayerNorm.")
    parser.add_argument("--img_layer_norm_eps", default=1e-12, type=float,
                        help="The eps in the image feature layernorm layer.")
    # distributed
    parser.add_argument('--gpu_ids', type=str, default='-1')
    parser.add_argument("--mask_loss_for_unmatched", type=int, default=1,
                        help="Masked language model loss for unmatched triplets.")
    parser.add_argument("--extra_loss_weight", type=float, default=0.0,
                        help="The loss weight for the extra train data batch "
                        "(should be in [0, 1]).")
    parser.add_argument("--use_gtlabels", type=int, default=1,
                        help="Use ground-truth labels for text b or not.")
    # logging
    parser.add_argument('--ckpt_period', type=int, default=10000,
                        help="Period for saving checkpoints.")
    parser.add_argument('--log_period', type=int, default=100,
                        help="Period for saving logging info.")
    args = parser.parse_args()

    if args.gpu_ids != '-1':
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids

    args.num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = args.num_gpus > 1

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        logger.info("Output directory already exists.")

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend, which takes care of
        # synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method="env://")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. "
            "Please set `do_train`.")

    if not os.path.exists(args.output_dir):
        mkdir(args.output_dir)

    last_checkpoint_dir = None
    arguments = {"iteration": 0}
    if os.path.exists(args.output_dir):
        save_file = os.path.join(args.output_dir, "last_checkpoint")
        try:
            with open(save_file, "r") as f:
                last_saved = f.read()
                last_saved = last_saved.strip()
        except IOError:
            # if the file doesn't exist, it may have just been
            # deleted by a separate process
            last_saved = ""
        if last_saved:
            # last_saved is in the form of checkpoint-00001 or
            # checkpoint-00001/pytorch_model.bin
            folder_name = os.path.splitext(last_saved.split('/')[0])[0]
            last_checkpoint_dir = os.path.join(args.output_dir, folder_name)
            arguments["iteration"] = int(folder_name.split('-')[-1])
            assert os.path.isfile(os.path.join(last_checkpoint_dir, WEIGHTS_NAME)), \
                "last_checkpoint detected, but file not found!"

    # model first
    if get_rank() != 0:
        torch.distributed.barrier()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.bert_model]
    if last_checkpoint_dir is not None:  # recovery
        args.model_name_or_path = last_checkpoint_dir
        logger.info(" -> Recovering model from {}".format(last_checkpoint_dir))

    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
    )
    config.img_layer_norm_eps = args.img_layer_norm_eps
    config.use_img_layernorm = args.use_img_layernorm

    # discrete code
    config.img_feature_dim = args.img_feature_dim
    config.img_feature_type = args.img_feature_type
    config.hidden_dropout_prob = args.drop_out

    if args.texta_false_prob < 0.5 and (args.texta_false_prob > 0 or not args.use_b):
        args.num_contrast_classes = 3
    else:
        args.num_contrast_classes = 2
    config.num_contrast_classes = args.num_contrast_classes

    # Prepare model; retry to tolerate transient download/IO failures
    # model = BertForPreTraining.from_pretrained(args.bert_model)
    load_num = 0
    while load_num < 10:
        try:
            model = BertImgForPreTraining.from_pretrained(
                args.model_name_or_path,
                from_tf=bool('.ckpt' in args.model_name_or_path),
                config=config)
            break
        except Exception:
            load_num += 1

    # train from scratch
    if args.from_scratch:
        if last_checkpoint_dir is None:
            logger.info("Training from scratch ...")
            model.apply(model.init_weights)
    total_params = sum(p.numel() for p in model.parameters())
    logger.info('Total Parameters: {}'.format(total_params))

    for key, val in vars(config).items():
        setattr(args, key, val)

    if get_rank() == 0 and args.local_rank != -1:
        torch.distributed.barrier()

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    tb_log_dir = os.path.join(args.output_dir, 'train_logs')
    meters = TensorboardLogger(log_dir=tb_log_dir, delimiter="  ")

    # Prepare optimizer: no weight decay for biases and LayerNorm parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=args.max_iters)

    if arguments['iteration'] > 0 and os.path.isfile(
            os.path.join(last_checkpoint_dir, 'optimizer.pth')):  # recovery
        logger.info("Load BERT optimizer from {}".format(last_checkpoint_dir))
        optimizer_to_load = torch.load(
            os.path.join(last_checkpoint_dir, 'optimizer.pth'),
            map_location=torch.device("cpu"))
        optimizer.load_state_dict(optimizer_to_load.pop("optimizer"))
        scheduler.load_state_dict(optimizer_to_load.pop("scheduler"))

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=True)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # train_examples = None
    train_dataloaders = make_data_loader(args, is_distributed=args.distributed,
                                         arguments=arguments)

    if isinstance(train_dataloaders, list):
        train_dataloader = train_dataloaders[0]
    else:
        train_dataloader = train_dataloaders
    train_dataloader_extra = [None] * len(train_dataloader)
    if isinstance(train_dataloaders, list) and len(train_dataloaders) > 1:
        logger.info("Having two train dataloaders!")
        train_dataloader_extra = train_dataloaders[1]
    tokenizer = train_dataloader.dataset.tokenizer

    # torch.backends.cudnn.benchmark = True

    max_iter = len(train_dataloader)
    start_iter = arguments["iteration"]
    logger.info("***** Running training *****")
    logger.info("  Num examples = {}".format(len(train_dataloader.dataset)))
    logger.info("  Instantaneous batch size = %d",
                args.train_batch_size // args.gradient_accumulation_steps)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size)
    logger.info("  Gradient accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d",
                max_iter // args.gradient_accumulation_steps)

    log_json = {}

    model.train()
    model.zero_grad()

    clock_started = False
    # Every args.ckpt_period, report train_score and save the model
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, (batch, batch_extra) in enumerate(
            zip(train_dataloader, train_dataloader_extra), start_iter):
        if not clock_started:
            start_training_time = time.time()
            end = time.time()
            clock_started = True

        def data_process(mini_batch):
            # unpack a mini-batch; qa_inds and is_img_match are read here
            # but not returned
            images, targets, qa_inds = mini_batch[0], mini_batch[1], mini_batch[2]
            targets_transposed = list(zip(*targets))
            input_ids = torch.stack(targets_transposed[0]).to(
                args.device, non_blocking=True)
            input_mask = torch.stack(targets_transposed[1]).to(
                args.device, non_blocking=True)
            segment_ids = torch.stack(targets_transposed[2]).to(
                args.device, non_blocking=True)
            lm_label_ids = torch.stack(targets_transposed[3]).to(
                args.device, non_blocking=True)
            is_next = torch.stack(targets_transposed[4]).to(
                args.device, non_blocking=True)
            is_img_match = torch.stack(targets_transposed[5]).to(
                args.device, non_blocking=True)
            return images, input_ids, input_mask, segment_ids, lm_label_ids, is_next

        images1, input_ids1, input_mask1, segment_ids1, lm_label_ids1, is_next1 \
            = data_process(batch)
        if batch_extra is not None:
            images2, input_ids2, input_mask2, segment_ids2, lm_label_ids2, is_next2 \
                = data_process(batch_extra)

        data_time = time.time() - end

        def forward_backward(images, input_ids, input_mask, segment_ids,
                             lm_label_ids, is_next, loss_weight=1.0):
            # features as input
            image_features = torch.stack(images).to(args.device, non_blocking=True)
            outputs = model(input_ids, segment_ids, input_mask, lm_label_ids,
                            is_next, img_feats=image_features)
            loss = loss_weight * outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            return loss.item(), input_ids.size(0)

        start1 = time.time()
        loss1, nb_tr_example1 = forward_backward(
            images1, input_ids1, input_mask1, segment_ids1, lm_label_ids1,
            is_next1, loss_weight=1.0 - args.extra_loss_weight)
        tr_loss += loss1
        nb_tr_examples += nb_tr_example1
        compute_time1 = time.time() - start1

        loss2, nb_tr_example2 = 0.0, 0
        compute_time2 = 0.0
        if batch_extra is not None:
            start2 = time.time()
            loss2, nb_tr_example2 = forward_backward(
                images2, input_ids2, input_mask2, segment_ids2, lm_label_ids2,
                is_next2, loss_weight=args.extra_loss_weight)
            tr_loss += loss2
            nb_tr_examples += nb_tr_example2
            compute_time2 = time.time() - start2

        nb_tr_steps += 1
        arguments["iteration"] = step + 1

        if (step + 1) % args.gradient_accumulation_steps == 0:
            # do gradient clipping
            if args.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            # do the optimization steps
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            optimizer.zero_grad()

            # measure elapsed time
            batch_time = time.time() - end
            end = time.time()
            metrics_to_log = {
                'time_info': {'compute': batch_time, 'data': data_time,
                              'compute1': compute_time1, 'compute2': compute_time2},
                'batch_metrics': {'loss': loss1 + loss2}
            }
            params_to_log = {'params': {'bert_lr': optimizer.param_groups[0]["lr"]}}
            meters.update_metrics(metrics_to_log)
            meters.update_params(params_to_log)

            if args.log_period > 0 and (step + 1) % args.log_period == 0:
                avg_time = meters.meters['time_info']['compute'].global_avg
                eta_seconds = avg_time * (max_iter - step - 1)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                logger.info(
                    meters.delimiter.join([
                        "eta: {eta}",
                        "iter: {iter}",
                        "max mem: {memory:.0f}",
                    ]).format(
                        eta=eta_string,
                        iter=step + 1,
                        memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                    ) + "\n    " + meters.get_logs(step + 1))

        if (step + 1) == max_iter or (step + 1) % args.ckpt_period == 0:
            # Save a trained model
            log_json[step + 1] = tr_loss
            train_metrics_total = torch.Tensor(
                [tr_loss, nb_tr_examples, nb_tr_steps]).to(args.device)
            # all_reduce requires an initialized process group; skip it in
            # single-process runs
            if args.distributed:
                torch.distributed.all_reduce(train_metrics_total)
            # reset metrics
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0

            if get_rank() == 0:
                # report metrics
                train_score_gathered = train_metrics_total[0] / train_metrics_total[2]
                logger.info("PROGRESS: {}%".format(
                    round(100 * (step + 1) / max_iter, 4)))
                logger.info("EVALERR: {}%".format(train_score_gathered))
                meters.update_metrics({
                    'epoch_metrics': {'ex_cnt': train_metrics_total[1],
                                      'loss': train_score_gathered}
                })
                with open(os.path.join(args.output_dir, 'loss_logs.json'), 'w') as fp:
                    json.dump(log_json, fp)

                # save checkpoint
                output_dir = os.path.join(args.output_dir,
                                          'checkpoint-{:07d}'.format(step + 1))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                # Take care of distributed/parallel training
                model_to_save = model.module if hasattr(model, 'module') else model
                optimizer_to_save = {
                    "optimizer": optimizer.state_dict(),
                    "scheduler": scheduler.state_dict()
                }

                save_num = 0
                while save_num < 10:
                    try:
                        model_to_save.save_pretrained(output_dir)
                        torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                        tokenizer.save_pretrained(output_dir)
                        torch.save(optimizer_to_save,
                                   os.path.join(output_dir, 'optimizer.pth'))
                        save_file = os.path.join(args.output_dir, "last_checkpoint")
                        with open(save_file, "w") as f:
                            f.write('checkpoint-{:07d}/pytorch_model.bin'.format(step + 1))
                        break
                    except Exception:
                        save_num += 1
                logger.info("Saving model checkpoint {0} to {1}".format(
                    step + 1, output_dir))

    if clock_started:
        total_training_time = time.time() - start_training_time
    else:
        total_training_time = 0.0
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    # close the tb logger
    meters.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default='datasets/coco_caption', type=str, required=False,
                        help="The input data dir with all required files.")
    parser.add_argument("--train_yaml", default='train.yaml', type=str, required=False,
                        help="yaml file for training.")
    parser.add_argument("--test_yaml", default='test.yaml', type=str, required=False,
                        help="yaml file for testing.")
    parser.add_argument("--val_yaml", default='val.yaml', type=str, required=False,
                        help="yaml file used for validation during training.")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=False,
                        help="Path to pre-trained model or model type.")
    parser.add_argument("--output_dir", default='output/', type=str, required=False,
                        help="The output directory to save checkpoints and test results.")
    parser.add_argument("--loss_type", default='sfmx', type=str,
                        help="Loss function type: supports kl, x2, sfmx.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name.")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name.")
    parser.add_argument("--max_seq_length", default=70, type=int,
                        help="The maximum total input sequence length after tokenization. "
                        "Longer sequences are truncated, shorter ones padded.")
    parser.add_argument("--max_seq_a_length", default=40, type=int,
                        help="The maximum sequence length for captions.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_test", action='store_true', help="Whether to run inference.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run evaluation.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--mask_prob", default=0.15, type=float,
                        help="Probability of masking input sentence tokens during training.")
    parser.add_argument("--max_masked_tokens", type=int, default=3,
                        help="The max number of masked tokens per sentence.")
    parser.add_argument("--add_od_labels", default=False, action='store_true',
                        help="Whether to add object detection labels or not.")
    parser.add_argument("--disable_img_features", default=False, action='store_true',
                        help="Whether to disable image features during finetuning or not.")
    parser.add_argument('--keep_top_percentage_tag_conf_threshold', type=float, default=0.3,
                        help="Confidence threshold k for keep_top_percentage_tag.")
    parser.add_argument('--keep_top_percentage_tag', type=float, default=1,
                        help="Keep this percentage of input features at inference time, "
                        "given confidence >= k.")
    parser.add_argument("--drop_out", default=0.1, type=float, help="Dropout in BERT.")
    parser.add_argument("--max_img_seq_length", default=50, type=int,
                        help="The maximum total input image sequence length.")
    parser.add_argument("--img_feature_dim", default=2054, type=int,
                        help="The image feature dimension.")
    parser.add_argument("--img_feature_type", default='frcnn', type=str,
                        help="Image feature type.")
    parser.add_argument("--per_gpu_train_batch_size", default=64, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=64, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--output_mode", default='classification', type=str,
                        help="Output mode: supports classification or regression.")
    parser.add_argument("--num_labels", default=2, type=int,
                        help="num_labels is 2 for classification and 1 for regression.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before backward.")
    parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial lr.")
    parser.add_argument("--weight_decay", default=0.05, type=float, help="Weight decay.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup steps.")
    parser.add_argument("--scheduler", default='linear', type=str, help="constant or linear.")
    parser.add_argument("--num_workers", default=4, type=int, help="Workers in dataloader.")
    parser.add_argument("--num_train_epochs", default=40, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="Total number of training steps. Overrides num_train_epochs.")
    parser.add_argument('--logging_steps', type=int, default=20, help="Log every X steps.")
    parser.add_argument('--save_steps', type=int, default=-1,
                        help="Save checkpoint every X steps. Will also perform evaluation.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each save_steps.")
    parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA.")
    parser.add_argument('--seed', type=int, default=88, help="Random seed for initialization.")
    parser.add_argument('--scst', action='store_true', help="Self-critical sequence training.")
    # for generation
    parser.add_argument("--eval_model_dir", type=str, default='',
                        help="Model directory for evaluation.")
    parser.add_argument('--max_gen_length', type=int, default=20,
                        help="Max length of generated sentences.")
    parser.add_argument('--output_hidden_states', action='store_true',
                        help="Turn on for fast decoding.")
    parser.add_argument('--num_return_sequences', type=int, default=1,
                        help="Repeating times per image.")
    parser.add_argument('--num_beams', type=int, default=5, help="Beam search width.")
    parser.add_argument('--num_keep_best', type=int, default=1,
                        help="Number of hypotheses to keep in beam search.")
    parser.add_argument('--temperature', type=float, default=1,
                        help="Temperature in softmax for sampling.")
    parser.add_argument('--top_k', type=int, default=0,
                        help="Filter distribution for sampling.")
    parser.add_argument('--top_p', type=float, default=1,
                        help="Filter distribution for sampling.")
    parser.add_argument('--repetition_penalty', type=int, default=1,
                        help="Repetition penalty from the CTRL paper "
                        "(https://arxiv.org/abs/1909.05858).")
    parser.add_argument('--length_penalty', type=int, default=1,
                        help="Beam search length penalty.")
    # for Constrained Beam Search
    parser.add_argument('--use_cbs', action='store_true',
                        help="Use constrained beam search for decoding.")
    parser.add_argument('--min_constraints_to_satisfy', type=int, default=2,
                        help="Minimum number of constraints to satisfy.")
    args = parser.parse_args()

    global logger
    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()

    output_dir = args.output_dir
    mkdir(output_dir)
    logger = setup_logger("vlpretrain", output_dir, 0)
    logger.warning("Device: %s, n_gpu: %s", args.device, args.n_gpu)
    set_seed(args.seed, args.n_gpu)

    # Load pretrained model and tokenizer
    config_class, model_class, tokenizer_class = \
        BertConfig, BertForImageCaptioning, BertTokenizer
    if args.do_train:
        assert args.model_name_or_path is not None
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=args.num_labels, finetuning_task='image_captioning')
        if args.scst:
            # avoid using too much memory
            config.output_hidden_states = True
        tokenizer = tokenizer_class.from_pretrained(
            args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        config.img_feature_dim = args.img_feature_dim
        config.img_feature_type = args.img_feature_type
        config.hidden_dropout_prob = args.drop_out
        config.loss_type = args.loss_type
        model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
    else:
        checkpoint = args.eval_model_dir
        assert op.isdir(checkpoint)
        config = config_class.from_pretrained(checkpoint)
        config.output_hidden_states = args.output_hidden_states
        tokenizer = tokenizer_class.from_pretrained(checkpoint)
        logger.info("Evaluate the following checkpoint: %s", checkpoint)
        model = model_class.from_pretrained(checkpoint, config=config)

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        train_dataset = build_dataset(op.join(args.data_dir, args.train_yaml),
                                      tokenizer, args)
        val_dataset = build_dataset(op.join(args.data_dir, args.val_yaml),
                                    tokenizer, args, is_train=False)
        global_step, avg_loss = train(args, train_dataset, val_dataset, model, tokenizer)
        logger.info("Training done: total_step = %s, avg loss = %s",
                    global_step, avg_loss)

    # inference and evaluation
    if args.do_test or args.do_eval:
        args = restore_training_settings(args)
        test_dataset = build_dataset(op.join(args.data_dir, args.test_yaml),
                                     tokenizer, args, is_train=False)
        if args.do_train:
            # `checkpoint` is otherwise only set on the evaluation branch above;
            # fall back to eval_model_dir to avoid a NameError
            checkpoint = args.eval_model_dir
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        if not args.do_eval:
            predict_file = get_predict_file(checkpoint, test_dataset.yaml_file, args)
            test(args, test_dataset, model, tokenizer, predict_file)
            logger.info("Prediction results saved to: {}".format(predict_file))
        else:
            evaluate_file = evaluate(args, test_dataset, model, tokenizer, checkpoint)
            logger.info("Evaluation results saved to: {}".format(evaluate_file))
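# Example finetuning invocation (a sketch; the script filename `run_captioning.py`
# and the pretrained-model path are assumptions, not taken from this file):
#
#   python run_captioning.py --do_train \
#       --data_dir datasets/coco_caption --train_yaml train.yaml --val_yaml val.yaml \
#       --model_name_or_path pretrained/base-vg-labels \
#       --per_gpu_train_batch_size 64 --learning_rate 3e-5 --num_train_epochs 40 \
#       --add_od_labels --output_dir output/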
def main():
    args = get_args()
    global logger
    # global logger, writer

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend, which takes care of
        # synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    if args.do_train:
        mkdir(args.output_dir)
        t = datetime.today()
        args.output_dir = op.join(
            args.output_dir, f"{t.month}_{t.day}_{t.hour}_{t.minute}_{t.second}")
        if not op.exists(args.output_dir):
            mkdir(args.output_dir)
        logger = setup_logger("vlpretrain", args.output_dir, args.local_rank)
    else:
        logger = setup_logger("vlpretrain", os.path.dirname(args.eval_model_dir),
                              args.local_rank, 'test_log.txt')
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))
    set_seed(args.seed, args.n_gpu)
    # writer = SummaryWriter(log_dir=args.output_dir, flush_secs=60)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training downloads
        # the model & vocab
        torch.distributed.barrier()
    config_class, model_class, tokenizer_class = \
        BertConfig, BertForImageCaptioning, BertTokenizer
    if args.do_train:
        assert args.model_name_or_path is not None
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=args.num_labels, finetuning_task='image_captioning')
        if args.scst:
            # avoid using too much memory
            config.output_hidden_states = True
        tokenizer = tokenizer_class.from_pretrained(
            args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        config.img_feature_dim = args.img_feature_dim
        config.img_feature_type = args.img_feature_type
        config.hidden_dropout_prob = args.drop_out
        config.loss_type = args.loss_type
        model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
    else:
        assert op.isdir(args.eval_model_dir)
        config = config_class.from_pretrained(args.eval_model_dir)
        config.output_hidden_states = args.output_hidden_states
        tokenizer = tokenizer_class.from_pretrained(args.eval_model_dir)
        logger.info("Evaluate the following checkpoint: %s", args.eval_model_dir)
        model = model_class.from_pretrained(args.eval_model_dir, config=config)
    if args.local_rank == 0:
        # Make sure only the first process in distributed training downloads
        # the model & vocab
        torch.distributed.barrier()

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        train_dataset = build_dataset('train', tokenizer, args)
        val_dataset = build_dataset('dev', tokenizer, args, is_train=False)
        global_step, avg_loss = train(args, train_dataset, val_dataset, model, tokenizer)
        logger.info("Training done: total_step = %s, avg loss = %s",
                    global_step, avg_loss)

    # inference and evaluation (rank 0 only)
    if args.do_test and args.local_rank in [-1, 0]:
        args = restore_training_settings(args)
        test_dataset = build_dataset('test', tokenizer, args, is_train=False)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        predict_file = get_predict_file('test', args.eval_model_dir, args)
        test(args, test_dataset, model, tokenizer, predict_file)
        logger.info("Prediction results saved to: {}".format(predict_file))

    if args.do_eval and args.local_rank in [-1, 0]:
        args = restore_training_settings(args)
        dev_dataset = build_dataset('dev', tokenizer, args, is_train=False)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        predict_file = get_predict_file('dev', args.eval_model_dir, args)
        test(args, dev_dataset, model, tokenizer, predict_file)
        logger.info("Prediction results saved to: {}".format(predict_file))
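# Example distributed finetuning launch (a sketch; the script filename and the
# flags exposed by get_args() are assumptions, not taken from this file;
# --local_rank is injected per process by torch.distributed.launch):
#
#   python -m torch.distributed.launch --nproc_per_node=2 run_captioning_dist.py \
#       --do_train --model_name_or_path pretrained/base-vg-labels \
#       --output_dir output/
#
# A timestamped subdirectory (month_day_hour_minute_second) is created under
# output_dir for each training run.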
def main():
    args = get_args()
    global logger
    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()

    mkdir(args.output_dir)
    logger = setup_logger("vlpretrain", args.output_dir, 0)
    logger.warning("Device: %s, n_gpu: %s", args.device, args.n_gpu)
    set_seed(args.seed, args.n_gpu)

    # Load pretrained model and tokenizer
    config_class, model_class, tokenizer_class = \
        BertConfig, BertForImageCaptioning, BertTokenizer
    if args.do_train:
        assert args.model_name_or_path is not None
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=args.num_labels, finetuning_task='image_captioning')
        if args.scst:
            # avoid using too much memory
            config.output_hidden_states = True
        tokenizer = tokenizer_class.from_pretrained(
            args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        config.img_feature_dim = args.img_feature_dim
        config.img_feature_type = args.img_feature_type
        config.hidden_dropout_prob = args.drop_out
        config.loss_type = args.loss_type
        model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
    else:
        checkpoint = args.eval_model_dir
        assert op.isdir(checkpoint)
        config = config_class.from_pretrained(checkpoint)
        config.output_hidden_states = args.output_hidden_states
        tokenizer = tokenizer_class.from_pretrained(checkpoint)
        logger.info("Evaluate the following checkpoint: %s", checkpoint)
        model = model_class.from_pretrained(checkpoint, config=config)

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        train_dataset = build_dataset(op.join(args.data_dir, args.train_yaml),
                                      tokenizer, args)
        val_dataset = build_dataset(op.join(args.data_dir, args.val_yaml),
                                    tokenizer, args, is_train=False)
        global_step, avg_loss = train(args, train_dataset, val_dataset, model, tokenizer)
        logger.info("Training done: total_step = %s, avg loss = %s",
                    global_step, avg_loss)

    # inference and evaluation
    if args.do_test or args.do_eval:
        args = restore_training_settings(args)
        test_dataset = build_dataset(op.join(args.data_dir, args.test_yaml),
                                     tokenizer, args, is_train=False)
        if args.do_train:
            # `checkpoint` is otherwise only set on the evaluation branch above;
            # fall back to eval_model_dir to avoid a NameError
            checkpoint = args.eval_model_dir
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        if not args.do_eval:
            predict_file = get_predict_file(checkpoint, test_dataset.yaml_file, args)
            test(args, test_dataset, model, tokenizer, predict_file)
            logger.info("Prediction results saved to: {}".format(predict_file))
        else:
            evaluate_file = evaluate(args, test_dataset, model, tokenizer, checkpoint)
            logger.info("Evaluation results saved to: {}".format(evaluate_file))
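# Example caption generation + evaluation (a sketch; the script filename, the
# checkpoint path, and the flags exposed by get_args() are assumptions, not
# taken from this file):
#
#   python run_captioning_eval.py --do_test --do_eval \
#       --data_dir datasets/coco_caption --test_yaml test.yaml \
#       --eval_model_dir output/checkpoint-best/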