def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default='./datasets/coco_ir/', type=str, required=False,
                        help="The input data dir with all required files.")
    parser.add_argument("--img_feat_file", default='/disk2/11811112/Oscar/coco_ir/features.tsv',
                        type=str, required=False,
                        help="The absolute path of the image feature file.")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=False,
                        help="Path to pre-trained model or model type. Required for training.")
    parser.add_argument("--output_dir", default='output/', type=str, required=False,
                        help="The output directory to save checkpoints and test results.")
    parser.add_argument("--loss_type", default='sfmx', type=str,
                        help="Loss function type: supports kl, sfmx.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name.")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name.")
    parser.add_argument("--max_seq_length", default=70, type=int,
                        help="The maximum total input sequence length after tokenization. "
                             "Sequences longer than this will be truncated; "
                             "sequences shorter will be padded. "
                             "This number is calculated on the COCO dataset. "
                             "If object detection labels are added, the suggested length is 70.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test", action='store_true',
                        help="Whether to run inference.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run performance evaluation. "
                             "Do not activate when running inference on a dataset "
                             "without ground-truth labels.")
    parser.add_argument("--test_split", default='test', type=str,
                        help="Data split name.")
    parser.add_argument("--eval_img_keys_file", default='', type=str,
                        help="Image key tsv to select a subset of images for evaluation. "
                             "This is useful in 5-fold evaluation. The topn index file is not "
                             "needed in this case.")
    parser.add_argument("--eval_caption_index_file", default='', type=str,
                        help="Index of a list of (img_key, cap_idx) for each image. "
                             "This is used to perform re-ranking with hard negative samples; "
                             "useful on the validation set to monitor performance during training.")
    parser.add_argument("--cross_image_eval", action='store_true',
                        help="Perform cross-image inference, i.e. pair each image with all "
                             "texts from other images.")
    parser.add_argument("--add_od_labels", default=False, action='store_true',
                        help="Whether to add object detection labels or not.")
    parser.add_argument("--od_label_type", default='vg', type=str,
                        help="Label type: supports vg, gt, oid.")
    parser.add_argument("--att_mask_type", default='CLR', type=str,
                        help="Attention mask type: supports ['CL', 'CR', 'LR', 'CLR']. "
                             "C: caption, L: labels, R: image regions; CLR is full attention by default. "
                             "CL means attention between caption and labels. "
                             "Note the order CLR, which is the default concatenation order.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--drop_out", default=0.1, type=float,
                        help="Dropout in BERT.")
    parser.add_argument("--max_img_seq_length", default=50, type=int,
                        help="The maximum total input image sequence length.")
    parser.add_argument("--img_feature_dim", default=2054, type=int,
                        help="The image feature dimension.")
    parser.add_argument("--img_feature_type", default='frcnn', type=str,
                        help="Image feature type.")
    parser.add_argument("--use_img_layernorm", type=int, default=1,
                        help="Normalize image features with BertLayerNorm.")
    parser.add_argument("--img_layer_norm_eps", default=1e-12, type=float,
                        help="The eps in the image feature layernorm layer.")
    parser.add_argument("--per_gpu_train_batch_size", default=2, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=2, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--output_mode", default='classification', type=str,
                        help="Output mode: supports classification or regression.")
    parser.add_argument("--num_labels", default=2, type=int,
                        help="num_labels is 2 for classification and 1 for regression.")
    parser.add_argument("--num_captions_per_img_train", default=5, type=int,
                        help="Number of positive matched captions for each training image.")
    parser.add_argument("--num_captions_per_img_val", default=5, type=int,
                        help="Number of captions for each testing image.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before backward.")
    parser.add_argument("--learning_rate", default=2e-5, type=float,
                        help="The initial learning rate.")
    parser.add_argument("--weight_decay", default=0.05, type=float,
                        help="Weight decay.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup steps.")
    parser.add_argument("--scheduler", default='linear', type=str,
                        help="constant or linear.")
    parser.add_argument("--num_workers", default=4, type=int,
                        help="Workers in dataloader.")
    parser.add_argument("--num_train_epochs", default=20, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="Total number of training steps. Overrides num_train_epochs.")
    parser.add_argument('--logging_steps', type=int, default=20,
                        help="Log every X steps.")
    parser.add_argument('--save_steps', type=int, default=-1,
                        help="Save checkpoint every X steps. Will also perform evaluation.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each save_steps.")
    parser.add_argument("--eval_model_dir", type=str, default='./output0320/checkpoint-29-66390/',
                        help="Model directory for evaluation.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA.")
    parser.add_argument('--seed', type=int, default=88,
                        help="Random seed for initialization.")
    args = parser.parse_args()

    global logger
    mkdir(args.output_dir)
    logger = setup_logger("vlpretrain", args.output_dir, 0)
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args.seed, args.n_gpu)
    logger.warning("Device: %s, n_gpu: %s", args.device, args.n_gpu)
    logger.info('output_mode: {}, #Labels: {}'.format(args.output_mode, args.num_labels))

    config_class, tokenizer_class = BertConfig, BertTokenizer
    model_class = ImageBertForSequenceClassification
    checkpoint = args.eval_model_dir
    assert op.isdir(checkpoint)
    config = config_class.from_pretrained(checkpoint)
    tokenizer = tokenizer_class.from_pretrained(checkpoint)
    model = model_class.from_pretrained(checkpoint, config=config)
    model.to(args.device)

    # inference and evaluation
    if args.do_test or args.do_eval:
        args = restore_training_settings(args)
        test_dataset = RetrievalDataset(tokenizer, args, args.test_split, is_train=False)
        logger.info("Evaluate the following checkpoint: %s", checkpoint)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        # unwrap DataParallel if it was applied, so single-GPU runs also work
        model_to_run = model.module if hasattr(model, 'module') else model
        result = get_intermediate_data(args, model_to_run, test_dataset)  # obtain intermediate data
        # test_result = test(args, model, test_dataset)
        mediate_file = "mediate_file.txt"
        torch.save(str(result), mediate_file)
        logger.info("Prediction results saved to {}.".format(mediate_file))
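
# Example invocation for this retrieval inference/evaluation entry point.
# A hypothetical sketch: the script name "run_retrieval.py" and the paths are
# assumptions, not taken from the repo; the flags match the parser above.
#
#   python run_retrieval.py \
#       --do_test --do_eval \
#       --data_dir ./datasets/coco_ir/ \
#       --img_feat_file ./coco_ir/features.tsv \
#       --eval_model_dir ./output0320/checkpoint-29-66390/ \
#       --test_split test \
#       --add_od_labels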
def main():
    args = get_args()
    global logger
    # global logger, writer

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initialize the distributed backend, which takes care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    if args.do_train:
        mkdir(args.output_dir)
        t = datetime.today()
        args.output_dir = op.join(args.output_dir,
                                  f"{t.month}_{t.day}_{t.hour}_{t.minute}_{t.second}")
        if not op.exists(args.output_dir):
            mkdir(args.output_dir)
        logger = setup_logger("vlpretrain", args.output_dir, args.local_rank)
    else:
        logger = setup_logger("vlpretrain", os.path.dirname(args.eval_model_dir),
                              args.local_rank, 'test_log.txt')

    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))
    set_seed(args.seed, args.n_gpu)
    # writer = SummaryWriter(log_dir=args.output_dir, flush_secs=60)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training downloads model & vocab
        torch.distributed.barrier()
    config_class, model_class, tokenizer_class = BertConfig, BertForImageCaptioning, BertTokenizer
    if args.do_train:
        assert args.model_name_or_path is not None
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=args.num_labels, finetuning_task='image_captioning')
        if args.scst:
            # avoid using too much memory
            config.output_hidden_states = True
        tokenizer = tokenizer_class.from_pretrained(
            args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        config.img_feature_dim = args.img_feature_dim
        config.img_feature_type = args.img_feature_type
        config.hidden_dropout_prob = args.drop_out
        config.loss_type = args.loss_type
        model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=config)
    else:
        assert op.isdir(args.eval_model_dir)
        config = config_class.from_pretrained(args.eval_model_dir)
        config.output_hidden_states = args.output_hidden_states
        tokenizer = tokenizer_class.from_pretrained(args.eval_model_dir)
        logger.info("Evaluate the following checkpoint: %s", args.eval_model_dir)
        model = model_class.from_pretrained(args.eval_model_dir, config=config)
    if args.local_rank == 0:
        # Make sure only the first process in distributed training downloads model & vocab
        torch.distributed.barrier()

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        train_dataset = build_dataset('train', tokenizer, args)
        val_dataset = build_dataset('dev', tokenizer, args, is_train=False)
        global_step, avg_loss = train(args, train_dataset, val_dataset, model, tokenizer)
        logger.info("Training done: total_step = %s, avg loss = %s", global_step, avg_loss)

    # inference and evaluation (only on the main process)
    if args.do_test and args.local_rank in [-1, 0]:
        args = restore_training_settings(args)
        test_dataset = build_dataset('test', tokenizer, args, is_train=False)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        predict_file = get_predict_file('test', args.eval_model_dir, args)
        test(args, test_dataset, model, tokenizer, predict_file)
        logger.info("Prediction results saved to: {}".format(predict_file))

    if args.do_eval and args.local_rank in [-1, 0]:
        args = restore_training_settings(args)
        dev_dataset = build_dataset('dev', tokenizer, args, is_train=False)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        predict_file = get_predict_file('dev', args.eval_model_dir, args)
        test(args, dev_dataset, model, tokenizer, predict_file)
        logger.info("Prediction results saved to: {}".format(predict_file))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default='datasets/coco_caption', type=str, required=False,
                        help="The input data dir with all required files.")
    parser.add_argument("--train_yaml", default='train.yaml', type=str, required=False,
                        help="yaml file for training.")
    parser.add_argument("--test_yaml", default='test.yaml', type=str, required=False,
                        help="yaml file for testing.")
    parser.add_argument("--val_yaml", default='val.yaml', type=str, required=False,
                        help="yaml file used for validation during training.")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=False,
                        help="Path to pre-trained model or model type.")
    parser.add_argument("--output_dir", default='output/', type=str, required=False,
                        help="The output directory to save checkpoints and test results.")
    parser.add_argument("--loss_type", default='sfmx', type=str,
                        help="Loss function type: supports kl, x2, sfmx.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name.")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name.")
    parser.add_argument("--max_seq_length", default=70, type=int,
                        help="The maximum total input sequence length after tokenization. "
                             "Sequences longer than this will be truncated; "
                             "sequences shorter will be padded.")
    parser.add_argument("--max_seq_a_length", default=40, type=int,
                        help="The maximum sequence length for captions.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test", action='store_true',
                        help="Whether to run inference.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run evaluation.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--mask_prob", default=0.15, type=float,
                        help="Probability to mask input sentence tokens during training.")
    parser.add_argument("--max_masked_tokens", type=int, default=3,
                        help="The max number of masked tokens per sentence.")
    parser.add_argument("--add_od_labels", default=False, action='store_true',
                        help="Whether to add object detection labels or not.")
    parser.add_argument("--disable_img_features", default=False, action='store_true',
                        help="Whether to disable image features during fine-tuning or not.")
    parser.add_argument('--keep_top_percentage_tag_conf_threshold', type=float, default=0.3,
                        help="Confidence threshold k for keep_top_percentage_tag.")
    parser.add_argument('--keep_top_percentage_tag', type=float, default=1,
                        help="Keep this percentage of input features at inference time, "
                             "given confidence >= k.")
    parser.add_argument("--drop_out", default=0.1, type=float,
                        help="Dropout in BERT.")
    parser.add_argument("--max_img_seq_length", default=50, type=int,
                        help="The maximum total input image sequence length.")
    parser.add_argument("--img_feature_dim", default=2054, type=int,
                        help="The image feature dimension.")
    parser.add_argument("--img_feature_type", default='frcnn', type=str,
                        help="Image feature type.")
    parser.add_argument("--per_gpu_train_batch_size", default=64, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=64, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--output_mode", default='classification', type=str,
                        help="Output mode: supports classification or regression.")
    parser.add_argument("--num_labels", default=2, type=int,
                        help="num_labels is 2 for classification and 1 for regression.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before backward.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate.")
    parser.add_argument("--weight_decay", default=0.05, type=float,
                        help="Weight decay.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup steps.")
    parser.add_argument("--scheduler", default='linear', type=str,
                        help="constant or linear.")
    parser.add_argument("--num_workers", default=4, type=int,
                        help="Workers in dataloader.")
    parser.add_argument("--num_train_epochs", default=40, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="Total number of training steps. Overrides num_train_epochs.")
    parser.add_argument('--logging_steps', type=int, default=20,
                        help="Log every X steps.")
    parser.add_argument('--save_steps', type=int, default=-1,
                        help="Save checkpoint every X steps. Will also perform evaluation.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each save_steps.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA.")
    parser.add_argument('--seed', type=int, default=88,
                        help="Random seed for initialization.")
    parser.add_argument('--scst', action='store_true',
                        help="Self-critical sequence training.")
    # for generation
    parser.add_argument("--eval_model_dir", type=str, default='',
                        help="Model directory for evaluation.")
    parser.add_argument('--max_gen_length', type=int, default=20,
                        help="Max length of generated sentences.")
    parser.add_argument('--output_hidden_states', action='store_true',
                        help="Turn on for fast decoding.")
    parser.add_argument('--num_return_sequences', type=int, default=1,
                        help="Repeating times per image.")
    parser.add_argument('--num_beams', type=int, default=5,
                        help="Beam search width.")
    parser.add_argument('--num_keep_best', type=int, default=1,
                        help="Number of hypotheses to keep in beam search.")
    parser.add_argument('--temperature', type=float, default=1,
                        help="Temperature in softmax for sampling.")
    parser.add_argument('--top_k', type=int, default=0,
                        help="Filter distribution for sampling.")
    parser.add_argument('--top_p', type=float, default=1,
                        help="Filter distribution for sampling.")
    parser.add_argument('--repetition_penalty', type=int, default=1,
                        help="Repetition penalty from the CTRL paper "
                             "(https://arxiv.org/abs/1909.05858).")
    parser.add_argument('--length_penalty', type=int, default=1,
                        help="Beam search length penalty.")
    # for Constrained Beam Search
    parser.add_argument('--use_cbs', action='store_true',
                        help="Use constrained beam search for decoding.")
    parser.add_argument('--min_constraints_to_satisfy', type=int, default=2,
                        help="Minimum number of constraints to satisfy.")
    args = parser.parse_args()

    global logger
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    output_dir = args.output_dir
    mkdir(output_dir)
    logger = setup_logger("vlpretrain", output_dir, 0)
    logger.warning("Device: %s, n_gpu: %s", args.device, args.n_gpu)
    set_seed(args.seed, args.n_gpu)

    # Load pretrained model and tokenizer
    config_class, model_class, tokenizer_class = BertConfig, BertForImageCaptioning, BertTokenizer
    if args.do_train:
        assert args.model_name_or_path is not None
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=args.num_labels, finetuning_task='image_captioning')
        if args.scst:
            # avoid using too much memory
            config.output_hidden_states = True
        tokenizer = tokenizer_class.from_pretrained(
            args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        config.img_feature_dim = args.img_feature_dim
        config.img_feature_type = args.img_feature_type
        config.hidden_dropout_prob = args.drop_out
        config.loss_type = args.loss_type
        model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=config)
    else:
        checkpoint = args.eval_model_dir
        assert op.isdir(checkpoint)
        config = config_class.from_pretrained(checkpoint)
        config.output_hidden_states = args.output_hidden_states
        tokenizer = tokenizer_class.from_pretrained(checkpoint)
        logger.info("Evaluate the following checkpoint: %s", checkpoint)
        model = model_class.from_pretrained(checkpoint, config=config)

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        train_dataset = build_dataset(op.join(args.data_dir, args.train_yaml), tokenizer, args)
        val_dataset = build_dataset(op.join(args.data_dir, args.val_yaml),
                                    tokenizer, args, is_train=False)
        global_step, avg_loss = train(args, train_dataset, val_dataset, model, tokenizer)
        logger.info("Training done: total_step = %s, avg loss = %s", global_step, avg_loss)

    # inference and evaluation
    if args.do_test or args.do_eval:
        args = restore_training_settings(args)
        test_dataset = build_dataset(op.join(args.data_dir, args.test_yaml),
                                     tokenizer, args, is_train=False)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        if not args.do_eval:
            predict_file = get_predict_file(checkpoint, test_dataset.yaml_file, args)
            test(args, test_dataset, model, tokenizer, predict_file)
            logger.info("Prediction results saved to: {}".format(predict_file))
        else:
            evaluate_file = evaluate(args, test_dataset, model, tokenizer, checkpoint)
            logger.info("Evaluation results saved to: {}".format(evaluate_file))
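
# Example invocations for this captioning entry point. Hypothetical sketches:
# the script name "run_captioning.py" and the bracketed paths are assumptions;
# the flags match the parser above.
#
#   # fine-tuning
#   python run_captioning.py \
#       --do_train --data_dir datasets/coco_caption \
#       --model_name_or_path <pretrained_model_dir> \
#       --do_lower_case --add_od_labels --learning_rate 3e-5
#
#   # caption generation plus metric evaluation on the test split
#   python run_captioning.py \
#       --do_test --do_eval \
#       --eval_model_dir <checkpoint_dir> \
#       --num_beams 5 --max_gen_length 20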
def main():
    args = get_args()
    global logger
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    mkdir(args.output_dir)
    logger = setup_logger("vlpretrain", args.output_dir, 0)
    logger.warning("Device: %s, n_gpu: %s", args.device, args.n_gpu)
    set_seed(args.seed, args.n_gpu)

    # Load pretrained model and tokenizer
    config_class, model_class, tokenizer_class = BertConfig, BertForImageCaptioning, BertTokenizer
    if args.do_train:
        assert args.model_name_or_path is not None
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=args.num_labels, finetuning_task='image_captioning')
        if args.scst:
            # avoid using too much memory
            config.output_hidden_states = True
        tokenizer = tokenizer_class.from_pretrained(
            args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        config.img_feature_dim = args.img_feature_dim
        config.img_feature_type = args.img_feature_type
        config.hidden_dropout_prob = args.drop_out
        config.loss_type = args.loss_type
        model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=config)
    else:
        checkpoint = args.eval_model_dir
        assert op.isdir(checkpoint)
        config = config_class.from_pretrained(checkpoint)
        config.output_hidden_states = args.output_hidden_states
        tokenizer = tokenizer_class.from_pretrained(checkpoint)
        logger.info("Evaluate the following checkpoint: %s", checkpoint)
        model = model_class.from_pretrained(checkpoint, config=config)

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        train_dataset = build_dataset(op.join(args.data_dir, args.train_yaml), tokenizer, args)
        val_dataset = build_dataset(op.join(args.data_dir, args.val_yaml),
                                    tokenizer, args, is_train=False)
        global_step, avg_loss = train(args, train_dataset, val_dataset, model, tokenizer)
        logger.info("Training done: total_step = %s, avg loss = %s", global_step, avg_loss)

    # inference and evaluation
    if args.do_test or args.do_eval:
        args = restore_training_settings(args)
        test_dataset = build_dataset(op.join(args.data_dir, args.test_yaml),
                                     tokenizer, args, is_train=False)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        if not args.do_eval:
            predict_file = get_predict_file(checkpoint, test_dataset.yaml_file, args)
            test(args, test_dataset, model, tokenizer, predict_file)
            logger.info("Prediction results saved to: {}".format(predict_file))
        else:
            evaluate_file = evaluate(args, test_dataset, model, tokenizer, checkpoint)
            logger.info("Evaluation results saved to: {}".format(evaluate_file))
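
# A minimal entry-point guard (a sketch, assuming each of these main() variants
# lives in its own script file rather than side by side as excerpted here):
if __name__ == "__main__":
    main()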