def save_checkpoint(model, tokenizer, args, epoch, global_step):
    checkpoint_dir = op.join(args.output_dir,
                             'checkpoint-{}-{}'.format(epoch, global_step))
    mkdir(checkpoint_dir)
    model_to_save = model.module if hasattr(model, 'module') else model
    save_num = 0
    while (save_num < 10):
        try:
            model_to_save.save_pretrained(checkpoint_dir)
            torch.save(args, op.join(checkpoint_dir, 'training_args.bin'))
            tokenizer.save_pretrained(checkpoint_dir)
            logger.info("Save checkpoint to {}".format(checkpoint_dir))
            break
        except Exception:
            save_num += 1
    if save_num == 10:
        logger.info("Failed to save checkpoint after 10 trials.")
    return checkpoint_dir
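
# Illustrative sketch (not part of the original source): the retry-until-success
# pattern used in save_checkpoint above, factored into a small generic helper.
# The helper name and signature are hypothetical.
import time

def retry_call(fn, max_attempts=10, delay=1.0, log=None):
    """Call fn() up to max_attempts times, sleeping between failed attempts."""
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            return fn()
        except Exception as err:  # deliberately broad, mirroring the loop above
            last_error = err
            if log is not None:
                log.warning("Attempt %d/%d failed: %s", attempt, max_attempts, err)
            time.sleep(delay)
    raise RuntimeError("All {} attempts failed".format(max_attempts)) from last_error
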
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        default='./datasets/coco_ir/',
                        type=str,
                        required=False,
                        help="The input data dir with all required files.")
    parser.add_argument("--img_feat_file",
                        default='/disk2/11811112/Oscar/coco_ir/features.tsv',
                        type=str,
                        required=False,
                        help="The absolute address of the image feature file.")
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=False,
        help="Path to pre-trained model or model type. required for training.")
    parser.add_argument(
        "--output_dir",
        default='output/',
        type=str,
        required=False,
        help="The output directory to save checkpoint and test results.")
    parser.add_argument("--loss_type",
                        default='sfmx',
                        type=str,
                        help="Loss function types: support kl, sfmx")
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name.")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name."
    )
    parser.add_argument(
        "--max_seq_length",
        default=70,
        type=int,
        help="The maximum total input sequence length after tokenization. "
        "Sequences longer than this will be truncated, "
        "sequences shorter will be padded."
        "This number is calculated on COCO dataset"
        "If add object detection labels, the suggested length should be 70.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run inference.")
    parser.add_argument(
        "--do_eval",
        action='store_true',
        help="Whether to run performance valuation."
        "do not activate if we want to inference on dataset without gt labels."
    )
    parser.add_argument("--test_split",
                        default='test',
                        type=str,
                        help='data split name.')
    parser.add_argument(
        "--eval_img_keys_file",
        default='',
        type=str,
        help="image key tsv to select a subset of images for evaluation. "
        "This is useful in 5-folds evaluation. The topn index file is not "
        "needed in this case.")
    parser.add_argument(
        "--eval_caption_index_file",
        default='',
        type=str,
        help="index of a list of (img_key, cap_idx) for each image."
        "this is used to perform re-rank using hard negative samples."
        "useful for validation set to monitor the performance during training."
    )
    parser.add_argument(
        "--cross_image_eval",
        action='store_true',
        help=
        "perform cross image inference, ie. each image with all texts from other images."
    )
    parser.add_argument("--add_od_labels",
                        default=False,
                        action='store_true',
                        help="Whether to add object detection labels or not.")
    parser.add_argument("--od_label_type",
                        default='vg',
                        type=str,
                        help="label type, support vg, gt, oid")
    parser.add_argument(
        "--att_mask_type",
        default='CLR',
        type=str,
        help="attention mask type, support ['CL', 'CR', 'LR', 'CLR']"
        "C: caption, L: labels, R: image regions; CLR is full attention by default."
        "CL means attention between caption and labels."
        "please pay attention to the order CLR, which is the default concat order."
    )
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--drop_out",
                        default=0.1,
                        type=float,
                        help="Drop out in BERT.")
    parser.add_argument("--max_img_seq_length",
                        default=50,
                        type=int,
                        help="The maximum total input image sequence length.")
    parser.add_argument("--img_feature_dim",
                        default=2054,
                        type=int,
                        help="The Image Feature Dimension.")
    parser.add_argument("--img_feature_type",
                        default='frcnn',
                        type=str,
                        help="Image feature type.")
    parser.add_argument("--use_img_layernorm",
                        type=int,
                        default=1,
                        help="Normalize image features with bertlayernorm")
    parser.add_argument("--img_layer_norm_eps",
                        default=1e-12,
                        type=float,
                        help="The eps in image feature laynorm layer")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=2,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=2,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--output_mode",
        default='classification',
        type=str,
        help="output mode, support classification or regression.")
    parser.add_argument(
        "--num_labels",
        default=2,
        type=int,
        help="num_labels is 2 for classification and 1 for regression.")
    parser.add_argument(
        "--num_captions_per_img_train",
        default=5,
        type=int,
        help="number of positive matched captions for each training image.")
    parser.add_argument("--num_captions_per_img_val",
                        default=5,
                        type=int,
                        help="number of captions for each testing image.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help="Number of updates steps to accumulate before backward.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial lr.")
    parser.add_argument("--weight_decay",
                        default=0.05,
                        type=float,
                        help="Weight deay.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup.")
    parser.add_argument("--scheduler",
                        default='linear',
                        type=str,
                        help="constant or linear.")
    parser.add_argument("--num_workers",
                        default=4,
                        type=int,
                        help="Workers in dataloader.")
    parser.add_argument("--num_train_epochs",
                        default=20,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="Total number of training steps. Override num_train_epochs.")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=20,
                        help="Log every X steps.")
    parser.add_argument(
        '--save_steps',
        type=int,
        default=-1,
        help="Save checkpoint every X steps. Will also perform evaluatin.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Run evaluation during training at each save_steps.")
    parser.add_argument("--eval_model_dir",
                        type=str,
                        default='./output0320/checkpoint-29-66390/',
                        help="Model directory for evaluation.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA.")
    parser.add_argument('--seed',
                        type=int,
                        default=88,
                        help="random seed for initialization.")
    args = parser.parse_args()

    global logger
    mkdir(args.output_dir)
    logger = setup_logger("vlpretrain", args.output_dir, 0)

    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args.seed, args.n_gpu)
    logger.warning("Device: %s, n_gpu: %s", args.device, args.n_gpu)
    logger.info('output_mode: {}, #Labels: {}'.format(args.output_mode,
                                                      args.num_labels))

    config_class, tokenizer_class = BertConfig, BertTokenizer
    model_class = ImageBertForSequenceClassification
    checkpoint = args.eval_model_dir
    assert op.isdir(checkpoint)
    config = config_class.from_pretrained(checkpoint)
    tokenizer = tokenizer_class.from_pretrained(checkpoint)
    model = model_class.from_pretrained(checkpoint, config=config)

    model.to(args.device)
    # inference and evaluation
    if args.do_test or args.do_eval:
        args = restore_training_settings(args)
        test_dataset = RetrievalDataset(tokenizer,
                                        args,
                                        args.test_split,
                                        is_train=False)
        checkpoint = args.eval_model_dir
        assert op.isdir(checkpoint)
        logger.info("Evaluate the following checkpoint: %s", checkpoint)
        model = model_class.from_pretrained(checkpoint, config=config)
        model.to(args.device)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        result = get_intermediate_data(
            args, model.module if hasattr(model, 'module') else model,
            test_dataset)  # obtain the intermediate data
        # test_result = test(args, model, test_dataset)
        mediate_file = "mediate_file.txt"
        torch.save(str(result), mediate_file)
        logger.info("Prediction results saved to {}.".format(mediate_file))
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=False,
                        help="The input data dir. "
                        "Should contain the .yaml files for the task.")
    parser.add_argument("--dataset_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The training dataset yaml file.")
    parser.add_argument("--extra_dataset_file",
                        default=None,
                        type=str,
                        required=False,
                        help="The extra training dataset yaml file.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    # image chunks
    parser.add_argument("--chunk_start_id",
                        default=-1,
                        type=int,
                        help="Image Chunk Start ID")
    parser.add_argument("--chunk_end_id",
                        default=-1,
                        type=int,
                        help="Image Chunk End ID")

    ## Image parameters
    parser.add_argument("--max_img_seq_length",
                        default=50,
                        type=int,
                        help="The maximum total input image sequence length.")
    parser.add_argument("--img_feature_dim",
                        default=2054,
                        type=int,
                        help="The Image Feature Dimension.")
    parser.add_argument("--img_feature_type",
                        default='faster_r-cnn',
                        type=str,
                        help="faster_r-cnn or mask_r-cnn")
    parser.add_argument("--use_layernorm",
                        action='store_true',
                        help="use_layernorm")

    parser.add_argument("--drop_out",
                        default=0.1,
                        type=float,
                        help="Drop out for BERT.")

    parser.add_argument("--use_b", type=int, default=1, help="use_b")
    parser.add_argument("--textb_sample_mode",
                        type=int,
                        default=0,
                        help="0: sample from both texta&textb, "
                        "1: sample from textb, "
                        "2: sample from QA answers")
    parser.add_argument("--extra_textb_sample_mode", type=int, default=1)
    parser.add_argument(
        "--texta_false_prob",
        type=float,
        default=0.0,
        help="the probality that we sample wrong texta, should in [0.0, 0.5]")

    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")

    parser.add_argument(
        "--max_seq_length",
        default=35,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--max_iters",
                        default=2000000,
                        type=int,
                        help="Maximal number of training iterations.")
    parser.add_argument("--train_batch_size",
                        default=1024,
                        type=int,
                        help="Batch size for training.")
    parser.add_argument("--num_workers",
                        default=6,
                        type=int,
                        help="Number of workers for dataset.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument(
        "--optim",
        default='adamw',
        type=str,
        help="The optimizer used for Bert, [adamw, lamb], default: adamw")
    parser.add_argument("--max_grad_norm",
                        default=-1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )

    parser.add_argument("--from_scratch",
                        action='store_true',
                        help="train from scratch")
    parser.add_argument("--use_img_layernorm",
                        type=int,
                        default=0,
                        help="Normalize image features with bertlayernorm")
    parser.add_argument("--img_layer_norm_eps",
                        default=1e-12,
                        type=float,
                        help="The eps in image feature laynorm layer")
    # distributed
    parser.add_argument('--gpu_ids', type=str, default='-1')
    parser.add_argument(
        "--mask_loss_for_unmatched",
        type=int,
        default=1,
        help="masked language model loss for unmatched triplets")
    parser.add_argument(
        "--extra_loss_weight",
        type=float,
        default=0.0,
        help=
        "the loss weight for the extra train data batch (should be in [0,1])")
    parser.add_argument("--use_gtlabels",
                        type=int,
                        default=1,
                        help="use groundtruth labels for text b or not")
    # logging
    parser.add_argument('--ckpt_period',
                        type=int,
                        default=10000,
                        help="Period for saving checkpoint")
    parser.add_argument('--log_period',
                        type=int,
                        default=100,
                        help="Period for saving logging info")
    args = parser.parse_args()

    if args.gpu_ids != '-1':
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids

    args.num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = args.num_gpus > 1

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        logger.info("Output Directory Exists.")

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method="env://")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if not os.path.exists(args.output_dir):
        mkdir(args.output_dir)

    last_checkpoint_dir = None
    arguments = {"iteration": 0}
    if os.path.exists(args.output_dir):
        save_file = os.path.join(args.output_dir, "last_checkpoint")
        try:
            with open(save_file, "r") as f:
                last_saved = f.read()
                last_saved = last_saved.strip()
        except IOError:
            # if file doesn't exist, maybe because it has just been
            # deleted by a separate process
            last_saved = ""
        if last_saved:
            folder_name = os.path.splitext(
                last_saved.split('/')[0]
            )[0]  # in the form of checkpoint-00001 or checkpoint-00001/pytorch_model.bin
            last_checkpoint_dir = os.path.join(args.output_dir, folder_name)
            arguments["iteration"] = int(folder_name.split('-')[-1])
            assert os.path.isfile(
                os.path.join(last_checkpoint_dir, WEIGHTS_NAME)
            ), "Last_checkpoint detected, but file not found!"

    # model first
    if get_rank() != 0:
        torch.distributed.barrier()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.bert_model]
    if last_checkpoint_dir is not None:  # recovery
        args.model_name_or_path = last_checkpoint_dir
        logger.info(" -> Recovering model from {}".format(last_checkpoint_dir))

    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path, )
    config.img_layer_norm_eps = args.img_layer_norm_eps
    config.use_img_layernorm = args.use_img_layernorm

    # discrete code
    config.img_feature_dim = args.img_feature_dim
    config.img_feature_type = args.img_feature_type
    config.hidden_dropout_prob = args.drop_out
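    # Assumed reading of the flags above: use a 3-way contrastive objective when
    # texta may be replaced (0 < texta_false_prob < 0.5) or when text_b is not used;
    # otherwise keep the usual 2-way matched/unmatched objective.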
    if args.texta_false_prob < 0.5 and (args.texta_false_prob > 0
                                        or not args.use_b):
        args.num_contrast_classes = 3
    else:
        args.num_contrast_classes = 2
    config.num_contrast_classes = args.num_contrast_classes

    # Prepare model
    # model = BertForPreTraining.from_pretrained(args.bert_model)
    model = None
    load_num = 0
    while load_num < 10:
        try:
            model = BertImgForPreTraining.from_pretrained(
                args.model_name_or_path,
                from_tf=bool('.ckpt' in args.model_name_or_path),
                config=config)
            break
        except Exception:
            load_num += 1
    if model is None:
        raise RuntimeError(
            "Failed to load the pre-trained model after 10 attempts.")

    # train from scratch
    if args.from_scratch:
        if last_checkpoint_dir is None:
            logger.info("Training from scratch ... ")
            model.apply(model.init_weights)
    total_params = sum(p.numel() for p in model.parameters())
    logger.info('Total Parameters: {}'.format(total_params))

    for key, val in vars(config).items():
        setattr(args, key, val)

    if get_rank() == 0 and args.local_rank != -1:
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    tb_log_dir = os.path.join(args.output_dir, 'train_logs')
    meters = TensorboardLogger(
        log_dir=tb_log_dir,
        delimiter="  ",
    )

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
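    # Parameters whose names contain 'bias' or 'LayerNorm' go to the second group
    # with weight_decay=0.0; everything else is decayed with 0.01, following the
    # standard BERT practice of not decaying biases and LayerNorm scales.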

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=args.max_iters)

    if arguments['iteration'] > 0 and os.path.isfile(
            os.path.join(last_checkpoint_dir, 'optimizer.pth')):  # recovery
        logger.info("Load BERT optimizer from {}".format(last_checkpoint_dir))
        optimizer_to_load = torch.load(os.path.join(last_checkpoint_dir,
                                                    'optimizer.pth'),
                                       map_location=torch.device("cpu"))
        optimizer.load_state_dict(optimizer_to_load.pop("optimizer"))
        scheduler.load_state_dict(optimizer_to_load.pop("scheduler"))

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # train_examples = None
    train_dataloaders = make_data_loader(args,
                                         is_distributed=args.distributed,
                                         arguments=arguments)

    if isinstance(train_dataloaders, list):
        train_dataloader = train_dataloaders[0]
    else:
        train_dataloader = train_dataloaders
    train_dataloader_extra = [None] * len(train_dataloader)
    if isinstance(train_dataloaders, list) and len(train_dataloaders) > 1:
        logger.info("Having two train dataloaders!")
        train_dataloader_extra = train_dataloaders[1]
    tokenizer = train_dataloader.dataset.tokenizer

    # torch.backends.cudnn.benchmark = True

    max_iter = len(train_dataloader)
    start_iter = arguments["iteration"]
    logger.info("***** Running training *****")
    logger.info(" Num examples = {}".format(len(train_dataloader.dataset)))
    logger.info("  Instantaneous batch size = %d",
                args.train_batch_size // args.gradient_accumulation_steps)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d",
                max_iter // args.gradient_accumulation_steps)

    log_json = {}

    model.train()
    model.zero_grad()

    clock_started = False
    # Every args.ckpt_period, report train_score and save model
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, (batch, batch_extra) in enumerate(
            zip(train_dataloader, train_dataloader_extra), start_iter):
        if not clock_started:
            start_training_time = time.time()
            end = time.time()
            clock_started = True

        def data_process(mini_batch):
            images, targets, qa_inds = \
                mini_batch[0], mini_batch[1], mini_batch[2]
            targets_transposed = list(zip(*targets))
            input_ids = torch.stack(targets_transposed[0]).to(
                args.device, non_blocking=True)
            input_mask = torch.stack(targets_transposed[1]).to(
                args.device, non_blocking=True)
            segment_ids = torch.stack(targets_transposed[2]).to(
                args.device, non_blocking=True)
            lm_label_ids = torch.stack(targets_transposed[3]).to(
                args.device, non_blocking=True)
            is_next = torch.stack(targets_transposed[4]).to(args.device,
                                                            non_blocking=True)
            is_img_match = torch.stack(targets_transposed[5]).to(
                args.device, non_blocking=True)

            return images, input_ids, input_mask, segment_ids, lm_label_ids, is_next
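        # Note: data_process unpacks qa_inds and is_img_match from each mini-batch
        # but does not return them; only input_ids / input_mask / segment_ids /
        # lm_label_ids / is_next feed the forward pass below.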

        images1, input_ids1, input_mask1, segment_ids1, lm_label_ids1, is_next1 \
            = data_process(batch)
        if batch_extra is not None:
            images2, input_ids2, input_mask2, segment_ids2, lm_label_ids2, is_next2 \
                = data_process(batch_extra)

        data_time = time.time() - end

        def forward_backward(images,
                             input_ids,
                             input_mask,
                             segment_ids,
                             lm_label_ids,
                             is_next,
                             loss_weight=1.0):
            # feature as input
            image_features = torch.stack(images).to(args.device,
                                                    non_blocking=True)

            outputs = model(input_ids,
                            segment_ids,
                            input_mask,
                            lm_label_ids,
                            is_next,
                            img_feats=image_features)

            loss = loss_weight * outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()

            return loss.item(), input_ids.size(0)

        start1 = time.time()
        loss1, nb_tr_example1 = forward_backward(images1,
                                                 input_ids1,
                                                 input_mask1,
                                                 segment_ids1,
                                                 lm_label_ids1,
                                                 is_next1,
                                                 loss_weight=1.0 -
                                                 args.extra_loss_weight)
        tr_loss += loss1
        nb_tr_examples += nb_tr_example1
        compute_time1 = time.time() - start1

        loss2, nb_tr_example2 = 0.0, 0
        compute_time2 = 0.0
        if batch_extra is not None:
            start2 = time.time()
            loss2, nb_tr_example2 = forward_backward(
                images2,
                input_ids2,
                input_mask2,
                segment_ids2,
                lm_label_ids2,
                is_next2,
                loss_weight=args.extra_loss_weight)
            tr_loss += loss2
            nb_tr_examples += nb_tr_example2
            compute_time2 = time.time() - start2

        nb_tr_steps += 1
        arguments["iteration"] = step + 1

        if (step + 1) % args.gradient_accumulation_steps == 0:
            # do gradient clipping
            if args.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
            # do the optimization steps
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            optimizer.zero_grad()

            # measure elapsed time
            batch_time = time.time() - end
            end = time.time()
            metrics_to_log = {
                'time_info': {
                    'compute': batch_time,
                    'data': data_time,
                    'compute1': compute_time1,
                    'compute2': compute_time2
                },
                'batch_metrics': {
                    'loss': loss1 + loss2
                }
            }
            params_to_log = {
                'params': {
                    'bert_lr': optimizer.param_groups[0]["lr"]
                }
            }
            meters.update_metrics(metrics_to_log)
            meters.update_params(params_to_log)

            if args.log_period > 0 and (step + 1) % args.log_period == 0:
                avg_time = meters.meters['time_info']['compute'].global_avg
                eta_seconds = avg_time * (max_iter - step - 1)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                logger.info(
                    meters.delimiter.join([
                        "eta: {eta}",
                        "iter: {iter}",
                        "max mem: {memory:.0f}",
                    ]).format(
                        eta=eta_string,
                        iter=step + 1,
                        memory=torch.cuda.max_memory_allocated() / 1024.0 /
                        1024.0,
                    ) + "\n    " + meters.get_logs(step + 1))

        if (step + 1) == max_iter or (
                step + 1) % args.ckpt_period == 0:  # Save a trained model
            log_json[step + 1] = tr_loss
            train_metrics_total = torch.Tensor(
                [tr_loss, nb_tr_examples, nb_tr_steps]).to(args.device)
            torch.distributed.all_reduce(train_metrics_total)
            # reset metrics
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0

            if get_rank() == 0:
                # report metrics
                train_score_gathered = train_metrics_total[0] / \
                                       train_metrics_total[2]
                logger.info("PROGRESS: {}%".format(
                    round(100 * (step + 1) / max_iter, 4)))
                logger.info("EVALERR: {}%".format(train_score_gathered))
                meters.update_metrics({
                    'epoch_metrics': {
                        'ex_cnt': train_metrics_total[1],
                        'loss': train_score_gathered
                    }
                })
                with open(os.path.join(args.output_dir, 'loss_logs.json'),
                          'w') as fp:
                    json.dump(log_json, fp)

                # save checkpoint
                output_dir = os.path.join(args.output_dir,
                                          'checkpoint-{:07d}'.format(step + 1))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model.module if hasattr(
                    model, 'module'
                ) else model  # Take care of distributed/parallel training
                optimizer_to_save = {
                    "optimizer": optimizer.state_dict(),
                    "scheduler": scheduler.state_dict()
                }

                save_num = 0
                while save_num < 10:
                    try:
                        model_to_save.save_pretrained(output_dir)
                        torch.save(
                            args, os.path.join(output_dir,
                                               'training_args.bin'))
                        tokenizer.save_pretrained(output_dir)
                        torch.save(optimizer_to_save,
                                   os.path.join(output_dir, 'optimizer.pth'))
                        save_file = os.path.join(args.output_dir,
                                                 "last_checkpoint")
                        with open(save_file, "w") as f:
                            f.write(
                                'checkpoint-{:07d}/pytorch_model.bin'.format(
                                    step + 1))
                        break
                    except Exception:
                        save_num += 1
                logger.info("Saving model checkpoint {0} to {1}".format(
                    step + 1, output_dir))

    if clock_started:
        total_training_time = time.time() - start_training_time
    else:
        total_training_time = 0.0
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    # close the tb logger
    meters.close()
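
# Illustrative sketch (not part of the original source): the learning-rate shape
# that WarmupLinearSchedule(optimizer, warmup_steps, t_total) above produces - a
# linear ramp from 0 to the base lr over warmup_steps, then a linear decay to 0 at
# t_total. The function name and exact rounding are assumptions.
def _linear_warmup_decay_factor(step, warmup_steps, t_total):
    if warmup_steps > 0 and step < warmup_steps:
        return step / float(max(1, warmup_steps))
    return max(0.0, (t_total - step) / float(max(1.0, t_total - warmup_steps)))

# With the defaults above (warmup_steps=0, t_total=args.max_iters) the factor simply
# decays linearly from 1.0 at step 0 to 0.0 at the final iteration.
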
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        default='datasets/coco_caption',
                        type=str,
                        required=False,
                        help="The input data dir with all required files.")
    parser.add_argument("--train_yaml",
                        default='train.yaml',
                        type=str,
                        required=False,
                        help="yaml file for training.")
    parser.add_argument("--test_yaml",
                        default='test.yaml',
                        type=str,
                        required=False,
                        help="yaml file for testing.")
    parser.add_argument("--val_yaml",
                        default='val.yaml',
                        type=str,
                        required=False,
                        help="yaml file used for validation during training.")
    parser.add_argument("--model_name_or_path",
                        default=None,
                        type=str,
                        required=False,
                        help="Path to pre-trained model or model type.")
    parser.add_argument(
        "--output_dir",
        default='output/',
        type=str,
        required=False,
        help="The output directory to save checkpoint and test results.")
    parser.add_argument("--loss_type",
                        default='sfmx',
                        type=str,
                        help="Loss function types: support kl, x2, sfmx")
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name.")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name."
    )
    parser.add_argument(
        "--max_seq_length",
        default=70,
        type=int,
        help="The maximum total input sequence length after tokenization. "
        "Sequences longer than this will be truncated, "
        "sequences shorter will be padded.")
    parser.add_argument("--max_seq_a_length",
                        default=40,
                        type=int,
                        help="The maximum sequence length for caption.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run inference.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run evaluation.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument(
        "--mask_prob",
        default=0.15,
        type=float,
        help="Probability to mask input sentence during training.")
    parser.add_argument("--max_masked_tokens",
                        type=int,
                        default=3,
                        help="The max number of masked tokens per sentence.")
    parser.add_argument("--add_od_labels",
                        default=False,
                        action='store_true',
                        help="Whether to add object detection labels or not")
    parser.add_argument(
        "--disable_img_features",
        default=False,
        action='store_true',
        help="Whether to disable image feature in finetuning state or not")
    parser.add_argument(
        '--keep_top_percentage_tag_conf_threshold',
        type=float,
        default=0.3,
        help="Confidence threshold k for keep_top_percengate_tag")
    parser.add_argument(
        '--keep_top_percentage_tag',
        type=float,
        default=1,
        help=
        "Keep input percentage features at inference time given that >= k confidence"
    )
    parser.add_argument("--drop_out",
                        default=0.1,
                        type=float,
                        help="Drop out in BERT.")
    parser.add_argument("--max_img_seq_length",
                        default=50,
                        type=int,
                        help="The maximum total input image sequence length.")
    parser.add_argument("--img_feature_dim",
                        default=2054,
                        type=int,
                        help="The Image Feature Dimension.")
    parser.add_argument("--img_feature_type",
                        default='frcnn',
                        type=str,
                        help="Image feature type.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=64,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=64,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--output_mode",
        default='classification',
        type=str,
        help="output mode, support classification or regression.")
    parser.add_argument(
        "--num_labels",
        default=2,
        type=int,
        help="num_labels is 2 for classification and 1 for regression.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help="Number of updates steps to accumulate before backward.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial lr.")
    parser.add_argument("--weight_decay",
                        default=0.05,
                        type=float,
                        help="Weight deay.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup.")
    parser.add_argument("--scheduler",
                        default='linear',
                        type=str,
                        help="constant or linear or")
    parser.add_argument("--num_workers",
                        default=4,
                        type=int,
                        help="Workers in dataloader.")
    parser.add_argument("--num_train_epochs",
                        default=40,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="Total number of training steps. Override num_train_epochs.")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=20,
                        help="Log every X steps.")
    parser.add_argument(
        '--save_steps',
        type=int,
        default=-1,
        help="Save checkpoint every X steps. Will also perform evaluatin.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Run evaluation during training at each save_steps.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA.")
    parser.add_argument('--seed',
                        type=int,
                        default=88,
                        help="random seed for initialization.")
    parser.add_argument('--scst',
                        action='store_true',
                        help='Self-critical sequence training')
    # for generation
    parser.add_argument("--eval_model_dir",
                        type=str,
                        default='',
                        help="Model directory for evaluation.")
    parser.add_argument('--max_gen_length',
                        type=int,
                        default=20,
                        help="max length of generated sentences")
    parser.add_argument('--output_hidden_states',
                        action='store_true',
                        help="Turn on for fast decoding")
    parser.add_argument('--num_return_sequences',
                        type=int,
                        default=1,
                        help="repeating times per image")
    parser.add_argument('--num_beams',
                        type=int,
                        default=5,
                        help="beam search width")
    parser.add_argument('--num_keep_best',
                        type=int,
                        default=1,
                        help="number of hypotheses to keep in beam search")
    parser.add_argument('--temperature',
                        type=float,
                        default=1,
                        help="temperature in softmax for sampling")
    parser.add_argument('--top_k',
                        type=int,
                        default=0,
                        help="filter distribution for sampling")
    parser.add_argument('--top_p',
                        type=float,
                        default=1,
                        help="filter distribution for sampling")
    parser.add_argument(
        '--repetition_penalty',
        type=int,
        default=1,
        help=
        "repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858)"
    )
    parser.add_argument('--length_penalty',
                        type=int,
                        default=1,
                        help="beam search length penalty")
    # for Constrained Beam Search
    parser.add_argument('--use_cbs',
                        action='store_true',
                        help='Use constrained beam search for decoding')
    parser.add_argument('--min_constraints_to_satisfy',
                        type=int,
                        default=2,
                        help="minimum number of constraints to satisfy")
    args = parser.parse_args()

    global logger

    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()

    output_dir = args.output_dir
    mkdir(output_dir)

    logger = setup_logger("vlpretrain", output_dir, 0)
    logger.warning("Device: %s, n_gpu: %s", args.device, args.n_gpu)
    set_seed(args.seed, args.n_gpu)

    # Load pretrained model and tokenizer
    config_class, model_class, tokenizer_class = BertConfig, BertForImageCaptioning, BertTokenizer
    if args.do_train:
        assert args.model_name_or_path is not None
        config = config_class.from_pretrained(args.config_name if args.config_name else \
                args.model_name_or_path, num_labels=args.num_labels, finetuning_task='image_captioning')
        if args.scst:
            # avoid using too much memory
            config.output_hidden_states = True
        tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name \
                else args.model_name_or_path, do_lower_case=args.do_lower_case)
        config.img_feature_dim = args.img_feature_dim
        config.img_feature_type = args.img_feature_type
        config.hidden_dropout_prob = args.drop_out
        config.loss_type = args.loss_type
        model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=config)
    else:
        checkpoint = args.eval_model_dir
        assert op.isdir(checkpoint)
        config = config_class.from_pretrained(checkpoint)
        config.output_hidden_states = args.output_hidden_states
        tokenizer = tokenizer_class.from_pretrained(checkpoint)
        logger.info("Evaluate the following checkpoint: %s", checkpoint)
        model = model_class.from_pretrained(checkpoint, config=config)

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)
    if args.do_train:
        train_dataset = build_dataset(op.join(args.data_dir, args.train_yaml),
                                      tokenizer, args)
        val_dataset = build_dataset(op.join(args.data_dir, args.val_yaml),
                                    tokenizer,
                                    args,
                                    is_train=False)
        global_step, avg_loss = train(args, train_dataset, val_dataset, model,
                                      tokenizer)
        logger.info("Training done: total_step = %s, avg loss = %s",
                    global_step, avg_loss)

    # inference and evaluation
    if args.do_test or args.do_eval:
        args = restore_training_settings(args)
        test_dataset = build_dataset(op.join(args.data_dir, args.test_yaml),
                                     tokenizer,
                                     args,
                                     is_train=False)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        if not args.do_eval:
            predict_file = get_predict_file(checkpoint, test_dataset.yaml_file,
                                            args)
            test(args, test_dataset, model, tokenizer, predict_file)
            logger.info("Prediction results saved to: {}".format(predict_file))
        else:
            evaluate_file = evaluate(args, test_dataset, model, tokenizer,
                                     checkpoint)
            logger.info(
                "Evaluation results saved to: {}".format(evaluate_file))
def main():
    args = get_args()

    global logger
    # global logger, writer

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    if args.do_train:
        mkdir(args.output_dir)

        t = datetime.today()
        args.output_dir = op.join(
            args.output_dir,
            f"{t.month}_{t.day}_{t.hour}_{t.minute}_{t.second}")
        if not op.exists(args.output_dir):
            mkdir(args.output_dir)

        logger = setup_logger("vlpretrain", args.output_dir, args.local_rank)
    else:
        logger = setup_logger("vlpretrain",
                              os.path.dirname(args.eval_model_dir),
                              args.local_rank, 'test_log.txt')

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))

    set_seed(args.seed, args.n_gpu)

    # writer = SummaryWriter(log_dir=args.output_dir, flush_secs=60)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    config_class, model_class, tokenizer_class = BertConfig, BertForImageCaptioning, BertTokenizer
    if args.do_train:
        assert args.model_name_or_path is not None
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=args.num_labels,
            finetuning_task='image_captioning')
        if args.scst:
            # avoid using too much memory
            config.output_hidden_states = True
        tokenizer = tokenizer_class.from_pretrained(
            args.tokenizer_name
            if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        config.img_feature_dim = args.img_feature_dim
        config.img_feature_type = args.img_feature_type
        config.hidden_dropout_prob = args.drop_out
        config.loss_type = args.loss_type
        model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=config)
    else:
        assert op.isdir(args.eval_model_dir)
        config = config_class.from_pretrained(args.eval_model_dir)
        config.output_hidden_states = args.output_hidden_states
        tokenizer = tokenizer_class.from_pretrained(args.eval_model_dir)
        logger.info("Evaluate the following checkpoint: %s",
                    args.eval_model_dir)
        model = model_class.from_pretrained(args.eval_model_dir, config=config)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)
    if args.do_train:
        train_dataset = build_dataset('train', tokenizer, args)
        val_dataset = build_dataset('dev', tokenizer, args, is_train=False)
        global_step, avg_loss = train(args, train_dataset, val_dataset, model,
                                      tokenizer)
        logger.info("Training done: total_step = %s, avg loss = %s",
                    global_step, avg_loss)

    # # inference and evaluation
    # if args.do_test or args.do_eval:
    #     args = restore_training_settings(args)
    #     test_dataset = build_dataset('test', tokenizer, args, is_train=False)
    #     if args.n_gpu > 1:
    #         model = torch.nn.DataParallel(model)

    #     if not args.do_eval:
    #         predict_file = get_predict_file('test', args.eval_model_dir, args)
    #         test(args, test_dataset, model, tokenizer, predict_file)
    #         logger.info("Prediction results saved to: {}".format(predict_file))
    #     else:
    #         evaluate_file = evaluate(args, test_dataset, model, tokenizer,
    #                                  args.eval_model_dir)
    #         logger.info(
    #             "Evaluation results saved to: {}".format(evaluate_file))

    if args.do_test and args.local_rank in [-1, 0]:
        args = restore_training_settings(args)
        test_dataset = build_dataset('test', tokenizer, args, is_train=False)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        predict_file = get_predict_file('test', args.eval_model_dir, args)
        test(args, test_dataset, model, tokenizer, predict_file)
        logger.info("Prediction results saved to: {}".format(predict_file))

    if args.do_eval and args.local_rank in [-1, 0]:
        args = restore_training_settings(args)
        dev_dataset = build_dataset('dev', tokenizer, args, is_train=False)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        predict_file = get_predict_file('dev', args.eval_model_dir, args)
        test(args, dev_dataset, model, tokenizer, predict_file)
        logger.info("Prediction results saved to: {}".format(predict_file))
def main():
    args = get_args()

    global logger

    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()

    mkdir(args.output_dir)

    logger = setup_logger("vlpretrain", args.output_dir, 0)
    logger.warning("Device: %s, n_gpu: %s", args.device, args.n_gpu)
    set_seed(args.seed, args.n_gpu)

    # Load pretrained model and tokenizer
    config_class, model_class, tokenizer_class = BertConfig, BertForImageCaptioning, BertTokenizer
    if args.do_train:
        assert args.model_name_or_path is not None
        config = config_class.from_pretrained(args.config_name if args.config_name else
                                              args.model_name_or_path, num_labels=args.num_labels, finetuning_task='image_captioning')
        if args.scst:
            # avoid using too much memory
            config.output_hidden_states = True
        tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name
                                                    else args.model_name_or_path, do_lower_case=args.do_lower_case)
        config.img_feature_dim = args.img_feature_dim
        config.img_feature_type = args.img_feature_type
        config.hidden_dropout_prob = args.drop_out
        config.loss_type = args.loss_type
        model = model_class.from_pretrained(args.model_name_or_path,
                                            from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
    else:
        checkpoint = args.eval_model_dir
        assert op.isdir(checkpoint)
        config = config_class.from_pretrained(checkpoint)
        config.output_hidden_states = args.output_hidden_states
        tokenizer = tokenizer_class.from_pretrained(checkpoint)
        logger.info("Evaluate the following checkpoint: %s", checkpoint)
        model = model_class.from_pretrained(checkpoint, config=config)

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)
    if args.do_train:
        train_dataset = build_dataset(
            op.join(args.data_dir, args.train_yaml), tokenizer, args)
        val_dataset = build_dataset(op.join(args.data_dir, args.val_yaml),
                                    tokenizer, args, is_train=False)
        global_step, avg_loss = train(
            args, train_dataset, val_dataset, model, tokenizer)
        logger.info("Training done: total_step = %s, avg loss = %s",
                    global_step, avg_loss)

    # inference and evaluation
    if args.do_test or args.do_eval:
        args = restore_training_settings(args)
        test_dataset = build_dataset(op.join(args.data_dir, args.test_yaml),
                                     tokenizer, args, is_train=False)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        if not args.do_eval:
            predict_file = get_predict_file(
                checkpoint, test_dataset.yaml_file, args)
            test(args, test_dataset, model, tokenizer, predict_file)
            logger.info("Prediction results saved to: {}".format(predict_file))
        else:
            evaluate_file = evaluate(args, test_dataset, model, tokenizer,
                                     checkpoint)
            logger.info(
                "Evaluation results saved to: {}".format(evaluate_file))