def build_dataloader(dataset, collate_fn, is_train, opts):
    batch_size = opts.train_batch_size if is_train else opts.val_batch_size
    if is_train:
        sampler = TokenBucketSampler(dataset.lens, bucket_size=BUCKET_SIZE,
                                     batch_size=batch_size,
                                     droplast=is_train)
        dataloader = DataLoader(dataset, batch_sampler=sampler,
                                num_workers=opts.n_workers,
                                pin_memory=opts.pin_mem,
                                collate_fn=collate_fn)
    else:
        dataloader = DataLoader(dataset, batch_size=batch_size,
                                num_workers=opts.n_workers, shuffle=False,
                                pin_memory=opts.pin_mem,
                                collate_fn=collate_fn)
    dataloader = PrefetchLoader(dataloader)
    return dataloader
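# A minimal sketch of the token-bucket idea behind the TokenBucketSampler
# used above (illustrative only; the repo's real sampler also shuffles
# buckets and pads batch sizes to multiples). Indices are length-sorted
# within coarse buckets so each emitted batch stays under a token budget:
from torch.utils.data import Sampler

class SimpleTokenBucketSampler(Sampler):
    def __init__(self, lens, bucket_size, batch_size, droplast=False):
        self.lens = lens              # per-example token lengths
        self.bucket_size = bucket_size
        self.batch_size = batch_size  # max total tokens per batch
        self.droplast = droplast

    def __iter__(self):
        ids = list(range(len(self.lens)))
        # sort by length inside coarse buckets to keep padding low
        buckets = [sorted(ids[i:i + self.bucket_size],
                          key=lambda j: self.lens[j])
                   for i in range(0, len(ids), self.bucket_size)]
        batches = []
        batch, max_len = [], 0
        for bucket in buckets:
            for j in bucket:
                max_len = max(max_len, self.lens[j])
                if batch and max_len * (len(batch) + 1) > self.batch_size:
                    batches.append(batch)
                    batch, max_len = [j], self.lens[j]
                else:
                    batch.append(j)
        if batch and not self.droplast:
            batches.append(batch)
        return iter(batches)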
def create_dataloader(img_path, txt_path, batch_size, is_train,
                      dset_cls, collate_fn, opts):
    img_db_type = "gt" if "coco_gt" in img_path else "det"
    conf_th = -1 if img_db_type == "gt" else opts.conf_th
    num_bb = 100 if img_db_type == "gt" else opts.num_bb
    img_db = DetectFeatLmdb(img_path, conf_th, opts.max_bb, opts.min_bb,
                            num_bb, opts.compressed_db)
    txt_db = ReTxtTokLmdb(txt_path, opts.max_txt_len if is_train else -1)
    if is_train:
        dset = dset_cls(txt_db, img_db)
    else:
        dset = dset_cls(txt_db, img_db, use_gt_feat=img_db_type == "gt")
    # NOTE: the batch_size argument is overridden here
    batch_size = (opts.train_batch_size if is_train
                  else opts.val_batch_size)
    sampler = DistributedSampler(dset, num_replicas=hvd.size(),
                                 rank=hvd.rank(), shuffle=False)
    dataloader = DataLoader(dset, sampler=sampler, batch_size=batch_size,
                            num_workers=opts.n_workers,
                            pin_memory=opts.pin_mem, collate_fn=collate_fn)
    dataloader = PrefetchLoader(dataloader)
    return dataloader
def build_dataloader(dataset, batch_size, collate_fn, is_train, opts):
    loader = DataLoader(dataset, batch_size=batch_size,
                        num_workers=opts.n_workers, pin_memory=opts.pin_mem,
                        collate_fn=collate_fn, shuffle=is_train)
    return PrefetchLoader(loader)
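# A minimal sketch of what PrefetchLoader adds around these DataLoaders:
# the next batch is copied host-to-device on a side CUDA stream while the
# current one is consumed, overlapping transfer with compute. Illustrative
# reduction only (assumes dict batches of tensors and an available GPU);
# the repo's implementation handles more batch layouts than this.
import torch

class SimplePrefetchLoader:
    def __init__(self, loader):
        self.loader = loader
        self.stream = torch.cuda.Stream()

    def _to_gpu(self, batch):
        # issue the copy on the side stream so it overlaps with compute
        with torch.cuda.stream(self.stream):
            return {k: (v.cuda(non_blocking=True) if torch.is_tensor(v)
                        else v)
                    for k, v in batch.items()}

    def __iter__(self):
        it = iter(self.loader)
        nxt = next(it, None)
        nxt = self._to_gpu(nxt) if nxt is not None else None
        while nxt is not None:
            # wait until the prefetch copy of this batch has finished
            torch.cuda.current_stream().wait_stream(self.stream)
            cur, nxt = nxt, next(it, None)
            if nxt is not None:
                nxt = self._to_gpu(nxt)
            yield cur

    def __len__(self):
        return len(self.loader)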
def create_dataloader(opts, dataset_cls, collate_fn, mode='train'):
    assert mode in ['train', 'val', 'test']
    if mode == 'train':
        image_set = opts.train_image_set
        batch_size = opts.train_batch_size
    elif mode == 'val':
        image_set = opts.val_image_set
        batch_size = opts.val_batch_size
    else:
        image_set = opts.test_image_set
        batch_size = opts.val_batch_size
    dataset = dataset_cls(image_set, opts.root_path, opts.dataset_path,
                          use_img_type=opts.use_img_type,
                          test_mode=(mode == 'test'))
    sampler = DistributedTokenBucketSampler(
        hvd.size(), hvd.rank(), dataset.lens,
        bucket_size=BUCKET_SIZE, batch_size=batch_size,
        droplast=False, shuffle=(mode == 'train'))
    loader = DataLoader(dataset, batch_sampler=sampler,
                        num_workers=opts.n_workers, pin_memory=opts.pin_mem,
                        collate_fn=collate_fn)
    return PrefetchLoader(loader)
def build_dataloader(dataset, collate_fn, is_train, opts):
    batch_size = opts.train_batch_size if is_train else 1
    dataloader = DataLoader(dataset, batch_size=batch_size,
                            shuffle=is_train, drop_last=is_train,
                            num_workers=opts.n_workers,
                            pin_memory=opts.pin_mem, collate_fn=collate_fn)
    dataloader = PrefetchLoader(dataloader)
    return dataloader
def main(opts):
    hvd.init()
    device = torch.device("cuda")  # support single GPU only
    train_opts = Struct(json.load(open(f'{opts.train_dir}/log/hps.json')))

    if 'paired' in train_opts.model:
        EvalDatasetCls = Nlvr2PairedEvalDataset
        eval_collate_fn = nlvr2_paired_eval_collate
        if train_opts.model == 'paired':
            ModelCls = UniterForNlvr2Paired
        elif train_opts.model == 'paired-attn':
            ModelCls = UniterForNlvr2PairedAttn
        else:
            raise ValueError('unrecognized model type')
    elif train_opts.model == 'triplet':
        EvalDatasetCls = Nlvr2TripletEvalDataset
        ModelCls = UniterForNlvr2Triplet
        eval_collate_fn = nlvr2_triplet_eval_collate
    else:
        raise ValueError('unrecognized model type')

    img_db = DetectFeatLmdb(opts.img_db, train_opts.conf_th,
                            train_opts.max_bb, train_opts.min_bb,
                            train_opts.num_bb, opts.compressed_db)
    txt_db = TxtTokLmdb(opts.txt_db, -1)
    dset = EvalDatasetCls(txt_db, img_db, train_opts.use_img_type)
    batch_size = (train_opts.val_batch_size if opts.batch_size is None
                  else opts.batch_size)
    sampler = TokenBucketSampler(dset.lens, bucket_size=BUCKET_SIZE,
                                 batch_size=batch_size, droplast=False)
    eval_dataloader = DataLoader(dset, batch_sampler=sampler,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 collate_fn=eval_collate_fn)
    eval_dataloader = PrefetchLoader(eval_dataloader)

    # Prepare model
    ckpt_file = f'{opts.train_dir}/ckpt/model_step_{opts.ckpt}.pt'
    checkpoint = torch.load(ckpt_file)
    model_config = UniterConfig.from_json_file(
        f'{opts.train_dir}/log/model.json')
    model = ModelCls(model_config, img_dim=IMG_DIM)
    model.init_type_embedding()
    model.load_state_dict(checkpoint, strict=False)
    model.to(device)
    model = amp.initialize(model, enabled=opts.fp16, opt_level='O2')

    results = evaluate(model, eval_dataloader, device)
    # write results
    if not exists(opts.output_dir):
        os.makedirs(opts.output_dir)
    with open(f'{opts.output_dir}/results.csv', 'w') as f:
        for id_, ans in results:
            f.write(f'{id_},{ans}\n')
    print('all results written')
def create_dataloaders(datasets, is_train, opts, all_img_dbs=None):
    if all_img_dbs is None:  # reuse a caller-provided group instead of clobbering it
        all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb,
                                     opts.num_bb, opts.compressed_db)
    dataloaders = {}
    for dset in datasets:
        if is_train:
            txt_path = opts.train_txt_dbs
            img_path = opts.train_img_dbs
        else:
            txt_path = opts.val_txt_dbs
            img_path = opts.val_img_dbs

        for i, t in enumerate(dset['tasks']):
            task = f'{t}_{dset["name"]}'
            if is_train:
                LOGGER.info(f"Loading {task} train dataset "
                            f"{dset['db']}, {dset['img']}")
            else:
                LOGGER.info(f"Loading {task} validation dataset, "
                            f"{dset['db']}, {dset['img']}")
            if task.startswith('mlm'):
                dataset = build_mlm_dataset(txt_path, img_path,
                                            all_img_dbs, is_train, opts)
            elif task.startswith('mrfr'):
                dataset = build_mrfr_dataset(txt_path, img_path,
                                             all_img_dbs, is_train, opts)
            elif task.startswith('mrckl'):
                dataset = build_mrc_dataset(txt_path, img_path,
                                            all_img_dbs, is_train, opts)
            elif task.startswith('itm'):
                dataset = build_itm_dataset(txt_path, img_path,
                                            all_img_dbs, is_train, opts)
            elif task.startswith('itkm'):
                dataset = build_itkm_dataset(txt_path, img_path,
                                             all_img_dbs, is_train, opts)
            elif task.startswith('mkm'):
                dataset = build_mkm_dataset(txt_path, img_path,
                                            all_img_dbs, is_train, opts)
            else:
                raise ValueError(f'Undefined task {task}')
            LOGGER.info(f"{len(dataset[0])*hvd.size()} samples loaded")

            if task.startswith('itm'):
                # itm handles distributed training in dset not sampler
                loader = build_dataloader_itm(*dataset, is_train, opts)
            else:
                loader = build_dataloader(*dataset, is_train, opts)
            if is_train:
                ratio = dset['mix_ratio'][i]
                dataloaders[task] = (loader, ratio)
            else:
                dataloaders[task] = PrefetchLoader(loader)
    return dataloaders, all_img_dbs
def create_dataloaders(datasets, is_train, opts, all_img_dbs=None):
    # opts.conf_th: 0.2, opts.min_bb: 10, opts.num_bb: 36
    if all_img_dbs is None:
        all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb,
                                     opts.num_bb, opts.compressed_db)
    dataloaders = {}
    for dset in datasets:
        if is_train:
            assert len(dset['db']) == len(dset['img'])
            assert len(dset['tasks']) == len(dset['mix_ratio'])
            img_db = [all_img_dbs[path] for path in dset['img']]
        else:
            assert len(dset['db']) == len(dset['img']) == 1
            img_db = all_img_dbs[dset['img'][0]]

        for i, t in enumerate(dset['tasks']):
            task = f'{t}_{dset["name"]}'
            if is_train:
                LOGGER.info(f"Loading {task} train dataset "
                            f"{dset['db']}, {[img.img_dir for img in img_db]}")
                txt_db = [TxtTokLmdb(path, opts.max_txt_len)
                          for path in dset['db']]
            else:
                LOGGER.info(f"Loading {task} validation dataset, "
                            f"{dset['db']}, {img_db.img_dir}")
                txt_db = TxtTokLmdb(dset['db'][0], -1)

            if task.startswith('mlm'):
                dataset = build_mlm_dataset(txt_db, img_db, is_train, opts)
            elif task.startswith('mrfr'):
                dataset = build_mrfr_dataset(txt_db, img_db, is_train, opts)
            elif task.startswith('mrc'):
                dataset = build_mrc_dataset(txt_db, img_db, is_train, opts)
            elif task.startswith('itm'):
                dataset = build_itm_dataset(txt_db, img_db, is_train, opts)
            else:
                raise ValueError(f'Undefined task {task}')
            LOGGER.info(f"{len(dataset[0])*hvd.size()} samples loaded")

            if task.startswith('itm'):
                # itm handles distributed training in dset not sampler
                loader = build_dataloader_itm(*dataset, is_train, opts)
            else:
                loader = build_dataloader(*dataset, is_train, opts)
            if is_train:
                ratio = dset['mix_ratio'][i]
                dataloaders[task] = (loader, ratio)
            else:
                dataloaders[task] = PrefetchLoader(loader)
    return dataloaders, all_img_dbs
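# A minimal sketch of the task-mixing idea behind the (loader, ratio) pairs
# returned above: the MetaLoader used by the training loops in this file
# interleaves the per-task loaders. This illustrative stand-in samples a
# task in proportion to its mix ratio and restarts loaders that run out:
import random

class SimpleMetaLoader:
    def __init__(self, name2loader_ratio):
        # name2loader_ratio: {task_name: (dataloader, mix_ratio)}
        self.loaders = {n: l for n, (l, _) in name2loader_ratio.items()}
        self.names = list(self.loaders.keys())
        self.ratios = [name2loader_ratio[n][1] for n in self.names]
        self.iters = {n: iter(l) for n, l in self.loaders.items()}

    def __iter__(self):
        while True:
            name = random.choices(self.names, weights=self.ratios)[0]
            try:
                batch = next(self.iters[name])
            except StopIteration:
                # restart an exhausted task loader
                self.iters[name] = iter(self.loaders[name])
                batch = next(self.iters[name])
            yield name, batch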
def create_dataloader(img_path, txt_path, batch_size, is_train,
                      dset_cls, collate_fn, opts):
    img_db = DetectFeatLmdb(img_path, opts.conf_th, opts.max_bb, opts.min_bb,
                            opts.num_bb, opts.compressed_db)
    txt_db = TxtTokLmdb(txt_path, opts.max_txt_len if is_train else -1)
    dset = dset_cls(txt_db, img_db, opts.use_img_type)
    sampler = TokenBucketSampler(dset.lens, bucket_size=BUCKET_SIZE,
                                 batch_size=batch_size, droplast=is_train)
    loader = DataLoader(dset, batch_sampler=sampler,
                        num_workers=opts.n_workers, pin_memory=opts.pin_mem,
                        collate_fn=collate_fn)
    return PrefetchLoader(loader)
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu,
                                              hvd.rank(), opts.fp16))
    if opts.train_config is not None:
        train_opts = Struct(json.load(open(opts.train_config)))
        opts.conf_th = train_opts.conf_th
        opts.max_bb = train_opts.max_bb
        opts.min_bb = train_opts.min_bb
        opts.num_bb = train_opts.num_bb

    # load DBs and image dirs
    eval_img_db = DetectFeatLmdb(opts.img_db, opts.conf_th, opts.max_bb,
                                 opts.min_bb, opts.num_bb,
                                 opts.compressed_db)
    eval_txt_db = TxtTokLmdb(opts.txt_db, -1)
    eval_dataset = ItmEvalDataset(eval_txt_db, eval_img_db, opts.batch_size)

    # Prepare model
    checkpoint = torch.load(opts.checkpoint)
    model = UniterForImageTextRetrieval.from_pretrained(
        opts.model_config, checkpoint, img_dim=IMG_DIM)
    if 'rank_output' not in checkpoint:
        model.init_output()  # zero shot setting
    model.to(device)
    model = amp.initialize(model, enabled=opts.fp16, opt_level='O2')

    eval_dataloader = DataLoader(eval_dataset, batch_size=1,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 collate_fn=itm_eval_collate)
    eval_dataloader = PrefetchLoader(eval_dataloader)

    eval_log, results = evaluate(model, eval_dataloader)
def build_dataloader(opts):
    # Load ground truth, query db and video db
    hps_file = f'{opts.output_dir}/log/hps.json'
    model_opts = Struct(load_json(hps_file))
    video_ids = get_video_ids(opts.query_txt_db)
    video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                      model_opts.vfeat_interval, model_opts)
    assert opts.split in opts.query_txt_db, (opts.split, opts.query_txt_db)
    q_txt_db = QueryTokLmdb(opts.query_txt_db, -1)
    eval_dataset = VcmrFullEvalDataset(
        video_ids, video_db, q_txt_db,
        distributed=model_opts.distributed_eval)
    eval_dataloader = DataLoader(eval_dataset, batch_size=opts.batch_size,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 collate_fn=vcmr_full_eval_collate)
    eval_dataloader = PrefetchLoader(eval_dataloader)
    return eval_dataloader
def build_dataloader(dataset, collate_fn, is_train, opts):
    batch_size = (opts.train_batch_size if is_train
                  else opts.val_batch_size)
    if is_train:
        # oversample rare classes with per-example weights
        train_sampler = WeightedRandomSampler(dataset.weights_by_class,
                                              len(dataset),
                                              replacement=True)
        dataloader = DataLoader(dataset, sampler=train_sampler,
                                batch_size=batch_size,
                                num_workers=opts.n_workers,
                                pin_memory=opts.pin_mem,
                                collate_fn=collate_fn)
    else:
        sampler = TokenBucketSampler(dataset.lens, bucket_size=BUCKET_SIZE,
                                     batch_size=batch_size,
                                     droplast=is_train)
        dataloader = DataLoader(dataset, batch_sampler=sampler,
                                num_workers=opts.n_workers,
                                pin_memory=opts.pin_mem,
                                collate_fn=collate_fn)
    dataloader = PrefetchLoader(dataloader)
    return dataloader
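# The train branch above expects `dataset.weights_by_class` to hold one
# sampling weight per example. One common recipe (illustrative only; the
# real attribute is computed inside the dataset class) is inverse class
# frequency, so rare classes are drawn more often by WeightedRandomSampler:
from collections import Counter

def inverse_frequency_weights(labels):
    """Per-example weights proportional to 1 / class frequency."""
    counts = Counter(labels)
    return [1.0 / counts[y] for y in labels]

# e.g. WeightedRandomSampler(inverse_frequency_weights(labels),
#                            num_samples=len(labels), replacement=True)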
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu,
                                              hvd.rank(), opts.fp16))
    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))
    set_random_seed(opts.seed)

    if hvd.rank() == 0:
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        os.makedirs(join(opts.output_dir, 'ckpt'))
        save_training_meta(opts)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
        # store ITM predictions
        os.makedirs(join(opts.output_dir, 'results_val'))
        os.makedirs(join(opts.output_dir, 'results_test'))
        os.makedirs(join(opts.output_dir, 'results_train'))
    else:
        LOGGER.disabled = True
        model_saver = NoOp()

    # load DBs and image dirs
    all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb,
                                 opts.num_bb, opts.compressed_db)
    # train
    train_dataset = MemeAIDataset(
        json_path='/home/data/meme_json/train.json',
        npz_folder='/home/data/faster_cnn_feature/',
        mode='train')
    train_loader = DataLoader(train_dataset,
                              batch_size=opts.train_batch_size,
                              shuffle=True, num_workers=opts.n_workers,
                              collate_fn=collate_fn)
    train_loader = PrefetchLoader(train_loader)
    # val
    val_dataset = MemeAIDataset(
        json_path='/home/data/meme_json/dev.json',
        npz_folder='/home/data/faster_cnn_feature/',
        mode='val')
    val_loader = DataLoader(val_dataset,
                            batch_size=opts.inf_minibatch_size,
                            shuffle=False, num_workers=opts.n_workers,
                            collate_fn=collate_fn)
    val_loader = PrefetchLoader(val_loader)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    model = Meme.from_pretrained(opts.model_config, state_dict=checkpoint,
                                 img_dim=IMG_DIM)
    model.init_output()  # pretrain ITM head is different from ranking head
    model.to(device)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    model, optimizer = amp.initialize(model, optimizer, enabled=opts.fp16,
                                      opt_level='O2')
    global_step = 0
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    running_loss = RunningMeter('loss')
    model.train()
    n_examples = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()

    for epoch in range(opts.epoch):
        print('epoch {}/{}'.format(epoch, opts.epoch))
        pbar = tqdm(total=len(train_loader))
        model.train()
        preds = None
        gt = None
        for step, batch in enumerate(train_loader):
            x = batch[0]
            y = batch[1]
            n_examples += x['input_ids'].size(0)
            pred = model(x)
            if preds is None:
                preds = torch.sigmoid(pred)
                gt = y
            else:
                preds = torch.cat((preds, torch.sigmoid(pred)), dim=0)
                gt = torch.cat((gt, y), dim=0)
            loss = F.binary_cross_entropy(torch.sigmoid(pred), y)

            delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
            with amp.scale_loss(loss, optimizer,
                                delay_unscale=delay_unscale) as scaled_loss:
                scaled_loss.backward()
                if not delay_unscale:
                    # gather gradients from every processes
                    # do this before unscaling to make sure every process uses
                    # the same gradient scale
                    grads = [p.grad.data for p in model.parameters()
                             if p.requires_grad and p.grad is not None]
                    all_reduce_and_rescale_tensors(grads, float(1))

            running_loss(loss.item())
            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1
                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                TB_LOGGER.add_scalar('lr', lr_this_step, global_step)
                # log loss
                # NOTE: not gathered across GPUs for efficiency
                TB_LOGGER.add_scalar('loss', running_loss.val, global_step)
                TB_LOGGER.step()
                # update model params
                if opts.grad_norm != -1:
                    grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
                optimizer.step()
                optimizer.zero_grad()

        with torch.no_grad():
            preds = preds.cpu().numpy().reshape(len(preds), )
            gt = gt.cpu().numpy()
            roc = roc_auc_score(gt, preds)
            acc = accuracy_score(gt, np.around(preds))
        train_log = {'roc': roc, 'acc': acc}
        TB_LOGGER.log_scaler_dict({f"train/{k}": v
                                   for k, v in train_log.items()})
        # monitor training throughput
        val_log = validate(model, val_loader)
        TB_LOGGER.log_scaler_dict({f"valid/{k}": v
                                   for k, v in val_log.items()})
        LOGGER.info(train_log)
        LOGGER.info(val_log)
        model_saver.save(model, global_step)
        pbar.close()
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu,
                                              hvd.rank(), opts.fp16))
    if hvd.rank() != 0:
        LOGGER.disabled = True

    hps_file = f'{opts.output_dir}/log/hps.json'
    model_opts = Struct(json.load(open(hps_file)))
    model_config = f'{opts.output_dir}/log/model_config.json'

    # load DBs and image dirs
    video_ids = get_video_ids(opts.query_txt_db)
    video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                      model_opts.vfeat_interval, model_opts)
    assert opts.split in opts.query_txt_db
    q_txt_db = QaQueryTokLmdb(opts.query_txt_db, -1)
    eval_dataset = ViolinEvalDataset(video_ids, video_db, q_txt_db,
                                     sampled_by_q=model_opts.sampled_by_q)
    collate_fn = violin_eval_collate

    # Prepare model
    if exists(opts.checkpoint):
        ckpt_file = opts.checkpoint
    else:
        ckpt_file = f'{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt'
    checkpoint = torch.load(ckpt_file)
    img_pos_embed_weight_key = ("v_encoder.f_encoder.img_embeddings"
                                ".position_embeddings.weight")
    assert img_pos_embed_weight_key in checkpoint
    max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])

    model = HeroForViolin.from_pretrained(model_config,
                                          state_dict=checkpoint,
                                          vfeat_dim=VFEAT_DIM,
                                          max_frm_seq_len=max_frm_seq_len)
    model.to(device)
    if opts.fp16:
        model = amp.initialize(model, enabled=opts.fp16, opt_level='O2')

    eval_dataloader = DataLoader(eval_dataset, batch_size=opts.batch_size,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 collate_fn=collate_fn)
    eval_dataloader = PrefetchLoader(eval_dataloader)

    _, results, logits = validate_violin(model, eval_dataloader,
                                         opts.split, opts.save_logits)
    result_dir = f'{opts.output_dir}/results_{opts.split}'
    if opts.save_logits:
        result_dir += '_w_logit'
    if not exists(result_dir) and hvd.rank() == 0:
        os.makedirs(result_dir)

    all_results = {}
    for id2res in all_gather_list(results):
        all_results.update(id2res)
    if opts.save_logits:
        all_logits = {}
        for id2logit in all_gather_list(logits):
            all_logits.update(id2logit)
    if hvd.rank() == 0:
        save_json(all_results,
                  f'{result_dir}/results_{opts.checkpoint}_all.json')
        LOGGER.info('All results written......')
        if opts.save_logits:
            save_pickle(all_logits,
                        f'{result_dir}/logits_{opts.checkpoint}_all.pkl')
            LOGGER.info('All logits written......')
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    opts.n_gpu = n_gpu
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu,
                                              hvd.rank(), opts.fp16))
    if hvd.rank() != 0:
        LOGGER.disabled = True
    set_random_seed(opts.seed)

    LOGGER.info(f"Loading the whole video dataset {opts.sub_txt_db}, "
                f"{opts.vfeat_db}")
    video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                      opts.vfeat_interval, opts)

    # data loaders
    # train
    LOGGER.info(f"Loading the train QA dataset {opts.train_query_txt_db}")
    video_ids = get_video_ids(opts.train_query_txt_db)
    train_q_txt_db = QaQueryTokLmdb(opts.train_query_txt_db,
                                    opts.max_txt_len)
    train_dataloaders = build_downstream_dataloaders(
        [opts.task], video_db, video_ids, True, opts,
        q_txt_db=train_q_txt_db, shuffle=True)
    meta_loader = MetaLoader(train_dataloaders,
                             accum_steps=opts.gradient_accumulation_steps,
                             distributed=n_gpu > 1)
    meta_loader = PrefetchLoader(meta_loader)

    # val
    LOGGER.info(f"Loading the val QA dataset {opts.val_query_txt_db}")
    video_ids = get_video_ids(opts.val_query_txt_db)
    val_q_txt_db = QaQueryTokLmdb(opts.val_query_txt_db, -1)
    val_dataloaders = build_downstream_dataloaders(
        [opts.task], video_db, video_ids, False, opts,
        q_txt_db=val_q_txt_db)
    if opts.test_query_txt_db:
        LOGGER.info(f"Loading the test QA dataset {opts.test_query_txt_db}")
        video_ids = get_video_ids(opts.test_query_txt_db)
        test_q_txt_db = QaQueryTokLmdb(opts.test_query_txt_db, -1)
        test_dataloaders = build_downstream_dataloaders(
            [opts.task], video_db, video_ids, False, opts,
            q_txt_db=test_q_txt_db)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    img_pos_embed_weight_key = ("v_encoder.f_encoder.img_embeddings"
                                ".position_embeddings.weight")
    if img_pos_embed_weight_key in checkpoint:
        max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])
    else:
        max_frm_seq_len = MAX_FRM_SEQ_LEN

    model = HeroForVideoQA.from_pretrained(opts.model_config,
                                           state_dict=checkpoint,
                                           vfeat_dim=VFEAT_DIM,
                                           max_frm_seq_len=max_frm_seq_len)
    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())}
    model, optimizer = amp.initialize(model, optimizer,
                                      num_losses=len(task2scaler),
                                      enabled=opts.fp16, opt_level='O2')
    restorer = TrainingRestorer(opts, model, optimizer)
    global_step = restorer.global_step
    TB_LOGGER.global_step = global_step
    if hvd.rank() == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        if not exists(join(opts.output_dir, 'results')):
            # store tvqa predictions
            os.makedirs(join(opts.output_dir, 'results'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()
        restorer = NoOp()

    if global_step > 0:
        pbar.update(global_step)
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    task2loss = {task: RunningMeter(f'loss/{task}')
                 for task in train_dataloaders.keys()}
    for obj in (f'{opts.task}_qa', f'{opts.task}_st_ed'):
        task2loss[obj] = RunningMeter(f'loss/{obj}')

    model.train()
    n_examples = defaultdict(int)
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    if global_step == 0:
        optimizer.step()

    for step, (task, batch) in enumerate(meta_loader):
        n_examples[task] += opts.train_batch_size
        loss = model(batch, task=task, compute_loss=True)
        loss_qa, loss_st_ed = loss
        loss = loss_qa + opts.lw_st_ed * loss_st_ed
        for n, ls in (('st_ed', loss_st_ed), ('qa', loss_qa)):
            ls = ls.item()
            task2loss[f'{task}_{n}'](ls)
        loss = loss.mean()
        task2loss[task](loss.item())

        delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
        with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale,
                            loss_id=task2scaler[task]) as scaled_loss:
            scaled_loss.backward()
            if not delay_unscale:
                # gather gradients from every processes
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [p.grad.data for p in model.parameters()
                         if p.requires_grad and p.grad is not None]
                all_reduce_and_rescale_tensors(grads, float(1))

        if (step + 1) % opts.gradient_accumulation_steps == 0:
            global_step += 1
            # learning rate scheduling
            lr_this_step = get_lr_sched(global_step, opts)
            for i, param_group in enumerate(optimizer.param_groups):
                if i == 0 or i == 1:
                    param_group['lr'] = lr_this_step * opts.lr_mul
                elif i == 2 or i == 3:
                    param_group['lr'] = lr_this_step
                else:
                    raise ValueError()
            TB_LOGGER.add_scalar('lr', lr_this_step, global_step)
            TB_LOGGER.log_scaler_dict({temp_loss.name: temp_loss.val
                                       for temp_loss in task2loss.values()
                                       if temp_loss.val is not None})
            TB_LOGGER.step()

            # update model params
            if opts.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            opts.grad_norm)
                TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
            optimizer.step()
            optimizer.zero_grad()
            restorer.step()
            pbar.update(1)

            if global_step % 100 == 0:
                # monitor training throughput
                LOGGER.info('-------------------------------------------')
                LOGGER.info(f'Step {global_step}:')
                for t in train_dataloaders.keys():
                    tot_ex = sum(all_gather_list(n_examples[t]))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'{t}: {tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec,
                                         global_step)

            if global_step % opts.valid_steps == 0:
                LOGGER.info('===========================================')
                LOGGER.info(f"Step {global_step}: start running validation")
                validate(model, val_dataloaders, "val", opts,
                         global_step=global_step)
                if opts.test_query_txt_db:
                    validate(model, test_dataloaders, "test", opts,
                             global_step=global_step)
                LOGGER.info('===========================================')
                model_saver.save(model, global_step)
        if global_step >= opts.num_train_steps:
            break

    LOGGER.info('===========================================')
    if global_step % opts.valid_steps != 0:
        LOGGER.info('===========================================')
        LOGGER.info(f"Step {global_step}: start running validation")
        validate(model, val_dataloaders, "val", opts,
                 global_step=global_step)
        if opts.test_query_txt_db:
            validate(model, test_dataloaders, "test", opts,
                     global_step=global_step)
        LOGGER.info('===========================================')
        model_saver.save(model, f'{global_step}_final')
def create_dataloaders(datasets, is_train, opts, all_img_dbs=None):
    if all_img_dbs is None:
        all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb,
                                     opts.num_bb, opts.compressed_db)
    dataloaders = {}
    for dset in datasets:
        if is_train:
            assert len(dset['db']) == len(dset['img'])
            assert len(dset['tasks']) == len(dset['mix_ratio'])
            img_db = [all_img_dbs[path] for path in dset['img']]
        else:
            assert len(dset['db']) == len(dset['img']) == 1
            img_db = all_img_dbs[dset['img'][0]]

        for i, t in enumerate(dset['tasks']):
            task = f'{t}_{dset["name"]}'
            if is_train:
                LOGGER.info(f"Loading {task} train dataset "
                            f"{dset['db']}, {[img.img_dir for img in img_db]}")
                txt_db = [TxtTokLmdb(path, opts.max_txt_len)
                          for path in dset['db']]
                language_list = []
                # only get the language_list from 'cc'
                if (dset['name'] == 'cc' and opts.multilingual_vmlm
                        and task.startswith('vmlm')):
                    for path in dset['db']:
                        # hacky way to get the language; needs a better mechanism
                        language = path.split('_')[-2]
                        language_list.append(language)
            else:
                LOGGER.info(f"Loading {task} validation dataset, "
                            f"{dset['db']}, {img_db.img_dir}")
                txt_db = TxtTokLmdb(dset['db'][0], -1)
                language_list = []
                if opts.multilingual_vmlm and task.startswith('vmlm'):
                    lan = dset["name"].split('_')[-1]
                    language_list.append(lan)

            if task.startswith('mlm'):
                blind = 'blind' in task
                dataset = build_mlm_dataset(txt_db, img_db, blind,
                                            is_train, opts)
            elif task.startswith('tlm'):
                blind = 'blind' in task
                text_only = "ni" in task
                dataset = build_tlm_dataset(txt_db, img_db, blind,
                                            is_train, opts, text_only)
            elif task.startswith('mmxlm'):
                soft = 'soft' in task
                dataset = build_mmxlm_dataset(txt_db, img_db,
                                              is_train, opts, soft)
            elif task.startswith('vmlm'):
                soft = 'soft' in task
                if soft:
                    # load the img_soft_label
                    assert dset.get('img_token_soft_label', None) is not None
                if is_train:
                    if soft:
                        assert len(dset['db']) == len(
                            dset['img_token_soft_label'])
                        img_token_sl_db = [
                            Img_SoftLabel_Lmdb(path)
                            for path in dset['img_token_soft_label']]
                    else:
                        img_token_sl_db = None
                else:
                    if soft:
                        assert len(dset['db']) == len(
                            dset['img_token_soft_label']) == 1
                        img_token_sl_db = Img_SoftLabel_Lmdb(
                            dset['img_token_soft_label'][0])
                    else:
                        img_token_sl_db = None
                dataset = build_vmlm_dataset(txt_db, img_db, img_token_sl_db,
                                             is_train, opts, soft,
                                             language_list=language_list)
            elif task.startswith('mrfr'):
                only_i = 'only_i' in task
                dataset = build_mrfr_dataset(txt_db, img_db, only_i,
                                             is_train, opts)
            elif task.startswith('mrm-nce'):
                only_i = 'only_i' in task
                dataset = build_mrm_nce_dataset(txt_db, img_db, only_i,
                                                is_train, opts)
            elif task.startswith('mrc'):
                only_i = 'only_i' in task
                dataset = build_mrc_dataset(txt_db, img_db, only_i,
                                            is_train, opts)
            elif task.startswith('itm'):
                dataset = build_itm_dataset(txt_db, img_db, is_train, opts)
            else:
                raise ValueError(f'Undefined task {task}')
            LOGGER.info(f"{len(dataset[0])*hvd.size()} samples loaded")

            if task.startswith('itm'):
                # itm handles distributed training in dset not sampler
                loader = build_dataloader_itm(*dataset, is_train, opts)
            else:
                loader = build_dataloader(*dataset, is_train, opts)
            if is_train:
                ratio = dset['mix_ratio'][i]
                dataloaders[task] = (loader, ratio)
            else:
                dataloaders[task] = PrefetchLoader(loader)
    return dataloaders, all_img_dbs
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu,
                                              hvd.rank(), opts.fp16))

    hps_file = f'{opts.output_dir}/log/hps.json'
    model_opts = json.load(open(hps_file))
    if 'mlp' not in model_opts:
        model_opts['mlp'] = 1
    model_opts = Struct(model_opts)

    # Prepare model
    if exists(opts.checkpoint):
        ckpt_file = opts.checkpoint
    else:
        ckpt_file = f'{opts.output_dir}/ckpt/model_epoch_{opts.checkpoint}.pt'
    checkpoint = torch.load(ckpt_file)
    model = UniterForReferringExpressionComprehension.from_pretrained(
        f'{opts.output_dir}/log/model.json', checkpoint,
        img_dim=IMG_DIM, mlp=model_opts.mlp)
    model.to(device)
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    if opts.fp16:
        model = amp.initialize(model, enabled=True, opt_level='O2')

    # load DBs and image dirs
    img_db_type = "gt" if "coco_gt" in opts.img_db else "det"
    conf_th = -1 if img_db_type == "gt" else model_opts.conf_th
    num_bb = 100 if img_db_type == "gt" else model_opts.num_bb
    eval_img_db = DetectFeatLmdb(opts.img_db, conf_th, model_opts.max_bb,
                                 model_opts.min_bb, num_bb,
                                 opts.compressed_db)

    # Prepro txt_dbs
    txt_dbs = opts.txt_db.split(':')
    for txt_db in txt_dbs:
        print(f'Evaluating {txt_db}')
        eval_txt_db = ReTxtTokLmdb(txt_db, -1)
        eval_dataset = ReEvalDataset(eval_txt_db, eval_img_db,
                                     use_gt_feat=img_db_type == "gt")
        sampler = DistributedSampler(eval_dataset, num_replicas=n_gpu,
                                     rank=rank, shuffle=False)
        eval_dataloader = DataLoader(eval_dataset, sampler=sampler,
                                     batch_size=opts.batch_size,
                                     num_workers=opts.n_workers,
                                     pin_memory=opts.pin_mem,
                                     collate_fn=re_eval_collate)
        eval_dataloader = PrefetchLoader(eval_dataloader)

        # evaluate
        val_log, results = evaluate(model, eval_dataloader)
        result_dir = f'{opts.output_dir}/results_test'
        if not exists(result_dir) and rank == 0:
            os.makedirs(result_dir)
        write_to_tmp(
            f"{txt_db.split('_')[1].split('.')[0]}-acc({img_db_type}): "
            f"{results['acc']*100:.2f}% ",
            args.tmp_file)

        all_results = list(concat(all_gather_list(results)))
        if hvd.rank() == 0:
            db_split = txt_db.split('/')[-1].split('.')[0]  # e.g. refcoco+_val
            img_dir = opts.img_db.split('/')[-1]  # e.g. re_coco_gt
            with open(f'{result_dir}/results_{opts.checkpoint}_'
                      f'{db_split}_on_{img_dir}_all.json', 'w') as f:
                json.dump(all_results, f)
        print(f'{opts.output_dir}/results_test')

    write_to_tmp('\n', args.tmp_file)
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    opts.n_gpu = n_gpu
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu,
                                              hvd.rank(), opts.fp16))
    if hvd.rank() != 0:
        LOGGER.disabled = True
    set_random_seed(opts.seed)

    # data loaders
    train_dataloaders = {}
    val_dataloaders = {}
    for target, t_r in zip(opts.targets, opts.targets_ratio):
        # choose which task and get the corresponding task dataloader
        train_loaders, val_loaders = build_target_loaders(target, t_r, opts)
        train_dataloaders.update(train_loaders)
        val_dataloaders.update(val_loaders)
    meta_loader = MetaLoader(train_dataloaders,
                             accum_steps=opts.gradient_accumulation_steps,
                             distributed=n_gpu > 1)
    meta_loader = PrefetchLoader(meta_loader)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    img_pos_embed_weight_key = ("v_encoder.f_encoder.img_embeddings"
                                ".position_embeddings.weight")
    if img_pos_embed_weight_key in checkpoint:
        max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])
    else:
        max_frm_seq_len = MAX_FRM_SEQ_LEN

    if opts.load_partial_pretrained:
        # from roberta
        model = HeroForPretraining(
            VideoModelConfig(opts.model_config),
            vfeat_dim=VFEAT_DIM,
            max_frm_seq_len=max_frm_seq_len,
            lw_neg_ctx=opts.lw_neg_ctx,
            lw_neg_q=opts.lw_neg_q, lw_st_ed=0,
            ranking_loss_type=opts.ranking_loss_type,
            use_hard_negative=False,
            hard_pool_size=opts.hard_pool_size,
            margin=opts.margin,
            use_all_neg=opts.use_all_neg,
            drop_svmr_prob=opts.drop_svmr_prob)
        model.load_partial_pretrained(checkpoint, VFEAT_DIM, max_frm_seq_len,
                                      skip_layers=opts.skip_layer_loading)
    else:
        # continue training
        model = HeroForPretraining.from_pretrained(
            opts.model_config,
            state_dict=checkpoint,
            vfeat_dim=VFEAT_DIM,
            max_frm_seq_len=max_frm_seq_len,
            lw_neg_ctx=opts.lw_neg_ctx,
            lw_neg_q=opts.lw_neg_q, lw_st_ed=0,
            ranking_loss_type=opts.ranking_loss_type,
            use_hard_negative=False,
            hard_pool_size=opts.hard_pool_size,
            margin=opts.margin,
            use_all_neg=opts.use_all_neg,
            drop_svmr_prob=opts.drop_svmr_prob)
    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())}
    model, optimizer = amp.initialize(model, optimizer,
                                      num_losses=len(task2scaler),
                                      enabled=opts.fp16, opt_level='O2')
    restorer = TrainingRestorer(opts, model, optimizer)
    # sync to prevent slower ranks from reading training meta early
    all_gather_list(None)
    global_step = restorer.global_step
    TB_LOGGER.global_step = global_step
    if hvd.rank() == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        pbar = NoOp()
        model_saver = NoOp()
        restorer = NoOp()

    if global_step > 0:
        pbar.update(global_step)
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    task2loss = {task: RunningMeter(f'loss/{task}')
                 for task in train_dataloaders.keys()}
    for task in train_dataloaders.keys():
        if task.startswith('vsm'):
            for obj in ('st_ed', 'neg_ctx', 'neg_q'):
                task2loss[f"{task}_{obj}"] = RunningMeter(
                    f'loss/{task}_{obj}')

    model.train()
    n_examples = defaultdict(int)
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    if global_step == 0:
        optimizer.step()
    assert all(global_step == s for s in all_gather_list(global_step))

    for step, (task, batch) in enumerate(meta_loader):
        LOGGER.debug(f"Task: {task}")
        # hard negative in VSM
        if len(opts.hard_negtiave_start_step) > 0:
            for i, hn_step in enumerate(opts.hard_negtiave_start_step):
                if global_step >= hn_step and hn_step != -1:
                    model.set_hard_negative(True, opts.hard_pool_size[i],
                                            opts.hard_neg_weights[i])
        # start-end loss
        if opts.train_span_start_step != -1 and\
                global_step >= opts.train_span_start_step:
            model.set_train_st_ed(opts.lw_st_ed)

        train_task = task.split('_')[0]
        n_examples[task] += opts.train_batch_size
        loss = model(batch, task=train_task, compute_loss=True)
        if train_task == 'vsm':
            loss_st_ed, loss_neg_ctx, loss_neg_q = loss
            loss = loss_st_ed + loss_neg_ctx + loss_neg_q
            for n, ls, w in (('st_ed', loss_st_ed, opts.lw_st_ed),
                             ('neg_ctx', loss_neg_ctx, opts.lw_neg_ctx),
                             ('neg_q', loss_neg_q, opts.lw_neg_q)):
                ls = ls.item()
                if w:
                    ls /= w
                task2loss[f'{task}_{n}'](ls)
        elif train_task == "mffr":
            loss = torch.sqrt(loss.sum(dim=1))
        loss = loss.mean()
        task2loss[task](loss.item())

        delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
        with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale,
                            loss_id=task2scaler[task]) as scaled_loss:
            scaled_loss.backward()
            if not delay_unscale:
                # gather gradients from every processes
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [p.grad.data for p in model.parameters()
                         if p.requires_grad and p.grad is not None]
                LOGGER.debug("before reduce grad")
                all_reduce_and_rescale_tensors(grads, float(1))
                LOGGER.debug("after reduce grad")

        if (step + 1) % opts.gradient_accumulation_steps == 0:
            global_step += 1
            # learning rate scheduling
            lr_this_step = get_lr_sched(global_step, opts)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            TB_LOGGER.add_scalar('lr', lr_this_step, global_step)
            # log loss
            # NOTE: only consider rank 0 for speed
            TB_LOGGER.log_scaler_dict({ll.name: ll.val
                                       for ll in task2loss.values()
                                       if ll.val is not None})
            TB_LOGGER.step()

            # update model params
            LOGGER.debug("before norm grad")
            if opts.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            opts.grad_norm)
                TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
            LOGGER.debug("after norm grad")
            LOGGER.debug("before optim step")
            optimizer.step()
            optimizer.zero_grad()
            pbar.update(1)
            LOGGER.debug("after optim step")

            if global_step % 100 == 0:
                # monitor training throughput
                LOGGER.info('-------------------------------------------')
                LOGGER.info(f'Step {global_step}:')
                for t in train_dataloaders.keys():
                    tot_ex = sum(all_gather_list(n_examples[t]))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'{t}: {tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec,
                                         global_step)
                LOGGER.debug("after gather stats")

            if global_step % opts.valid_steps == 0:
                LOGGER.info('===========================================')
                LOGGER.info(f"Step {global_step}: start running validation")
                validate(model, val_dataloaders, opts)
                LOGGER.info('===========================================')
                model_saver.save(model, global_step)
            # step restorer in the end to prevent missing validation checkpoint
            restorer.step()
        if global_step >= opts.num_train_steps:
            break

    LOGGER.info('===========================================')
    if global_step % opts.valid_steps != 0:
        LOGGER.info('===========================================')
        LOGGER.info(f"Step {global_step}: start running validation")
        validate(model, val_dataloaders, opts)
        LOGGER.info('===========================================')
        model_saver.save(model, global_step)
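# All of the training loops in this file share one gradient-accumulation
# schedule; condensed here as a pure-PyTorch stand-in (the real loops add
# apex.amp loss scaling via delay_unscale and a Horovod all-reduce at the
# same boundary):
import torch
import torch.nn.functional as F

def train_with_accumulation(model, optimizer, loader, accum_steps):
    optimizer.zero_grad()
    for step, (x, y) in enumerate(loader):
        loss = F.mse_loss(model(x), y)
        # accumulate: scale so the summed grads match one large batch
        (loss / accum_steps).backward()
        if (step + 1) % accum_steps == 0:  # update only at the boundary
            optimizer.step()
            optimizer.zero_grad()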
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu,
                                              hvd.rank(), opts.fp16))
    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))
    set_random_seed(opts.seed)

    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    all_dbs = [db for datasets in [opts.train_datasets, opts.val_datasets]
               for dset in datasets for db in dset['db']]
    tokenizer = json.load(open(f'{all_dbs[0]}/meta.json'))['bert']

    # build data loaders
    train_dataloaders, all_img_dbs = create_dataloaders(
        opts.train_datasets, True, opts)
    val_dataloaders, _ = create_dataloaders(
        opts.val_datasets, False, opts, all_img_dbs)
    meta_loader = MetaLoader(train_dataloaders,
                             accum_steps=opts.gradient_accumulation_steps,
                             distributed=n_gpu > 1)
    meta_loader = PrefetchLoader(meta_loader)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    if opts.rename_checkpoints:
        rename_checkpoint(checkpoint)
    # include early adaptation
    if opts.early_adaptation:
        early_adaptation_checkpoint = torch.load(
            opts.early_adaptation_checkpoint)
        checkpoint['roberta.img_embeddings.img_linear.weight'] = \
            early_adaptation_checkpoint['v2w_linear.weight']
        checkpoint['roberta.img_embeddings.img_linear.bias'] = \
            early_adaptation_checkpoint['v2w_linear.bias']
    model = VLXLMRForPretraining.from_pretrained(
        opts.model_config, checkpoint,
        img_dim=IMG_DIM, img_label_dim=IMG_LABEL_DIM,
        nce_temp=opts.nce_temp, ot_pos_only=opts.ot_pos_only)
    model.pad_vocab()  # tensor core padding for vocabulary
    model.to(device)
    model.train()
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())}
    model, optimizer = amp.initialize(model, optimizer,
                                      num_losses=len(task2scaler),
                                      enabled=opts.fp16, opt_level='O2')

    # initialize the TrainingRestorer to resume interrupted runs
    restorer = TrainingRestorer(opts, model, optimizer)
    global_step = restorer.global_step
    TB_LOGGER._global_step = global_step
    if hvd.rank() != 0:
        restorer = NoOp()
    if global_step > 0:
        pbar.update(global_step)

    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    # to compute training statistics
    task2loss = {task: RunningMeter(f'loss/{task}')
                 for task in train_dataloaders.keys()}
    # ITM w/ OT
    if opts.itm_ot_lambda > 0:
        for task in train_dataloaders.keys():
            if task.startswith('itm'):
                task2loss[f'{task}_xe'] = RunningMeter(f'loss/{task}_xe')
                task2loss[f'{task}_ot'] = RunningMeter(f'loss/{task}_ot')
                if not opts.ot_pos_only:
                    task2loss[f'{task}_ot_pos'] = RunningMeter(
                        f'loss/{task}_ot_pos')
                    task2loss[f'{task}_ot_neg'] = RunningMeter(
                        f'loss/{task}_ot_neg')

    n_examples = defaultdict(int)
    n_in_units = defaultdict(int)
    n_loss_units = defaultdict(int)
    n_neg_nce = defaultdict(int)
    grad_norm = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()

    for step, (name, batch) in enumerate(meta_loader):
        # forward pass
        assert all(name == n for n in all_gather_list(name))
        n_examples[name] += batch['input_ids'].size(0)
        n_in_units[name] += (batch['attn_masks'] == 1).sum().item()
        if 'nce' in name:
            n_neg_nce[name] += batch['neg_feats'].size(0)
        task = name.split('_')[0]
        loss = model(batch, task=task, compute_loss=True)
        if task.startswith('itm'):
            # OT
            itm_loss, ot_loss = loss
            n_loss_units[name] += itm_loss.size(0)
            itm_loss = itm_loss.mean()
            if ot_loss is not None:
                if not opts.ot_pos_only:
                    ot_pos, ot_neg = ot_loss
                    ot_loss = (ot_pos.sum() - ot_neg.sum()
                               ) / (ot_pos.size(0) + ot_neg.size(0))
                    # NOTE: beware of empty tensor
                    ot_pos = ot_pos.mean().item()
                    if not math.isnan(ot_pos):
                        task2loss[f'{name}_ot_pos'](ot_pos)
                    ot_neg = ot_neg.mean().item()
                    if not math.isnan(ot_neg):
                        task2loss[f'{name}_ot_neg'](ot_neg)
                else:
                    ot_loss = ot_loss.mean()
                loss = itm_loss + opts.itm_ot_lambda * ot_loss
                task2loss[f'{name}_xe'](itm_loss.item())
                task2loss[f'{name}_ot'](ot_loss.item())
            else:
                loss = itm_loss
        elif task.startswith('vmlm-soft'):
            loss = 1000 * loss.mean()
        else:
            n_loss_units[name] += loss.size(0)
            loss = loss.mean()  # loss is not normalized in model

        # backward pass
        delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
        with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale,
                            loss_id=task2scaler[name]) as scaled_loss:
            scaled_loss.backward()
            if not delay_unscale:
                # gather gradients from every processes
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [p.grad.data for p in model.parameters()
                         if p.requires_grad and p.grad is not None]
                all_reduce_and_rescale_tensors(grads, float(1))
        task2loss[name](loss.item())

        # optimizer update and logging
        if (step + 1) % opts.gradient_accumulation_steps == 0:
            global_step += 1
            # learning rate scheduling
            lr_this_step = get_lr_sched(global_step, opts)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            TB_LOGGER.add_scalar('lr', lr_this_step, global_step)
            # log loss
            TB_LOGGER.log_scaler_dict({l.name: l.val
                                       for l in task2loss.values()
                                       if l.val is not None})
            TB_LOGGER.step()

            # update model params
            if opts.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            opts.grad_norm)
                TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
            optimizer.step()
            optimizer.zero_grad()
            pbar.update(1)

            if global_step % 100 == 0:
                # monitor training throughput
                LOGGER.info(f'==============Step {global_step}===============')
                for t in train_dataloaders.keys():
                    assert all(tt == t for tt in all_gather_list(t))
                    tot_ex = sum(all_gather_list(n_examples[t]))
                    ex_per_sec = int(tot_ex / (time() - start))
                    tot_in = sum(all_gather_list(n_in_units[t]))
                    in_per_sec = int(tot_in / (time() - start))
                    tot_l = sum(all_gather_list(n_loss_units[t]))
                    l_per_sec = int(tot_l / (time() - start))
                    LOGGER.info(f'{t}: {tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec,
                                         global_step)
                    TB_LOGGER.add_scalar(f'perf/{t}_in_per_s', in_per_sec,
                                         global_step)
                    TB_LOGGER.add_scalar(f'perf/{t}_loss_per_s', l_per_sec,
                                         global_step)
                    if 'nce' in t:
                        avg_neg = sum(all_gather_list(n_neg_nce[t])
                                      ) / hvd.size() // step
                        LOGGER.info(f'{t}: averaging '
                                    f'{avg_neg} negative samples')
                LOGGER.info('===============================================')

            if global_step % opts.valid_steps == 0:
                LOGGER.info(f'Step {global_step}: start validation')
                validate(model, val_dataloaders)
                model_saver.save(model, global_step, optimizer)
            restorer.step()
        if global_step >= opts.num_train_steps:
            break

    if global_step % opts.valid_steps != 0:
        LOGGER.info(f'Step {global_step}: start validation')
        validate(model, val_dataloaders)
        model_saver.save(model, global_step)
def build_downstream_dataloaders(tasks, video_db, video_ids, is_train,
                                 opts, q_txt_db=None, shuffle=False):
    dataloaders = {}
    assert q_txt_db is not None
    for i, task in enumerate(tasks):
        if is_train:
            LOGGER.info(f"Loading {task} train dataset "
                        f"{video_db.img_db.img_dir}")
            batch_size = opts.train_batch_size
        else:
            batch_size = opts.val_batch_size
            LOGGER.info(f"Loading {task} validation dataset "
                        f"{video_db.img_db.img_dir}")
        if task in ["tvqa", "how2qa"]:
            if is_train:
                dataset = VideoQaDataset(video_ids, video_db, q_txt_db)
                collate_fn = video_qa_collate
            else:
                dataset = VideoQaEvalDataset(video_ids, video_db, q_txt_db)
                collate_fn = video_qa_eval_collate
        elif task in ["tvr", "how2r", "didemo_video_sub"]:
            if is_train:
                dataset = VcmrDataset(video_ids, video_db, q_txt_db)
                collate_fn = vcmr_collate
            else:
                dataset = VcmrEvalDataset(video_ids, video_db, q_txt_db)
                collate_fn = vcmr_eval_collate
        elif task == "didemo_video_only":
            if is_train:
                dataset = VcmrVideoOnlyDataset(video_ids, video_db, q_txt_db)
                collate_fn = vcmr_collate
            else:
                dataset = VcmrVideoOnlyEvalDataset(video_ids, video_db,
                                                   q_txt_db)
                collate_fn = vcmr_eval_collate
        elif task == "msrvtt_video_only":
            if is_train:
                dataset = VrVideoOnlyDataset(video_ids, video_db, q_txt_db)
                collate_fn = vr_collate
            else:
                dataset = VrVideoOnlyEvalDataset(video_ids, video_db,
                                                 q_txt_db)
                collate_fn = vr_eval_collate
        elif task == "msrvtt_video_sub":
            if is_train:
                dataset = VrDataset(video_ids, video_db, q_txt_db)
                collate_fn = vr_collate
            else:
                dataset = VrEvalDataset(video_ids, video_db, q_txt_db)
                collate_fn = vr_eval_collate
        elif task == "violin":
            if is_train:
                dataset = ViolinDataset(video_ids, video_db, q_txt_db)
                collate_fn = violin_collate
            else:
                dataset = ViolinEvalDataset(video_ids, video_db, q_txt_db)
                collate_fn = violin_eval_collate
        else:
            raise ValueError(f'Undefined task {task}')
        LOGGER.info(f"{sum(all_gather_list(len(dataset)))} samples loaded")

        loader = DataLoader(dataset, batch_size=batch_size,
                            num_workers=opts.n_workers,
                            pin_memory=opts.pin_mem,
                            collate_fn=collate_fn, shuffle=shuffle)
        if is_train:
            ratio = 1
            dataloaders[task] = (loader, ratio)
        else:
            dataloaders[task] = PrefetchLoader(loader)
    return dataloaders
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu,
                                              hvd.rank(), opts.fp16))
    if hvd.rank() != 0:
        LOGGER.disabled = True

    hps_file = f'{opts.output_dir}/log/hps.json'
    model_opts = Struct(load_json(hps_file))
    model_config = f'{opts.output_dir}/log/model_config.json'

    # load DBs and image dirs
    video_ids = get_video_ids(opts.query_txt_db)
    if opts.task != "didemo_video_only":
        video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                          model_opts.vfeat_interval,
                                          model_opts)
    else:
        txt_meta = load_json(os.path.join(opts.query_txt_db, "meta.json"))
        video_db = load_video_only_dataset(opts.vfeat_db, txt_meta,
                                           model_opts.vfeat_interval,
                                           model_opts)
    assert opts.split in opts.query_txt_db
    q_txt_db = QueryTokLmdb(opts.query_txt_db, -1)
    if opts.task != "didemo_video_only":
        inf_dataset = VcmrFullEvalDataset
    else:
        inf_dataset = VcmrVideoOnlyFullEvalDataset
    eval_dataset = inf_dataset(video_ids, video_db, q_txt_db,
                               distributed=model_opts.distributed_eval)

    # Prepare model
    if exists(opts.checkpoint):
        ckpt_file = opts.checkpoint
    else:
        ckpt_file = f'{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt'
    checkpoint = torch.load(ckpt_file)
    img_pos_embed_weight_key = ("v_encoder.f_encoder.img_embeddings"
                                ".position_embeddings.weight")
    assert img_pos_embed_weight_key in checkpoint
    max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])

    model = HeroForVcmr.from_pretrained(
        model_config,
        state_dict=checkpoint,
        vfeat_dim=VFEAT_DIM,
        max_frm_seq_len=max_frm_seq_len,
        lw_neg_ctx=model_opts.lw_neg_ctx,
        lw_neg_q=model_opts.lw_neg_q, lw_st_ed=0,
        ranking_loss_type=model_opts.ranking_loss_type,
        use_hard_negative=False,
        hard_pool_size=model_opts.hard_pool_size,
        margin=model_opts.margin,
        use_all_neg=model_opts.use_all_neg,
        drop_svmr_prob=model_opts.drop_svmr_prob)
    model.to(device)
    if opts.fp16:
        model = amp.initialize(model, enabled=opts.fp16, opt_level='O2')

    eval_dataloader = DataLoader(eval_dataset, batch_size=opts.batch_size,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 collate_fn=vcmr_full_eval_collate)
    eval_dataloader = PrefetchLoader(eval_dataloader)

    _, results = validate_full_vcmr(model, eval_dataloader, opts.split,
                                    opts, model_opts)
    result_dir = f'{opts.output_dir}/results_{opts.split}'
    if not exists(result_dir) and rank == 0:
        os.makedirs(result_dir)

    all_results_list = all_gather_list(results)
    if hvd.rank() == 0:
        # save for only one time
        all_results = {"video2idx": all_results_list[0]["video2idx"]}
        for rank_id in range(hvd.size()):
            for key, val in all_results_list[rank_id].items():
                if key == "video2idx":
                    continue
                if key not in all_results:
                    all_results[key] = []
                all_results[key].extend(all_results_list[rank_id][key])
        LOGGER.info('All results joined......')
        save_vcmr(all_results,
                  f'{result_dir}/results_{opts.checkpoint}_'
                  f'{opts.split}_vcmr.json')
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    opts.n_gpu = n_gpu
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), opts.fp16))
    if hvd.rank() != 0:
        LOGGER.disabled = True
    set_random_seed(opts.seed)

    LOGGER.info(f"Loading the whole video dataset {opts.sub_txt_db}, "
                f"{opts.vfeat_db}")
    if opts.task != "didemo_video_only":
        video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                          opts.vfeat_interval, opts)
    else:
        txt_meta = load_json(join(opts.train_query_txt_db, "meta.json"))
        video_db = load_video_only_dataset(opts.vfeat_db, txt_meta,
                                           opts.vfeat_interval, opts)

    # data loaders
    # train
    video_ids = get_video_ids(opts.train_query_txt_db)
    train_q_txt_db = QueryTokLmdb(opts.train_query_txt_db, opts.max_txt_len)
    train_dataloaders = build_downstream_dataloaders(
        [opts.task], video_db, video_ids, True, opts,
        shuffle=True, q_txt_db=train_q_txt_db)
    meta_loader = MetaLoader(train_dataloaders,
                             accum_steps=opts.gradient_accumulation_steps,
                             distributed=n_gpu > 1)
    meta_loader = PrefetchLoader(meta_loader)

    # val
    video_ids = get_video_ids(opts.val_query_txt_db)
    val_q_txt_db = QueryTokLmdb(opts.val_query_txt_db, -1)
    val_dataloaders = build_downstream_dataloaders(
        [opts.task], video_db, video_ids, False, opts,
        q_txt_db=val_q_txt_db)

    if opts.task != "didemo_video_only":
        inf_dataset = VcmrFullEvalDataset
    else:
        inf_dataset = VcmrVideoOnlyFullEvalDataset
    LOGGER.info(f"Loading Inference Dataset {opts.val_query_txt_db} (val)")
    val_dset = inf_dataset(video_ids, video_db, val_q_txt_db,
                           distributed=opts.distributed_eval)
    inf_loader_val = DataLoader(val_dset,
                                batch_size=opts.vcmr_eval_q_batch_size,
                                num_workers=opts.n_workers,
                                pin_memory=opts.pin_mem,
                                collate_fn=vcmr_full_eval_collate)
    inf_loader_val = PrefetchLoader(inf_loader_val)
    if opts.test_query_txt_db:
        LOGGER.info(
            f"Loading Inference Dataset {opts.test_query_txt_db} (test)")
        video_ids = get_video_ids(opts.test_query_txt_db)
        test_q_txt_db = QueryTokLmdb(opts.test_query_txt_db, -1)
        test_dset = inf_dataset(video_ids, video_db, test_q_txt_db,
                                distributed=opts.distributed_eval)
        inf_loader_test = DataLoader(test_dset,
                                     batch_size=opts.vcmr_eval_q_batch_size,
                                     num_workers=opts.n_workers,
                                     pin_memory=opts.pin_mem,
                                     collate_fn=vcmr_full_eval_collate)
        inf_loader_test = PrefetchLoader(inf_loader_test)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    img_pos_embed_weight_key = ("v_encoder.f_encoder.img_embeddings"
                                ".position_embeddings.weight")
    if img_pos_embed_weight_key in checkpoint:
        max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])
    else:
        max_frm_seq_len = MAX_FRM_SEQ_LEN

    model = HeroForVcmr.from_pretrained(
        opts.model_config,
        state_dict=checkpoint,
        vfeat_dim=VFEAT_DIM,
        max_frm_seq_len=max_frm_seq_len,
        lw_neg_ctx=opts.lw_neg_ctx,
        lw_neg_q=opts.lw_neg_q,
        lw_st_ed=0,
        ranking_loss_type=opts.ranking_loss_type,
        use_hard_negative=False,
        hard_pool_size=opts.hard_pool_size,
        margin=opts.margin,
        use_all_neg=opts.use_all_neg,
        drop_svmr_prob=opts.drop_svmr_prob)
    model.to(device)
    # make sure every process has the same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())}
    model, optimizer = amp.initialize(model, optimizer,
                                      num_losses=len(task2scaler),
                                      enabled=opts.fp16, opt_level='O2')
    restorer = TrainingRestorer(opts, model, optimizer)
    global_step = restorer.global_step
    TB_LOGGER.global_step = global_step
    if hvd.rank() == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        if not exists(join(opts.output_dir, 'results')):
            # store tvr predictions
            os.makedirs(join(opts.output_dir, 'results'))
        if opts.nms_thd != -1:
            # store tvr-nms predictions
            if not exists(join(opts.output_dir, 'results_nms')):
                os.makedirs(join(opts.output_dir, 'results_nms'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        pbar = NoOp()
        model_saver = NoOp()
        restorer = NoOp()

    if global_step > 0:
        pbar.update(global_step)
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    task2loss = {task: RunningMeter(f'loss/{task}')
                 for task in train_dataloaders.keys()}
    for obj in (f'{opts.task}_st_ed', f'{opts.task}_neg_ctx',
                f'{opts.task}_neg_q'):
        task2loss[obj] = RunningMeter(f'loss/{obj}')

    model.train()
    n_examples = defaultdict(int)
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    if global_step == 0:
        optimizer.step()
    for step, (task, batch) in enumerate(meta_loader):
        # NOTE: "negtiave" follows the spelling of the existing config key
        if len(opts.hard_negtiave_start_step) > 0:
            for i, hn_step in enumerate(opts.hard_negtiave_start_step):
                if global_step >= hn_step and hn_step != -1:
                    model.set_hard_negative(True, opts.hard_pool_size[i],
                                            opts.hard_neg_weights[i])
        if (opts.train_span_start_step != -1
                and global_step >= opts.train_span_start_step):
            model.set_train_st_ed(opts.lw_st_ed)

        n_examples[task] += opts.train_batch_size

        loss = model(batch, task=task, compute_loss=True)
        loss_st_ed, loss_neg_ctx, loss_neg_q = loss
        loss = loss_st_ed + loss_neg_ctx + loss_neg_q
        for n, ls, w in (('st_ed', loss_st_ed, opts.lw_st_ed),
                         ('neg_ctx', loss_neg_ctx, opts.lw_neg_ctx),
                         ('neg_q', loss_neg_q, opts.lw_neg_q)):
            ls = ls.item()
            if w:
                ls /= w
            task2loss[f'{task}_{n}'](ls)
        loss = loss.mean()
        task2loss[task](loss.item())

        delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
        with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale,
                            loss_id=task2scaler[task]) as scaled_loss:
            scaled_loss.backward()
            if not delay_unscale:
                # gather gradients from every process
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [p.grad.data for p in model.parameters()
                         if p.requires_grad and p.grad is not None]
                all_reduce_and_rescale_tensors(grads, float(1))

        if (step + 1) % opts.gradient_accumulation_steps == 0:
            global_step += 1

            # learning rate scheduling
            lr_this_step = get_lr_sched(global_step, opts)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

            # log loss
            TB_LOGGER.log_scaler_dict({
                temp_loss.name: temp_loss.val
                for temp_loss in task2loss.values()
                if temp_loss.val is not None})
            TB_LOGGER.step()

            # update model params
            if opts.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            opts.grad_norm)
                TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
            optimizer.step()
            optimizer.zero_grad()
            pbar.update(1)

            if global_step % 100 == 0:
                # monitor training throughput
                LOGGER.info('-------------------------------------------')
                LOGGER.info(f'Step {global_step}:')
                for t in train_dataloaders.keys():
                    tot_ex = sum(all_gather_list(n_examples[t]))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'{t}: {tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec,
                                         global_step)

            if global_step % opts.valid_steps == 0:
                LOGGER.info('===========================================')
                LOGGER.info(f"Step {global_step}: start running validation")
                validate(model, val_dataloaders, opts)
                if hvd.rank() == 0 or opts.distributed_eval:
                    log, results = validate_full_vcmr(
                        model, inf_loader_val, 'val', opts, model_opts=opts)
                    save_json(
                        results,
                        f'{opts.output_dir}/results/'
                        f'val_results_{global_step}_rank{hvd.rank()}.json')
                    TB_LOGGER.log_scaler_dict(log)
                    if opts.test_query_txt_db:
                        log, results = validate_full_vcmr(
                            model, inf_loader_test, 'test', opts,
                            model_opts=opts)
                        save_json(
                            results,
                            f'{opts.output_dir}/results/'
                            f'test_results_{global_step}_rank{hvd.rank()}.json')
                        TB_LOGGER.log_scaler_dict(log)
                LOGGER.info('===========================================')
                model_saver.save(model, global_step)

            # step restorer in the end to prevent missing validation checkpoint
            restorer.step()
        if global_step >= opts.num_train_steps:
            break

    LOGGER.info('===========================================')
    if global_step % opts.valid_steps != 0:
        if hvd.rank() == 0 or opts.distributed_eval:
            log, results = validate_full_vcmr(
                model, inf_loader_val, 'val', opts, model_opts=opts)
            save_json(
                results,
                f'{opts.output_dir}/results/'
                f'val_results_{global_step}'
                f'_rank{hvd.rank()}_final.json')
            TB_LOGGER.log_scaler_dict(log)
            if opts.test_query_txt_db:
                log, results = validate_full_vcmr(
                    model, inf_loader_test, 'test', opts, model_opts=opts)
                save_json(
                    results,
                    f'{opts.output_dir}/results/'
                    f'test_results_{global_step}_rank{hvd.rank()}.json')
                TB_LOGGER.log_scaler_dict(log)
        model_saver.save(model, f'{global_step}_final')
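# The loop above relies on get_lr_sched(global_step, opts), which is not
# shown in this file. A minimal sketch of a linear warmup + linear decay
# schedule, assuming opts carries learning_rate, warmup_steps and
# num_train_steps (hypothetical names if the real config differs):
def get_lr_sched_sketch(global_step, opts):
    if global_step < opts.warmup_steps:
        # linear warmup from 0 to the peak learning rate
        return opts.learning_rate * global_step / opts.warmup_steps
    # linear decay from the peak down to 0 at num_train_steps
    progress = ((global_step - opts.warmup_steps)
                / max(1, opts.num_train_steps - opts.warmup_steps))
    return max(0.0, opts.learning_rate * (1 - progress))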
def create_dataloaders(datasets, is_train, opts, all_img_dbs=None):
    if all_img_dbs is None:
        all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb,
                                     opts.num_bb, opts.compressed_db)
    dataloaders = {}
    for dset in datasets:
        for vcr_task in ["qa", "qar"]:
            if is_train:
                assert len(dset['db']) == len(dset['img'])
                assert len(dset['tasks']) == len(dset['mix_ratio'])
                img_db, img_db_gt = [], []
                for img_path in dset['img']:
                    curr_img_db, curr_img_db_gt = load_img_feat(
                        img_path, all_img_dbs, opts)
                    img_db.append(curr_img_db)
                    img_db_gt.append(curr_img_db_gt)
            else:
                assert len(dset['db']) == len(dset['img']) == 1
                img_db, img_db_gt = load_img_feat(dset['img'][0],
                                                  all_img_dbs, opts)

            for i, t in enumerate(dset['tasks']):
                task = f'{t}_{dset["name"]}'
                if is_train:
                    LOGGER.info(
                        f"Loading {task} train dataset with vcr_{vcr_task}, "
                        f"{dset['db']}, {[img.img_dir for img in img_db]}, "
                        f"{[img.img_dir for img in img_db_gt]}")
                    txt_db = [VcrTxtTokLmdb(path, opts.max_txt_len,
                                            task=vcr_task)
                              for path in dset['db']]
                else:
                    LOGGER.info(
                        f"Loading {task} val dataset with vcr_{vcr_task}, "
                        f"{dset['db']}, {img_db.img_dir}, "
                        f"{img_db_gt.img_dir}")
                    txt_db = VcrTxtTokLmdb(dset['db'][0], -1, task=vcr_task)

                if task.startswith('mlm'):
                    dataset = build_mlm_dataset(txt_db, img_db_gt, img_db,
                                                is_train, opts)
                elif task.startswith('mrfr'):
                    dataset = build_mrfr_dataset(txt_db, img_db_gt, img_db,
                                                 is_train, opts)
                elif task.startswith('mrc'):
                    dataset = build_mrc_dataset(txt_db, img_db_gt, img_db,
                                                is_train, opts)
                else:
                    raise ValueError(f'Undefined task {task}')
                LOGGER.info(f"{len(dataset[0])*hvd.size()} samples loaded")

                loader = build_dataloader(*dataset, is_train, opts)
                if is_train:
                    ratio = dset['mix_ratio'][i]
                    dataloaders[task] = (loader, ratio)
                else:
                    dataloaders[task] = PrefetchLoader(loader)
    return dataloaders, all_img_dbs
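# create_dataloaders() returns {task: (loader, mix_ratio)} for training; the
# MetaLoader consuming it samples tasks in proportion to those ratios. A
# simplified sketch of that weighted task sampling (illustrative only, not
# the project's MetaLoader, and ignoring distributed synchronization):
import random

def iterate_mixed_tasks(name2loader_ratio):
    iters = {name: iter(loader)
             for name, (loader, _) in name2loader_ratio.items()}
    names = list(name2loader_ratio.keys())
    weights = [ratio for _, ratio in name2loader_ratio.values()]
    while True:
        name = random.choices(names, weights=weights, k=1)[0]
        try:
            batch = next(iters[name])
        except StopIteration:
            # restart an exhausted loader so training can run indefinitely
            iters[name] = iter(name2loader_ratio[name][0])
            batch = next(iters[name])
        yield name, batch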
def build_target_loaders(target, tgt_ratio, opts):
    if 'vfeat_shards' in target:
        sub_txt_db = SubTokLmdb(f"{opts.txt_db}/{target['sub_txt_db']}",
                                opts.max_clip_len)
        video_db = [load_video_sub_dataset(f"{target['vfeat_db']}/{shard}",
                                           sub_txt_db,
                                           target['vfeat_interval'], opts)
                    for shard in target['vfeat_shards']]
    else:
        # load_video_sub_dataset is defined in data/data.py
        video_db = load_video_sub_dataset(
            f"{opts.img_db}/{target['vfeat_db']}",    # e.g. /video/tv/
            f"{opts.txt_db}/{target['sub_txt_db']}",  # e.g. /txt/tv_subtitles.db
            target['vfeat_interval'], opts)

    train_loaders = {}
    val_loaders = {}
    for split in target['splits']:
        if 'ratio' not in split:
            split['ratio'] = [1] * len(split['tasks'])
        assert len(split['tasks']) == len(split['ratio'])
        for task, r in zip(split['tasks'], split['ratio']):
            name = f"{task}_{target['name']}_{split['name']}"
            LOGGER.info(f'loading {name} ...')
            ratio = tgt_ratio * r
            if isinstance(video_db, list):
                all_train_ids = [json.load(open(f"{opts.txt_db}/{ids}"))
                                 for ids in split['train_idx']]
            else:
                train_ids = json.load(
                    open(f"{opts.txt_db}/{split['train_idx']}"))
            val_ids = json.load(open(f"{opts.txt_db}/{split['val_idx']}"))

            if task == 'mlm':
                if isinstance(video_db, list):
                    train_dset = ConcatDataset([
                        VideoMlmDataset(ids, vid_db, opts.mask_prob,
                                        sub_ctx_len=opts.sub_ctx_len)
                        for ids, vid_db in zip(all_train_ids, video_db)])
                    val_dset = VideoMlmDataset(val_ids, video_db[0],
                                               opts.mask_prob,
                                               sub_ctx_len=opts.sub_ctx_len)
                else:
                    train_dset = VideoMlmDataset(train_ids, video_db,
                                                 opts.mask_prob,
                                                 sub_ctx_len=opts.sub_ctx_len)
                    val_dset = VideoMlmDataset(val_ids, video_db,
                                               opts.mask_prob,
                                               sub_ctx_len=opts.sub_ctx_len)
                train_collate = mlm_collate
                val_collate = mlm_collate
            elif task in ('mfm-nce', 'mffr'):
                if isinstance(video_db, list):
                    train_dset = ConcatDataset([
                        MfmDataset(ids, vid_db, opts.mask_prob)
                        for ids, vid_db in zip(all_train_ids, video_db)])
                    val_dset = MfmDataset(val_ids, video_db[0],
                                          opts.mask_prob)
                else:
                    train_dset = MfmDataset(train_ids, video_db,
                                            opts.mask_prob)
                    val_dset = MfmDataset(val_ids, video_db, opts.mask_prob)
                train_collate = mfm_collate
                val_collate = mfm_collate
            elif task == 'fom':
                if isinstance(video_db, list):
                    train_dset = ConcatDataset([
                        FomDataset(ids, vid_db, opts.mask_prob)
                        for ids, vid_db in zip(all_train_ids, video_db)])
                    val_dset = FomEvalDataset(val_ids, video_db[0],
                                              opts.mask_prob)
                else:
                    train_dset = FomDataset(train_ids, video_db,
                                            opts.mask_prob)
                    val_dset = FomEvalDataset(val_ids, video_db,
                                              opts.mask_prob)
                train_collate = fom_collate
                val_collate = fom_eval_collate
            elif task == 'vsm':
                if isinstance(video_db, list):
                    train_dset = ConcatDataset([
                        VsmDataset(ids, vid_db, sub_ctx_len=opts.sub_ctx_len)
                        for ids, vid_db in zip(all_train_ids, video_db)])
                    val_dset = VsmDataset(val_ids, video_db[0],
                                          sub_ctx_len=opts.sub_ctx_len)
                else:
                    train_dset = VsmDataset(train_ids, video_db,
                                            sub_ctx_len=opts.sub_ctx_len)
                    val_dset = VsmDataset(val_ids, video_db,
                                          sub_ctx_len=opts.sub_ctx_len)
                train_collate = vsm_collate
                val_collate = vsm_collate
            else:
                raise ValueError(f'undefined task {task}')

            train_loader = DataLoader(train_dset,
                                      batch_size=opts.train_batch_size,
                                      num_workers=opts.n_workers,
                                      pin_memory=opts.pin_mem,
                                      collate_fn=train_collate,
                                      shuffle=True)
            val_loader = DataLoader(val_dset,
                                    batch_size=opts.val_batch_size,
                                    num_workers=opts.n_workers,
                                    pin_memory=opts.pin_mem,
                                    collate_fn=val_collate,
                                    shuffle=False)
            train_loaders[name] = (train_loader, ratio)
            val_loaders[name] = PrefetchLoader(val_loader)
    return train_loaders, val_loaders
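# A usage sketch for build_target_loaders(): merge the per-target loaders
# into the flat dicts consumed by MetaLoader and the validation loop. The
# 'targets' list and its 'mix_ratio' field are assumptions about the
# pretrain config, not confirmed by this file:
def build_all_target_loaders(targets, opts):
    train_loaders, val_loaders = {}, {}
    for target in targets:
        t_train, t_val = build_target_loaders(
            target, target.get('mix_ratio', 1), opts)
        train_loaders.update(t_train)
        val_loaders.update(t_val)
    return train_loaders, val_loaders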
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), opts.fp16))

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    all_dbs = [db for datasets in [opts.train_datasets, opts.val_datasets]
               for dset in datasets for db in dset['db']]
    tokenizer = json.load(open(f'{all_dbs[0]}/meta.json'))['bert']
    assert all(tokenizer == json.load(open(f'{db}/meta.json'))['bert']
               for db in all_dbs)

    # build data loaders
    train_dataloaders, all_img_dbs = create_dataloaders(
        opts.train_datasets, True, opts)
    val_dataloaders, _ = create_dataloaders(
        opts.val_datasets, False, opts, all_img_dbs)
    meta_loader = MetaLoader(train_dataloaders,
                             accum_steps=opts.gradient_accumulation_steps,
                             distributed=n_gpu > 1)
    meta_loader = PrefetchLoader(meta_loader)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    model = UniterForPretraining.from_pretrained(
        opts.model_config, checkpoint,
        img_dim=IMG_DIM, img_label_dim=IMG_LABEL_DIM)
    model.to(device)
    model.train()
    # make sure every process has the same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())}
    model, optimizer = amp.initialize(model, optimizer,
                                      num_losses=len(task2scaler),
                                      enabled=opts.fp16, opt_level='O2')

    global_step = 0
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    # to compute training statistics
    task2loss = {task: RunningMeter(f'loss/{task}')
                 for task in train_dataloaders.keys()}
    # ITM w/ OT
    if opts.itm_ot_lambda > 0:
        for task in train_dataloaders.keys():
            if task.startswith('itm'):
                task2loss[f'{task}_xe'] = RunningMeter(f'loss/{task}_xe')
                task2loss[f'{task}_ot'] = RunningMeter(f'loss/{task}_ot')
                task2loss[f'{task}_ot_pos'] = RunningMeter(
                    f'loss/{task}_ot_pos')
                task2loss[f'{task}_ot_neg'] = RunningMeter(
                    f'loss/{task}_ot_neg')

    n_examples = defaultdict(int)
    n_in_units = defaultdict(int)
    n_loss_units = defaultdict(int)
    grad_norm = 0

    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    for step, (name, batch) in enumerate(meta_loader):
        # forward pass
        n_examples[name] += batch['input_ids'].size(0)
        n_in_units[name] += (batch['attn_masks'] == 1).sum().item()
        task = name.split('_')[0]
        loss = model(batch, task=task, compute_loss=True)
        if task.startswith('itm'):
            # OT
            itm_loss, ot_loss = loss
            n_loss_units[name] += itm_loss.size(0)
            itm_loss = itm_loss.mean()
            if ot_loss is not None:
                ot_pos, ot_neg = ot_loss
                ot_loss = (ot_pos.sum() - ot_neg.sum()
                           ) / (ot_pos.size(0) + ot_neg.size(0))

                # NOTE: beware of empty tensor
                ot_pos = ot_pos.mean().item()
                if not math.isnan(ot_pos):
                    task2loss[f'{name}_ot_pos'](ot_pos)
                ot_neg = ot_neg.mean().item()
                if not math.isnan(ot_neg):
                    task2loss[f'{name}_ot_neg'](ot_neg)

                loss = itm_loss + opts.itm_ot_lambda * ot_loss
                task2loss[f'{name}_xe'](itm_loss.item())
                task2loss[f'{name}_ot'](ot_loss.item())
            else:
                loss = itm_loss
        else:
            n_loss_units[name] += loss.size(0)
            loss = loss.mean()  # loss is not normalized in model

        # backward pass
        delay_unscale = (step+1) % opts.gradient_accumulation_steps != 0
        with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale,
                            loss_id=task2scaler[name]) as scaled_loss:
            scaled_loss.backward()
            if not delay_unscale:
                # gather gradients from every process
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [p.grad.data for p in model.parameters()
                         if p.requires_grad and p.grad is not None]
                all_reduce_and_rescale_tensors(grads, float(1))
        task2loss[name](loss.item())

        # optimizer update and logging
        if (step + 1) % opts.gradient_accumulation_steps == 0:
            global_step += 1

            # learning rate scheduling
            lr_this_step = get_lr_sched(global_step, opts)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

            # log loss
            # NOTE: not gathered across GPUs for efficiency
            TB_LOGGER.log_scaler_dict({ll.name: ll.val
                                       for ll in task2loss.values()
                                       if ll.val is not None})
            TB_LOGGER.step()

            # update model params
            if opts.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            opts.grad_norm)
                TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
            optimizer.step()
            optimizer.zero_grad()
            pbar.update(1)

            if global_step % 100 == 0:
                # monitor training throughput
                LOGGER.info(f'==============Step {global_step}===============')
                for t in train_dataloaders.keys():
                    assert all(tt == t for tt in all_gather_list(t))
                    tot_ex = sum(all_gather_list(n_examples[t]))
                    ex_per_sec = int(tot_ex / (time()-start))
                    tot_in = sum(all_gather_list(n_in_units[t]))
                    in_per_sec = int(tot_in / (time()-start))
                    tot_l = sum(all_gather_list(n_loss_units[t]))
                    l_per_sec = int(tot_l / (time()-start))
                    LOGGER.info(f'{t}: {tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec,
                                         global_step)
                    TB_LOGGER.add_scalar(f'perf/{t}_in_per_s', in_per_sec,
                                         global_step)
                    TB_LOGGER.add_scalar(f'perf/{t}_loss_per_s', l_per_sec,
                                         global_step)
                LOGGER.info('===============================================')

            if global_step % opts.valid_steps == 0:
                LOGGER.info(f'Step {global_step}: start validation')
                validate(model, val_dataloaders)
                model_saver.save(model, global_step)
        if global_step >= opts.num_train_steps:
            break
    if global_step % opts.valid_steps != 0:
        LOGGER.info(f'Step {global_step}: start validation')
        validate(model, val_dataloaders)
        model_saver.save(model, global_step)
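# The losses above are tracked through RunningMeter objects defined
# elsewhere. A minimal sketch of such a meter, assuming exponential
# moving-average smoothing (the real implementation may smooth differently):
class RunningMeterSketch:
    def __init__(self, name, smooth=0.99):
        self.name = name
        self._smooth = smooth
        self.val = None

    def __call__(self, value):
        # first observation is stored as-is; later ones are EMA-smoothed
        if self.val is None:
            self.val = value
        else:
            self.val = self._smooth * self.val + (1 - self._smooth) * value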
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), opts.fp16))
    if rank != 0:
        LOGGER.disabled = True
    hps_file = f'{opts.output_dir}/log/hps.json'
    model_opts = Struct(json.load(open(hps_file)))
    assert opts.split in opts.img_db and opts.split in opts.txt_db

    # load DBs and image dirs
    eval_img_db, eval_img_db_gt = load_img_feat(opts.img_db, model_opts)
    eval_txt_db = VcrTxtTokLmdb(opts.txt_db, -1)
    eval_dataset = VcrEvalDataset("test", eval_txt_db, img_db=eval_img_db,
                                  img_db_gt=eval_img_db_gt)

    # Prepare model
    model = UniterForVisualCommonsenseReasoning.from_pretrained(
        f'{opts.output_dir}/log/model.json', state_dict={}, img_dim=IMG_DIM)
    model.init_type_embedding()
    model.init_word_embedding(NUM_SPECIAL_TOKENS)
    if exists(opts.checkpoint):
        ckpt_file = opts.checkpoint
    else:
        ckpt_file = f'{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt'
    checkpoint = torch.load(ckpt_file)
    state_dict = checkpoint.get('model_state', checkpoint)
    matched_state_dict = {}
    unexpected_keys = set()
    missing_keys = set()
    for name, param in model.named_parameters():
        missing_keys.add(name)
    for key, data in state_dict.items():
        if key in missing_keys:
            matched_state_dict[key] = data
            missing_keys.remove(key)
        else:
            unexpected_keys.add(key)
    LOGGER.info(f"Unexpected_keys: {list(unexpected_keys)}")
    LOGGER.info(f"Missing_keys: {list(missing_keys)}")
    model.load_state_dict(matched_state_dict, strict=False)
    model.to(device)
    if opts.fp16:
        model = amp.initialize(model, enabled=True, opt_level='O2')

    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=opts.batch_size,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 shuffle=False,
                                 collate_fn=vcr_eval_collate)
    eval_dataloader = PrefetchLoader(eval_dataloader)

    _, results = evaluate(model, eval_dataloader)
    result_dir = f'{opts.output_dir}/results_{opts.split}'
    if not exists(result_dir) and rank == 0:
        os.makedirs(result_dir)

    all_results = {}
    for id2res in all_gather_list(results):
        all_results.update(id2res)
    if hvd.rank() == 0:
        with open(f'{result_dir}/'
                  f'results_{opts.checkpoint}_all.json', 'w') as f:
            json.dump(all_results, f)
        probs_df = save_for_submission(
            f'{result_dir}/results_{opts.checkpoint}_all.json')
        probs_df.to_csv(f'{result_dir}/results_{opts.checkpoint}_all.csv')
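# The checkpoint key-matching block above reappears verbatim in the other
# VCR eval script below; a reusable helper capturing the same logic (a
# sketch, not part of the original codebase):
def load_matched_state_dict(model, state_dict, logger=None):
    model_keys = {name for name, _ in model.named_parameters()}
    matched = {k: v for k, v in state_dict.items() if k in model_keys}
    if logger is not None:
        logger.info(f"Unexpected_keys: {sorted(set(state_dict) - model_keys)}")
        logger.info(f"Missing_keys: {sorted(model_keys - set(matched))}")
    model.load_state_dict(matched, strict=False)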
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), opts.fp16))

    hps_file = f"{opts.output_dir}/log/hps.json"
    model_opts = Struct(json.load(open(hps_file)))

    ans2label_file = f"{opts.output_dir}/ckpt/ans2label.json"
    ans2label = json.load(open(ans2label_file))
    label2ans = {label: ans for ans, label in ans2label.items()}

    # load DBs and image dirs
    eval_img_db = DetectFeatLmdb(opts.img_db, model_opts.conf_th,
                                 model_opts.max_bb, model_opts.min_bb,
                                 model_opts.num_bb, opts.compressed_db)
    eval_txt_db = TxtTokLmdb(opts.txt_db, -1)
    eval_dataset = VqaEvalDataset(len(ans2label), eval_txt_db, eval_img_db)

    # Prepare model
    if exists(opts.checkpoint):
        ckpt_file = opts.checkpoint
    else:
        ckpt_file = f"{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt"
    checkpoint = torch.load(ckpt_file)
    model = UniterForVisualQuestionAnswering.from_pretrained(
        f"{opts.output_dir}/log/model.json", checkpoint,
        img_dim=IMG_DIM, num_answer=len(ans2label))
    model.to(device)
    if opts.fp16:
        model = amp.initialize(model, enabled=True, opt_level="O2")

    sampler = TokenBucketSampler(eval_dataset.lens, bucket_size=BUCKET_SIZE,
                                 batch_size=opts.batch_size, droplast=False)
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_sampler=sampler,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 collate_fn=vqa_eval_collate)
    eval_dataloader = PrefetchLoader(eval_dataloader)

    val_log, results, logits = evaluate(model, eval_dataloader, label2ans,
                                        opts.save_logits)
    result_dir = f"{opts.output_dir}/results_test"
    if not exists(result_dir) and rank == 0:
        os.makedirs(result_dir)

    all_results = list(concat(all_gather_list(results)))
    if opts.save_logits:
        all_logits = {}
        for id2logit in all_gather_list(logits):
            all_logits.update(id2logit)
    if hvd.rank() == 0:
        with open(f"{result_dir}/"
                  f"results_{opts.checkpoint}_all.json", "w") as f:
            json.dump(all_results, f)
        if opts.save_logits:
            np.savez(f"{result_dir}/logits_{opts.checkpoint}_all.npz",
                     **all_logits)
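# TokenBucketSampler (used above and in the other eval scripts) batches
# length-sorted examples so that max_len * batch_count stays under a token
# budget, which keeps padding waste low. A simplified sketch of the idea,
# without the bucketed shuffling of the real sampler:
def token_bucket_batches(lens, batch_size_tokens):
    order = sorted(range(len(lens)), key=lambda i: lens[i])
    batches, cur, cur_max = [], [], 0
    for i in order:
        new_max = max(cur_max, lens[i])
        if cur and new_max * (len(cur) + 1) > batch_size_tokens:
            # adding this example would exceed the token budget: flush
            batches.append(cur)
            cur, new_max = [], lens[i]
        cur.append(i)
        cur_max = new_max
    if cur:
        batches.append(cur)
    return batches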
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), opts.fp16))
    if rank != 0:
        LOGGER.disabled = True
    hps_file = f'{opts.output_dir}/log/hps.json'
    model_opts = Struct(json.load(open(hps_file)))
    assert opts.split in opts.img_db and opts.split in opts.txt_db

    # load DBs and image dirs
    eval_img_db, eval_img_db_gt = load_img_feat(opts.img_db, model_opts)
    eval_txt_db = VcrTxtTokLmdb(opts.txt_db, -1)
    eval_dataset = VcrEvalDataset("val", eval_txt_db, img_db=eval_img_db,
                                  img_db_gt=eval_img_db_gt)

    # Prepare model
    model = UniterForVisualCommonsenseReasoning.from_pretrained(
        f'{opts.output_dir}/log/model.json', state_dict={}, img_dim=IMG_DIM)
    model.init_type_embedding()
    model.init_type_embedding_know()
    model.init_word_embedding(NUM_SPECIAL_TOKENS)
    if exists(opts.checkpoint):
        ckpt_file = opts.checkpoint
    else:
        ckpt_file = f'{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt'
    checkpoint = torch.load(ckpt_file)
    state_dict = checkpoint.get('model_state', checkpoint)
    matched_state_dict = {}
    unexpected_keys = set()
    missing_keys = set()
    for name, param in model.named_parameters():
        missing_keys.add(name)
    for key, data in state_dict.items():
        if key in missing_keys:
            matched_state_dict[key] = data
            missing_keys.remove(key)
        else:
            unexpected_keys.add(key)
    LOGGER.info(f"Unexpected_keys: {list(unexpected_keys)}")
    LOGGER.info(f"Missing_keys: {list(missing_keys)}")
    model.load_state_dict(matched_state_dict, strict=False)
    model.to(device)
    if opts.fp16:
        model = amp.initialize(model, enabled=True, opt_level='O2')

    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=opts.batch_size,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 shuffle=False,
                                 collate_fn=vcr_eval_collate)
    eval_dataloader = PrefetchLoader(eval_dataloader)

    results = evaluate(model, eval_dataloader)
    # write one JSON object per line
    output = '/src/vlkaf.json'
    with open(output, 'w') as f:
        for item in results:
            f.write(json.dumps(item) + '\n')
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), opts.fp16))

    if opts.train_config is not None:
        train_opts = Struct(json.load(open(opts.train_config)))
        opts.conf_th = train_opts.conf_th
        opts.max_bb = train_opts.max_bb
        opts.min_bb = train_opts.min_bb
        opts.num_bb = train_opts.num_bb

    # load DBs and image dirs
    eval_img_db = DetectFeatLmdb(opts.img_db, opts.conf_th, opts.max_bb,
                                 opts.min_bb, opts.num_bb,
                                 opts.compressed_db)
    eval_txt_db = TxtTokLmdb(opts.txt_db, -1)
    eval_dataset = ItmEvalDataset(eval_txt_db, eval_img_db, opts.batch_size)

    # Prepare model
    checkpoint = torch.load(opts.checkpoint)
    model = UniterForImageTextRetrieval.from_pretrained(
        opts.model_config, checkpoint, img_dim=IMG_DIM)
    if 'rank_output' not in checkpoint:
        model.init_output()  # zero-shot setting
    model.to(device)
    model = amp.initialize(model, enabled=opts.fp16, opt_level='O2')

    eval_dataloader = DataLoader(eval_dataset, batch_size=1,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 collate_fn=itm_eval_collate)
    eval_dataloader = PrefetchLoader(eval_dataloader)

    eval_log, results = evaluate(model, eval_dataloader)
    if hvd.rank() == 0:
        if not exists(opts.output_dir):
            os.makedirs(opts.output_dir)
        with open(f'{opts.output_dir}/config.json', 'w') as f:
            json.dump(vars(opts), f)
        with open(f'{opts.output_dir}/results.bin', 'wb') as f:
            pickle.dump(results, f)
        with open(f'{opts.output_dir}/scores.json', 'w') as f:
            json.dump(eval_log, f)
        LOGGER.info('evaluation finished')
        LOGGER.info(
            f"======================== Results =========================\n"
            f"image retrieval R1: {eval_log['img_r1']*100:.2f},\n"
            f"image retrieval R5: {eval_log['img_r5']*100:.2f},\n"
            f"image retrieval R10: {eval_log['img_r10']*100:.2f}\n"
            f"text retrieval R1: {eval_log['txt_r1']*100:.2f},\n"
            f"text retrieval R5: {eval_log['txt_r5']*100:.2f},\n"
            f"text retrieval R10: {eval_log['txt_r10']*100:.2f}")
        LOGGER.info("========================================================")
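# evaluate() above reports retrieval recalls (img_r1/r5/r10, txt_r1/...).
# A minimal sketch of recall@K from a [n_txt, n_img] similarity matrix whose
# matching pairs sit on the diagonal (an illustrative assumption about the
# data layout, not the project's evaluate()):
import numpy as np

def recall_at_k(scores, k):
    ranks = (-scores).argsort(axis=1)          # best-scoring image first
    gt = np.arange(scores.shape[0])[:, None]   # ground truth on the diagonal
    return float((ranks[:, :k] == gt).any(axis=1).mean())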
def main(opts):
    device = torch.device("cuda")  # single-GPU training
    os.makedirs(opts.output_dir, exist_ok=True)
    os.makedirs(join(opts.output_dir, 'ckpt'), exist_ok=True)
    model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))

    # train
    train_dataset = MemeAIDataset(
        json_path='/home/data/meme_json/train.json',
        npz_folder='/home/data/faster_cnn_feature/',
        mode='train')
    train_loader = DataLoader(train_dataset,
                              batch_size=opts.train_batch_size,
                              shuffle=True,
                              num_workers=opts.n_workers,
                              collate_fn=collate_fn)
    train_loader = PrefetchLoader(train_loader)

    # val
    val_dataset = MemeAIDataset(
        json_path='/home/data/meme_json/dev.json',
        npz_folder='/home/data/faster_cnn_feature/',
        mode='val')
    val_loader = DataLoader(val_dataset,
                            batch_size=opts.inf_minibatch_size,
                            shuffle=False,
                            num_workers=opts.n_workers,
                            collate_fn=collate_fn)
    val_loader = PrefetchLoader(val_loader)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    model = Meme.from_pretrained(opts.model_config,
                                 state_dict=checkpoint,
                                 img_dim=IMG_DIM)
    model.init_output()  # pretrained ITM head differs from the ranking head
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=opts.learning_rate)

    for epoch in range(opts.epoch):
        print('epoch {} / {}'.format(epoch, opts.epoch))
        pbar = tqdm(total=len(train_loader))

        model.train()
        preds = None
        gt = None
        for step, batch in enumerate(train_loader):
            x = batch[0]
            # move every input tensor to the GPU
            for key in ('input_ids', 'position_ids', 'img_feat',
                        'img_pos_feat', 'attn_masks', 'gather_index'):
                x[key] = x[key].to(device)
            y = batch[1].to(device)

            pred = model(x)
            if preds is None:
                preds = torch.sigmoid(pred)
                gt = y
            else:
                preds = torch.cat((preds, torch.sigmoid(pred)), dim=0)
                gt = torch.cat((gt, y), dim=0)

            loss = F.binary_cross_entropy(torch.sigmoid(pred), y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            pbar.update(1)

        model.eval()
        with torch.no_grad():
            preds = preds.detach().cpu().numpy().reshape(len(preds), )
            gt = gt.cpu().numpy()
            roc = roc_auc_score(gt, preds)
            acc = accuracy_score(gt, np.around(preds))
            train_log = {'train/roc': roc, 'train/acc': acc}
            val_log = validate(model, val_loader)
        LOGGER.info(train_log)
        LOGGER.info(val_log)
        model_saver.save(model, epoch)
        pbar.close()
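# validate() is called in the loop above but not defined in this file. A
# minimal sketch that mirrors the training metrics (ROC-AUC and accuracy),
# assuming val batches are (inputs, labels) pairs like the training ones:
def validate_sketch(model, val_loader, device=torch.device("cuda")):
    model.eval()
    preds, gts = [], []
    with torch.no_grad():
        for x, y in val_loader:
            # move every input tensor to the GPU, as in the training loop
            for key in ('input_ids', 'position_ids', 'img_feat',
                        'img_pos_feat', 'attn_masks', 'gather_index'):
                x[key] = x[key].to(device)
            preds.append(torch.sigmoid(model(x)).cpu())
            gts.append(y)
    preds = torch.cat(preds).numpy().ravel()
    gts = torch.cat(gts).numpy().ravel()
    return {'val/roc': roc_auc_score(gts, preds),
            'val/acc': accuracy_score(gts, np.around(preds))}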