def main():
    args = parse_args()

    # Devices
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True
    logger.info(f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}")

    # Load config
    config = BertConfig.from_json_file(args.config_file)

    # Load task config
    with open(args.tasks_config_file, "r") as f:
        task_cfg = edict(yaml.safe_load(f))
    task_id = args.task.strip()
    task = "TASK" + task_id
    task_name = task_cfg[task]["name"]
    base_lr = task_cfg[task]["lr"]
    if task_cfg[task].get("fusion_method", None):
        # VL-BERT pooling for VQA
        config.fusion_method = task_cfg[task]["fusion_method"]

    # Output dirs
    if args.save_name:
        prefix = "-" + args.save_name
    else:
        prefix = ""
    timestamp = task_name + "_" + args.config_file.split("/")[1].split(".")[0] + prefix
    save_path = os.path.join(args.output_dir, timestamp)
    if default_gpu:
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # save all the hidden parameters.
        with open(os.path.join(save_path, "command.txt"), "w") as f:
            print(args, file=f)  # Python 3.x
            print("\n", file=f)
            print(config, file=f)

    # Seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Dataset
    batch_size, task2num_iters, dset_train, dset_val, dl_train, dl_val = \
        LoadDataset(args, config, task_cfg, args.task)

    # Logging
    logdir = os.path.join(args.logdir, timestamp)
    tb_logger = tbLogger(logdir, save_path, [task_name], [task], task2num_iters, args.grad_acc_steps)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Model
    if "roberta" in args.bert_model:
        config.model = "roberta"
    model = BertForVLTasks.from_pretrained(args.from_pretrained, config=config,
                                           task_cfg=task_cfg, task_ids=[task])
    if task_cfg[task].get("embed_clf", None):
        logger.info('Initializing classifier weight for %s from pretrained word embeddings...' % task)
        answers_word_embed = []
        for k, v in model.state_dict().items():
            if 'bert.embeddings.word_embeddings.weight' in k:
                word_embeddings = v.detach().clone()
                break
        for answer, label in sorted(dset_train.ans2label.items()):
            a_tokens = dset_train._tokenizer.tokenize(answer)
            a_ids = dset_train._tokenizer.convert_tokens_to_ids(a_tokens)
            if len(a_ids):
                a_word_embed = torch.stack([word_embeddings[a_id] for a_id in a_ids], dim=0).mean(dim=0)
            else:
                a_tokens = dset_train._tokenizer.tokenize("<unk>")
                a_id = dset_train._tokenizer.convert_tokens_to_ids(a_tokens)[0]
                a_word_embed = word_embeddings[a_id]
            answers_word_embed.append(a_word_embed)
        answers_word_embed_tensor = torch.stack(answers_word_embed, dim=0)
        for name, module in model.named_modules():
            if name.endswith('clfs_dict.%s.logit_fc.3' % task):
                module.weight.data = answers_word_embed_tensor.to(device=module.weight.data.device)

    # Optimization details
    freeze_layers(model)
    criterion = LoadLoss(task_cfg, args.task)
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = []
    for key, value in dict(model.named_parameters()).items():
        if value.requires_grad:
            if "vil_" in key:
                lr = 1e-4
            else:
                lr = base_lr
            if any(nd in key for nd in no_decay):
                optimizer_grouped_parameters += [{"params": [value], "lr": lr, "weight_decay": 0.0}]
            if not any(nd in key for nd in no_decay):
                optimizer_grouped_parameters += [{"params": [value], "lr": lr, "weight_decay": args.weight_decay}]
    if default_gpu:
        print(len(list(model.named_parameters())), len(optimizer_grouped_parameters))
    if args.optim == "AdamW":
        optimizer = AdamW(optimizer_grouped_parameters, lr=base_lr, eps=args.adam_epsilon,
                          betas=args.adam_betas, correct_bias=args.adam_correct_bias)
    elif args.optim == "RAdam":
        optimizer = RAdam(optimizer_grouped_parameters, lr=base_lr)
    num_train_optim_steps = task2num_iters[task] * args.num_train_epochs // args.grad_acc_steps
    warmup_steps = args.warmup_steps or args.warmup_proportion * num_train_optim_steps
    if args.lr_scheduler == "warmup_linear":
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optim_steps)
    else:
        scheduler = WarmupConstantSchedule(optimizer, warmup_steps=warmup_steps)

    # Resume training
    start_iter_id, global_step, start_epoch, tb_logger, max_score = \
        resume(args.resume_file, model, optimizer, scheduler, tb_logger)

    # Move to GPU(s)
    model.to(device)
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Save starting model
    save(save_path, logger, -1, model, optimizer, scheduler, global_step, tb_logger, default_gpu)

    # Print summary
    if default_gpu:
        summary_parameters(model, logger)
        print("***** Running training *****")
        print(" Num Iters: ", task2num_iters[task])
        print(" Batch size: ", batch_size)
        print(" Num steps: %d" % num_train_optim_steps)

    # Train
    for epoch_id in tqdm(range(start_epoch, args.num_train_epochs), desc="Epoch"):
        model.train()
        for step, batch in enumerate(dl_train):
            iter_id = start_iter_id + step + (epoch_id * len(dl_train))

            loss, score = ForwardModelsTrain(config, task_cfg, device, task, batch, model, criterion)
            if args.grad_acc_steps > 1:
                loss = loss / args.grad_acc_steps
            loss.backward()

            if (step + 1) % args.grad_acc_steps == 0:
                # Clip gradient
                if args.clip_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm)
                optimizer.step()
                if global_step < warmup_steps or args.lr_scheduler == "warmup_linear":
                    scheduler.step()
                model.zero_grad()
                global_step += 1

                if default_gpu:
                    tb_logger.step_train(epoch_id, iter_id, float(loss), float(score),
                                         optimizer.param_groups[0]["lr"], task, "train")

            if (step % (20 * args.grad_acc_steps) == 0) and step != 0 and default_gpu:
                tb_logger.showLossTrain()

            # Decide whether to evaluate task
            if iter_id != 0 and iter_id % task2num_iters[task] == 0:
                score = evaluate(config, dl_val, task_cfg, device, task, model, criterion,
                                 epoch_id, default_gpu, tb_logger)
                if score > max_score:
                    max_score = score
                    save(save_path, logger, epoch_id, model, optimizer, scheduler,
                         global_step, tb_logger, default_gpu, max_score)

    save(save_path, logger, epoch_id, model, optimizer, scheduler,
         global_step, tb_logger, default_gpu, max_score)
    tb_logger.txt_close()
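
# A minimal sketch (hypothetical helper, not part of the script above) of the
# step accounting used in the optimizer setup: with gradient accumulation,
# only every `grad_acc_steps`-th batch triggers an optimizer step, so the
# warmup horizon is measured in optimizer steps, not in batches.
def optimization_steps(iters_per_epoch, num_epochs, grad_acc_steps, warmup_proportion):
    total_steps = iters_per_epoch * num_epochs // grad_acc_steps
    warmup_steps = int(warmup_proportion * total_steps)
    return total_steps, warmup_steps

# e.g. 10,000 iterations/epoch, 20 epochs, accumulation over 4 batches, 10% warmup:
# optimization_steps(10_000, 20, 4, 0.1) == (50000, 5000)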
def main():
    args = parse_args()

    # Devices
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True
    logger.info(f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}")

    # Load config
    config = BertConfig.from_json_file(args.config_file)

    # Load task config
    with open(args.tasks_config_file, "r") as f:
        task_cfg = edict(yaml.safe_load(f))
    task_id = args.task.strip()
    task = "TASK" + task_id
    task_name = task_cfg[task]["name"]
    if task_cfg[task].get("fusion_method", None):
        # VL-BERT pooling for VQA
        config.fusion_method = task_cfg[task]["fusion_method"]

    # Output dirs
    timeStamp = args.from_pretrained.split("/")[-1] + "-" + args.save_name
    savePath = os.path.join(args.output_dir, timeStamp)
    if default_gpu and not os.path.exists(savePath):
        os.makedirs(savePath)

    # Seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Dataset
    batch_size, task2num_iters, dset_val, dl_val = LoadDatasetEval(args, config, task_cfg, args.task)

    # Logging
    tb_logger = tbLogger(timeStamp, savePath, [task_name], [task], task2num_iters, 1,
                         save_logger=False, txt_name="eval.txt")

    # Model
    if "roberta" in args.bert_model:
        config.model = "roberta"
    model = BertForVLTasks.from_pretrained(args.from_pretrained, config=config,
                                           task_cfg=task_cfg, task_ids=[task])

    # Optimization details
    criterion = LoadLoss(task_cfg, args.task)

    # Move to GPU(s)
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)
    elif n_gpu > 1:
        model = nn.DataParallel(model)

    # Print summary
    if default_gpu:
        print("***** Running evaluation *****")
        print(" Num Iters: ", task2num_iters[task])
        print(" Batch size: ", batch_size)

    # Evaluate
    model.eval()
    results = []
    others = []
    for i, batch in tqdm(enumerate(dl_val), total=task2num_iters[task]):
        loss, score, batch_size, results, others = EvaluatingModel(
            config, task_cfg, device, task, batch, model, dl_val, criterion, results, others)
        tb_logger.step_val(0, float(loss), float(score), task, batch_size, "val")
        sys.stdout.write("%d/%d\r" % (i, len(dl_val)))
        sys.stdout.flush()

    # save the result or evaluate the result.
    ave_score = tb_logger.showLossVal(task)
    if args.split:
        json_path = os.path.join(savePath, args.split)
    else:
        json_path = os.path.join(savePath, task_cfg[task]["val_split"])
    json.dump(results, open(json_path + "_result.json", "w"))
    json.dump(others, open(json_path + "_others.json", "w"))
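
# A minimal sketch (hypothetical helper, not part of the script above) showing
# how the dumped predictions can be reloaded for offline inspection;
# `save_path` and `split` mirror the naming used in the evaluation script.
import json
import os

def load_predictions(save_path, split):
    with open(os.path.join(save_path, split + "_result.json")) as f:
        return json.load(f)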
import argparse

import torch

# NOTE: the import block of this conversion script is reconstructed here;
# BertConfig is assumed to come from volta.config, as in the other scripts.
from volta.config import BertConfig
from volta.encoders import BertForVLPreTraining


# Inputs
parser = argparse.ArgumentParser()
parser.add_argument("--input_fn", type=str, default="Epoch20_LXRT.pth")
parser.add_argument("--output_fn", type=str, default="lxmert_checkpoint_19.bin")
parser.add_argument("--verbose", action="store_true", default=False)
args = parser.parse_args()

# Load original checkpoint
original_ckpt = torch.load(args.input_fn, map_location="cpu")

# Create corresponding VOLTA model
config_file = "../config/original_lxmert.json"
config = BertConfig.from_json_file(config_file)
model = BertForVLPreTraining.from_pretrained("bert-base-uncased", config=config,
                                             default_gpu=True, from_hf=True)
trg_dict = model.state_dict()

# Map original parameters onto VOLTA ones
first_xlayer = config.tv_attn_sublayers[0]
volta2original = dict()
for k in original_ckpt.keys():
    ln = k.replace('module.', '')
    ln = ln.replace("encoder.visn_fc", "v_embeddings")
    ln = ln.replace("visn_fc", "image_embeddings")
    ln = ln.replace("visn_layer_norm", "ImgLayerNorm")
    ln = ln.replace("box_fc", "image_location_embeddings")
    ln = ln.replace("box_layer_norm", "LocLayerNorm")
    ln = ln.replace('attention.self', 'attention_self')
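
# The renaming loop above is truncated in this excerpt. A minimal sketch,
# assuming `volta2original` ends up mapping VOLTA parameter names to the
# original LXMERT ones (the strict=False choice and the final save are
# illustrative, not the script's verified tail):
new_state = {}
for volta_name, original_name in volta2original.items():
    new_state[volta_name] = original_ckpt[original_name].clone()
missing, unexpected = model.load_state_dict(new_state, strict=False)
if args.verbose:
    print("Missing keys:", missing)
    print("Unexpected keys:", unexpected)
torch.save(model.state_dict(), args.output_fn)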
def main():
    args = parse_args()

    # Devices
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True
    logger.info(f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}")

    # Load config
    config = BertConfig.from_json_file(args.config_file)

    # Load task config
    with open(args.tasks_config_file, "r") as f:
        task_cfg = edict(yaml.safe_load(f))
    task_id = args.task.strip()
    task = "TASK" + task_id

    # Output dirs
    if "/" in args.from_pretrained:
        timeStamp = args.from_pretrained.split("/")[1]
    else:
        timeStamp = args.from_pretrained
    savePath = os.path.join(args.output_dir, timeStamp)
    if default_gpu and not os.path.exists(savePath):
        os.makedirs(savePath)

    # Seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Dataset
    batch_size, task2num_iters, dset_val, dl_val = LoadDatasetEval(args, config, task_cfg, args.task)
    max_subiter_images = dset_val.max_num_images

    # Model
    if args.zero_shot:
        config.visual_target_weights = {}
        model = BertForVLPreTraining.from_pretrained(args.from_pretrained, config=config)
    else:
        model = BertForVLTasks.from_pretrained(args.from_pretrained, config=config,
                                               task_cfg=task_cfg, task_ids=[task])

    # Move to GPU(s)
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)
    elif n_gpu > 1:
        # The score matrix below is filled sequentially, one caption at a time
        raise ValueError("Please run with a single GPU")

    # Print summary
    if default_gpu:
        print("***** Running evaluation *****")
        print(" Num Iters: ", task2num_iters)
        print(" Batch size: ", batch_size)

    # Evaluate
    model.eval()
    results = []
    others = []
    score_matrix = np.zeros((args.num_images * args.captions_per_image, args.num_images))
    target_matrix = np.zeros((args.num_images * args.captions_per_image, args.num_images))
    rank_vector = np.ones(args.num_images * args.captions_per_image) * args.num_images
    count = 0
    for i, batch in tqdm(enumerate(dl_val), total=task2num_iters[task]):
        batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch)
        features, spatials, image_mask, question, input_mask, segment_ids, target, caption_idx, image_idx = batch

        features = features.squeeze(0)
        spatials = spatials.squeeze(0)
        image_mask = image_mask.squeeze(0)
        question = question.repeat(features.size(0), 1)
        segment_ids = segment_ids.repeat(features.size(0), 1)
        input_mask = input_mask.repeat(features.size(0), 1)

        with torch.no_grad():
            if args.zero_shot:
                _, _, vil_logit, _, _, _ = model(question, features, spatials,
                                                 segment_ids, input_mask, image_mask)
                score_matrix[caption_idx,
                             image_idx * max_subiter_images:(image_idx + 1) * max_subiter_images] = \
                    torch.softmax(vil_logit, dim=1)[:, 0].view(-1).cpu().numpy()
                target_matrix[caption_idx,
                              image_idx * max_subiter_images:(image_idx + 1) * max_subiter_images] = \
                    target.view(-1).float().cpu().numpy()
            else:
                vil_logit, _, _, _ = model(question, features, spatials, task,
                                           segment_ids, input_mask, image_mask)
                score_matrix[caption_idx,
                             image_idx * max_subiter_images:(image_idx + 1) * max_subiter_images] = \
                    vil_logit.view(-1).cpu().numpy()
                target_matrix[caption_idx,
                              image_idx * max_subiter_images:(image_idx + 1) * max_subiter_images] = \
                    target.view(-1).float().cpu().numpy()

            if image_idx.item() == args.num_subiters - 1:
                # Rank of the ground-truth image among all scored images for this caption
                rank = np.where((np.argsort(-score_matrix[caption_idx])
                                 == np.where(target_matrix[caption_idx] == 1)[0][0]) == 1)[0][0]
                rank_vector[caption_idx] = rank

                cur_rank_vector = rank_vector[:caption_idx + 1]
                r1 = 100.0 * np.sum(cur_rank_vector < 1) / len(cur_rank_vector)
                r5 = 100.0 * np.sum(cur_rank_vector < 5) / len(cur_rank_vector)
                r10 = 100.0 * np.sum(cur_rank_vector < 10) / len(cur_rank_vector)
                medr = np.floor(np.median(cur_rank_vector) + 1)
                meanr = np.mean(cur_rank_vector) + 1
                print("%d Final r1:%.3f, r5:%.3f, r10:%.3f, medr:%.3f, meanr:%.3f"
                      % (count, r1, r5, r10, medr, meanr))

                results.append(np.argsort(-score_matrix[caption_idx]).tolist()[:20])
                count += 1

    r1 = 100.0 * np.sum(rank_vector < 1) / len(rank_vector)
    r5 = 100.0 * np.sum(rank_vector < 5) / len(rank_vector)
    r10 = 100.0 * np.sum(rank_vector < 10) / len(rank_vector)
    medr = np.floor(np.median(rank_vector) + 1)
    meanr = np.mean(rank_vector) + 1
    print("************************************************")
    print("****************Image Retrieval*****************")
    print("************************************************")
    print("Final r1:%.3f, r5:%.3f, r10:%.3f, medr:%.3f, meanr:%.3f" % (r1, r5, r10, medr, meanr))
    print("************************************************")

    if args.split:
        json_path = os.path.join(savePath, args.split)
    else:
        json_path = os.path.join(savePath, task_cfg[task]["val_split"])
    json.dump(results, open(json_path + "_result.json", "w"))
    json.dump(others, open(json_path + "_others.json", "w"))

    # Text Retrieval
    rank_vector = np.zeros(args.num_images)
    for image_idx in range(args.num_images):
        ranks = []
        tgt_captions = np.where(target_matrix[:, image_idx] == 1)[0]
        sorted_scores = np.argsort(-score_matrix[:, image_idx])
        for tgt_caption in tgt_captions:
            ranks.append(np.where((sorted_scores == tgt_caption) == 1)[0][0])
        # A caption counts as retrieved at the best rank among its ground-truth captions
        rank_vector[image_idx] = min(ranks)
    r1 = 100.0 * np.sum(rank_vector < 1) / len(rank_vector)
    r5 = 100.0 * np.sum(rank_vector < 5) / len(rank_vector)
    r10 = 100.0 * np.sum(rank_vector < 10) / len(rank_vector)
    medr = np.floor(np.median(rank_vector) + 1)
    meanr = np.mean(rank_vector) + 1
    print("************************************************")
    print("****************Text Retrieval******************")
    print("************************************************")
    print("Final r1:%.3f, r5:%.3f, r10:%.3f, medr:%.3f, meanr:%.3f" % (r1, r5, r10, medr, meanr))
    print("************************************************")
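
# A minimal, self-contained sketch (hypothetical helper) of the recall@K and
# median/mean-rank computation shared by both retrieval directions above;
# `ranks` holds the 0-based rank of the correct item for each query.
import numpy as np

def retrieval_metrics(ranks):
    ranks = np.asarray(ranks, dtype=np.float64)
    r_at_k = {k: 100.0 * np.mean(ranks < k) for k in (1, 5, 10)}
    medr = np.floor(np.median(ranks) + 1)  # 1-based median rank, as above
    meanr = np.mean(ranks) + 1             # 1-based mean rank, as above
    return r_at_k, medr, meanr

# e.g. retrieval_metrics([0, 3, 12]) -> ({1: 33.33.., 5: 66.66.., 10: 66.66..}, 4.0, 6.0)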
def main():
    args = parse_args()

    # Devices
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True
    logger.info(f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}")

    # Load config
    config = BertConfig.from_json_file(args.config_file)

    # Load task config
    with open(args.tasks_config_file, "r") as f:
        task_cfg = edict(yaml.safe_load(f))
    task_id = args.task.strip()
    task = "TASK" + task_id
    task_name = task_cfg[task]["name"]
    if task_cfg[task].get("fusion_method", None):
        # VL-BERT pooling for VQA
        config.fusion_method = task_cfg[task]["fusion_method"]

    # Output dirs
    savePath = args.output_dir
    if default_gpu and not os.path.exists(savePath):
        os.makedirs(savePath)

    # Seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Dataset
    feats_h5path = task_cfg[task]["features_h5path1"]
    features_reader = ImageFeaturesH5Reader(feats_h5path, config, args.in_memory)
    batch_size = task_cfg[task]["batch_size"]
    num_workers = args.num_workers
    if args.local_rank != -1:
        batch_size = int(batch_size / dist.get_world_size())
        num_workers = int(num_workers / dist.get_world_size())
    logger.info("Loading %s Dataset with batch size %d" % (task_name, batch_size))
    eval_split = args.split or task_cfg[task]["val_split"]
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    dset = FlickrVis4LangDataset(
        task,
        task_cfg[task]["dataroot"],
        args.masking,
        eval_split,
        features_reader,
        None,
        tokenizer,
        args.bert_model,
        max_seq_length=task_cfg[task]["max_seq_length"],
        max_region_num=task_cfg[task]["max_region_num"],
        num_locs=config.num_locs,
        threshold=args.overlap_threshold,
        add_global_imgfeat=config.add_global_imgfeat,
    )
    dl = DataLoader(dset, shuffle=False, batch_size=batch_size, num_workers=num_workers, pin_memory=True)

    # Model
    config.visual_target_weights = {}
    model = BertForVLPreTraining.from_pretrained(args.from_pretrained, config=config)

    # Move to GPU(s)
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)
    elif n_gpu > 1:
        model = nn.DataParallel(model)

    # Print summary
    if default_gpu:
        print("***** Running evaluation *****")
        print(" Num Iters: ", len(dl))
        print(" Batch size: ", batch_size)

    # Evaluate
    model.eval()
    loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
    phrase_ids, image_ids, pred_tokens, true_tokens, pred_scores, lm_losses = [], [], [], [], [], []
    for batch in tqdm(dl, total=len(dl)):
        image_id = batch[-1]
        batch = batch[:-1]
        if device.type != 'cpu':
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch)
        phrase_id, caption, input_mask, segment_ids, lm_label_ids, features, spatials, image_cls, \
            obj_labels, obj_confs, attr_labels, attr_confs, image_attrs, image_mask, image_labels = batch

        with torch.no_grad():
            predictions_t, _, _, _, _ = model(
                caption, features, spatials,
                token_type_ids=segment_ids,
                attention_mask=input_mask,
                image_attention_mask=image_mask,
                masked_lm_labels=None,
                image_label=None,
                image_cls=image_cls,
                obj_labels=obj_labels,
                obj_confs=obj_confs,
                attr_labels=attr_labels,
                attr_confs=attr_confs,
                image_attrs=image_attrs,
            )
            # loss = masked_loss_t + masked_loss_v + pair_match_loss

            # Collect the positions of the masked tokens in each sequence
            target_ixs = [[] for _ in range(predictions_t.size(0))]
            xs, ys = torch.where(lm_label_ids != -1)
            for x, y in zip(xs, ys):
                target_ixs[x].append(y.item())

            for bix in range(predictions_t.size(0)):
                pred_bix_tokens, true_bix_tokens, bix_predictions = [], [], []
                for masked_ix in target_ixs[bix]:
                    predicted_index = torch.argmax(predictions_t[bix, masked_ix]).item()
                    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
                    label_token = tokenizer.convert_ids_to_tokens([lm_label_ids[bix, masked_ix].item()])[0]
                    pred_bix_tokens.append(predicted_token)
                    true_bix_tokens.append(label_token)
                    # Move logits to CPU before converting to NumPy
                    bix_predictions.append(predictions_t[bix, masked_ix].cpu().numpy())
                masked_lm_loss = loss_fct(
                    predictions_t[bix].view(-1, config.vocab_size),
                    lm_label_ids[bix].view(-1),
                ).unsqueeze(0).item()

                if args.dump_results:
                    # pred_tokens.append(pred_bix_tokens)
                    # true_tokens.append(true_bix_tokens)
                    # pred_scores.append(bix_predictions)
                    # image_ids.append(image_id[bix].item())
                    # phrase_ids.append(phrase_id[bix].item())
                    pass
                lm_losses.append(masked_lm_loss)

    if default_gpu:
        print("MLM:", np.mean(np.array(lm_losses)))
        if args.dump_results:
            eval_path = os.path.join(savePath, eval_split)
            masking_str = args.masking if args.masking != "ref" \
                else args.masking + str(args.overlap_threshold)
            # cPickle.dump(pred_tokens, open(eval_path + "_%s_preds.pkl" % masking_str, "wb"))
            # cPickle.dump(true_tokens, open(eval_path + "_%s_truth.pkl" % masking_str, "wb"))
            # cPickle.dump(pred_scores, open(eval_path + "_%s_score.pkl" % masking_str, "wb"))
            # cPickle.dump(image_ids, open(eval_path + "_%s_imgids.pkl" % masking_str, "wb"))
            # cPickle.dump(phrase_ids, open(eval_path + "_%s_phrids.pkl" % masking_str, "wb"))
            cPickle.dump(lm_losses, open(eval_path + "_%s_mlm.pkl" % masking_str, "wb"))
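
# A minimal sketch (an assumption about reporting, not part of the original
# script) that turns the per-phrase masked-LM losses collected above into a
# mean loss and a pseudo-perplexity.
import numpy as np

def mlm_summary(lm_losses):
    losses = np.asarray(lm_losses, dtype=np.float64)
    mean_loss = float(losses.mean())
    return {"mean_mlm_loss": mean_loss, "pseudo_perplexity": float(np.exp(mean_loss))}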