image_field = ImageDetectionsField(detections_path=args.features_path, max_detections=50, load_in_tmp=False) # Pipeline for text text_field = TextField(init_token='<bos>', eos_token='<eos>', lower=True, tokenize='spacy', remove_punctuation=True, nopoints=False) # Create the dataset dataset = COCO(image_field, text_field, 'coco/images/', args.annotation_folder, args.annotation_folder) train_dataset, val_dataset, test_dataset = dataset.splits if not os.path.isfile('vocab_%s.pkl' % args.exp_name): print("Building vocabulary") text_field.build_vocab(train_dataset, val_dataset, min_freq=5) pickle.dump(text_field.vocab, open('vocab_%s.pkl' % args.exp_name, 'wb')) else: text_field.vocab = pickle.load(open('vocab_%s.pkl' % args.exp_name, 'rb')) # Model and dataloaders encoder = MemoryAugmentedEncoder(3, 0, attention_module=ScaledDotProductAttentionMemory, attention_module_kwargs={'m': args.m}) decoder = MeshedDecoder(len(text_field.vocab), 54, 3, text_field.vocab.stoi['<pad>']) model = Transformer(text_field.vocab.stoi['<bos>'], encoder, decoder).to(device) dict_dataset_train = train_dataset.image_dictionary({'image': image_field, 'text': RawField()}) ref_caps_train = list(train_dataset.text) cider_train = Cider(PTBTokenizer.tokenize(ref_caps_train)) dict_dataset_val = val_dataset.image_dictionary({'image': image_field, 'text': RawField()}) dict_dataset_test = test_dataset.image_dictionary({'image': image_field, 'text': RawField()}) def lambda_lr(s):
load_in_tmp=False) # Pipeline for text text_field = TextField(init_token='<bos>', eos_token='<eos>', lower=True, tokenize='spacy', remove_punctuation=True, nopoints=False) # Create the dataset dataset = ScanNet(image_field, text_field, "/cluster/sorona/dchen/ScanNet_frames/", get_image_ids(args.features_path)) _, _, test_dataset = dataset.splits text_field.vocab = pickle.load(open('vocab.pkl', 'rb')) # Model and dataloaders encoder = MemoryAugmentedEncoder( 3, 0, attention_module=ScaledDotProductAttentionMemory, attention_module_kwargs={'m': 40}) decoder = MeshedDecoder(len(text_field.vocab), 54, 3, text_field.vocab.stoi['<pad>']) model = Transformer(text_field.vocab.stoi['<bos>'], encoder, decoder).to(device) data = torch.load('meshed_memory_transformer.pth') model.load_state_dict(data['state_dict'])
lower=True, tokenize="spacy", remove_punctuation=True, nopoints=False, ) # Create the dataset dataset = COCO( image_field, text_field, "coco/images/", args.annotation_folder, args.annotation_folder, ) _, _, test_dataset = dataset.splits text_field.vocab = pickle.load(open("vocab.pkl", "rb")) # Model and dataloaders encoder = MemoryAugmentedEncoder( 3, 0, attention_module=ScaledDotProductAttentionMemory, attention_module_kwargs={"m": 40}, ) decoder = MeshedDecoder(len(text_field.vocab), 54, 3, text_field.vocab.stoi["<pad>"]) model = Transformer(text_field.vocab.stoi["<bos>"], encoder, decoder).to(device) data = torch.load("meshed_memory_transformer.pth") model.load_state_dict(data["state_dict"])
parser.add_argument('--d_in', type=int, default=2048)
parser.add_argument('--vocab', type=str, default='vocab.pkl')
args = parser.parse_args()

print('Meshed-Memory Transformer Evaluation')

# Pipeline for image regions
image_field = ImageDetectionsField(detections_path=args.features_path, max_detections=50, load_in_tmp=False)

# Pipeline for text
text_field = TextField(init_token='<bos>', eos_token='<eos>', lower=True, tokenize='spacy',
                       remove_punctuation=True, nopoints=False)

# Create the dataset
dataset = COCO(image_field, text_field, 'coco/images/', args.annotation_folder, args.annotation_folder)
# Only the third split is used here; the first two are discarded.
_, _, test_dataset = dataset.splits
# NOTE: unpickling can execute arbitrary code, so --vocab must point at a
# trusted file. The context manager closes the handle after loading.
with open(args.vocab, 'rb') as vocab_file:
    text_field.vocab = pickle.load(vocab_file)

# Model and dataloaders
# --d_in is passed to the encoder both positionally and as d_ff, and to the
# decoder as d_ff.
encoder = MemoryAugmentedEncoder(3, 0, args.d_in, d_ff=args.d_in,
                                 attention_module=ScaledDotProductAttentionMemory,
                                 attention_module_kwargs={'m': 40})
decoder = MeshedDecoder(len(text_field.vocab), 54, 3, text_field.vocab.stoi['<pad>'], d_ff=args.d_in)
model = Transformer(text_field.vocab.stoi['<bos>'], encoder, decoder).to(device)

# The checkpoint stores the model weights under the 'state_dict' key.
data = torch.load(args.weights)
model.load_state_dict(data['state_dict'])

dict_dataset_test = test_dataset.image_dictionary({'image': image_field, 'text': RawField()})
dict_dataloader_test = DataLoader(dict_dataset_test, batch_size=args.batch_size, num_workers=args.workers)

scores = predict_captions(model, dict_dataloader_test, text_field)
print(scores)