def __init__(self, train_image_dir, train_question_path, train_annotation_path,
             test_image_dir, test_question_path, test_annotation_path,
             batch_size, num_epochs, num_data_loader_workers):
    train_dataset = VqaDataset(
        image_dir=train_image_dir,
        question_json_file_path=train_question_path,
        annotation_json_file_path=train_annotation_path,
        image_filename_pattern="COCO_train2014_{}_resnet_feature.npy")
    val_dataset = VqaDataset(
        image_dir=test_image_dir,
        question_json_file_path=test_question_path,
        annotation_json_file_path=test_annotation_path,
        image_filename_pattern="COCO_val2014_{}_resnet_feature.npy")
    model = CoattentionNet(n_emb=512,
                           n_img=2048,
                           n_ques=len(train_dataset.dictionary),
                           n_ans=len(train_dataset.answers))
    super().__init__(train_dataset, val_dataset, model, batch_size, num_epochs,
                     num_data_loader_workers=num_data_loader_workers)
    self.criterion = nn.CrossEntropyLoss()
    self.optimizer = optim.RMSprop(self._model.parameters(), lr=4e-4,
                                   alpha=0.99, eps=1e-8)
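# --- Hedged example (illustrative, not the original dataset code) -----------
# The filename pattern above points at precomputed ResNet features rather than
# raw images; inside the dataset a feature could be loaded roughly like this.
# The 12-digit zero padding follows COCO's filename convention and is an
# assumption about how the dataset formats the id.
import os
import numpy as np

def load_resnet_feature(image_dir, image_id,
                        pattern="COCO_train2014_{}_resnet_feature.npy"):
    fname = pattern.format(str(image_id).zfill(12))
    return np.load(os.path.join(image_dir, fname))  # saved feature array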
def __init__(self, train_image_dir, train_question_path, train_annotation_path,
             test_image_dir, test_question_path, test_annotation_path,
             batch_size, num_epochs, num_data_loader_workers, preprocessing):
    train_h5_path = "./features/resnet/train/train_feat_resnet_{}.h5"
    test_h5_path = "./features/resnet/test/test_feat_resnet_{}.h5"
    train_dataset = VqaDataset(
        image_dir=train_image_dir,
        question_json_file_path=train_question_path,
        annotation_json_file_path=train_annotation_path,
        image_filename_pattern="COCO_train2014_{}.jpg",
        existing_format=None,
        prepro=preprocessing,
        prepro_path=train_h5_path)
    val_dataset = VqaDataset(
        image_dir=test_image_dir,
        question_json_file_path=test_question_path,
        annotation_json_file_path=test_annotation_path,
        image_filename_pattern="COCO_val2014_{}.jpg",
        existing_format=train_dataset,
        prepro=preprocessing,
        prepro_path=test_h5_path)
    embed_size = 512
    vocab_size = train_dataset.quesVecSize
    ans_size = train_dataset.ansVecSize
    seq_len = train_dataset.seq_len
    self._model = CoattentionNet(embed_size, vocab_size, ans_size, seq_len)
    super().__init__(train_dataset, val_dataset, self._model, batch_size,
                     num_epochs, num_data_loader_workers, preprocessing)
    # self.optimizer = torch.optim.RMSprop(self._model.parameters(), lr=4e-4, momentum=0.99, weight_decay=1e-8)
    # self.optimizer = torch.optim.SGD(self._model.parameters(), lr=1e-3, momentum=0.9, weight_decay=1e-8)
    self.optimizer = torch.optim.Adam(self._model.parameters())
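# --- Hedged example (illustrative, not the actual VqaDataset internals) ------
# A minimal sketch of how a dataset might read one cached feature from the
# sharded HDF5 files that `prepro_path` points at. The shard index and the
# per-image key ("feat_<image_id>") are assumptions for illustration only.
import h5py

def load_cached_feature(prepro_path_pattern, shard_idx, image_id):
    """Open one HDF5 shard and return the feature stored for image_id."""
    with h5py.File(prepro_path_pattern.format(shard_idx), "r") as f:
        return f["feat_{}".format(image_id)][()]  # read the whole array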
def __init__(self, train_image_dir, train_question_path, train_annotation_path,
             test_image_dir, test_question_path, test_annotation_path,
             batch_size, num_epochs, num_data_loader_workers, args):
    if not args.debug:
        train_dataset = VqaDataset(
            image_dir=train_image_dir,
            question_json_file_path=train_question_path,
            annotation_json_file_path=train_annotation_path,
            image_filename_pattern="COCO_train2014_{}.jpg",
        )
        val_dataset = VqaDataset(
            image_dir=test_image_dir,
            question_json_file_path=test_question_path,
            annotation_json_file_path=test_annotation_path,
            image_filename_pattern="COCO_val2014_{}.jpg",
        )
        model = SimpleBaselineNet(num_ans_candidates=2185,
                                  ntoken=train_dataset.dictionary.ntoken)
    else:
        train_dataset = VqaDataset(
            image_dir=train_image_dir,
            question_json_file_path=train_question_path,
            annotation_json_file_path=train_annotation_path,
            image_filename_pattern="COCO_train2014_{}.jpg",
            debug=True)
        val_dataset = train_dataset
        model = SimpleBaselineNet(num_ans_candidates=2,
                                  ntoken=train_dataset.dictionary.ntoken)
    super().__init__(train_dataset, val_dataset, model, batch_size, num_epochs,
                     num_data_loader_workers, args.use_cuda, 'simple_baseline')
    self.optim = torch.optim.Adamax(self._model.parameters())
def __init__(self, train_image_dir, train_question_path, train_annotation_path,
             test_image_dir, test_question_path, test_annotation_path,
             batch_size, num_epochs, num_data_loader_workers):
    self.vqa_loader = VQA(annotation_file=train_annotation_path,
                          question_file=train_question_path)
    self.entries = self.vqa_loader.qqa
    self.qa = self.vqa_loader.qa
    bag_word_question = self.get_bag_of_word_question()
    bag_word_answer = self.get_bag_of_word_answer()
    train_dataset = VqaDataset(
        image_dir=train_image_dir,
        question_json_file_path=train_question_path,
        annotation_json_file_path=train_annotation_path,
        image_filename_pattern="COCO_train2014_{}.jpg",
        bag_word_question=bag_word_question,
        bag_word_answer=bag_word_answer)
    val_dataset = VqaDataset(
        image_dir=test_image_dir,
        question_json_file_path=test_question_path,
        annotation_json_file_path=test_annotation_path,
        image_filename_pattern="COCO_val2014_{}.jpg",
        bag_word_question=bag_word_question,
        bag_word_answer=bag_word_answer)
    num_question = train_dataset.bag_size_question
    num_answer = train_dataset.bag_size_answer
    model = SimpleBaselineNet(num_question, num_answer)
    # Per-layer learning rates: the word embedding trains with a much larger
    # step (0.8) than the rest of the network; groups without an explicit
    # 'lr' fall back to the optimizer default (0.01).
    self.optimizer = torch.optim.SGD(
        [{'params': model.fc.parameters()},
         {'params': model.feature.parameters()},
         {'params': model.embedding.parameters(), 'lr': 0.8}],
        lr=0.01,
        momentum=0.9)
    self.criterion = nn.CrossEntropyLoss().cuda()
    super().__init__(train_dataset, val_dataset, model, batch_size, num_epochs,
                     num_data_loader_workers)
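# --- Hedged example (illustrative, not the original helper) ------------------
# get_bag_of_word_question() is defined elsewhere; a minimal version could
# build the question vocabulary like this. It assumes `entries` maps
# question_id -> {"question": ...}, as in the VQA API's `qqa` dict.
def build_question_vocab(entries):
    vocab = {}
    for entry in entries.values():
        # crude tokenization: lowercase, strip the trailing '?', split on spaces
        for word in entry["question"].lower().rstrip("?").split():
            if word not in vocab:
                vocab[word] = len(vocab)  # assign the next free index
    return vocab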
def __init__(self, train_image_dir, train_question_path, train_annotation_path,
             test_image_dir, test_question_path, test_annotation_path,
             batch_size, num_epochs, num_data_loader_workers, cache_location,
             lr, log_validation):
    ############ 2.3 TODO: set up transform
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])
    ############
    train_dataset = VqaDataset(
        image_dir=train_image_dir,
        question_json_file_path=train_question_path,
        annotation_json_file_path=train_annotation_path,
        image_filename_pattern="COCO_train2014_{}.jpg",
        transform=transform,
        ############ 2.4 TODO: fill in the arguments
        question_word_to_id_map=None,  # None: the train set builds its own maps
        answer_to_id_map=None,
        ############
    )
    val_dataset = VqaDataset(
        image_dir=test_image_dir,
        question_json_file_path=test_question_path,
        annotation_json_file_path=test_annotation_path,
        image_filename_pattern="COCO_val2014_{}.jpg",
        transform=transform,
        ############ 2.4 TODO: fill in the arguments
        question_word_to_id_map=train_dataset.question_word_to_id_map,
        answer_to_id_map=train_dataset.answer_to_id_map,
        ############
    )
    model = SimpleBaselineNet()
    super().__init__(train_dataset, val_dataset, model, batch_size, num_epochs,
                     num_data_loader_workers)
    ############ 2.5 TODO: set up optimizer
    # Simple-baseline setting: the word-embedding layer trains with a much
    # larger learning rate (0.8) than the classifier (0.01).
    self.optimizer = torch.optim.SGD(
        [{'params': model.WordNet.parameters(), 'lr': 0.8},
         {'params': model.LinearLayer.parameters(), 'lr': 0.01}])
    ############
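# --- Hedged example -----------------------------------------------------------
# Sanity check for the per-layer learning rates used above: parameter groups
# that omit 'lr' inherit the optimizer-level default. Throwaway layers stand
# in for the real model here.
import torch

def _demo_param_groups():
    emb = torch.nn.Embedding(10, 4)  # stands in for the word-embedding layer
    fc = torch.nn.Linear(4, 2)       # stands in for the classifier
    opt = torch.optim.SGD([{'params': emb.parameters(), 'lr': 0.8},
                           {'params': fc.parameters()}],  # inherits lr below
                          lr=0.01)
    print([g['lr'] for g in opt.param_groups])  # -> [0.8, 0.01]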
def __init__(self, train_image_dir, train_question_path, train_annotation_path,
             test_image_dir, test_question_path, test_annotation_path,
             batch_size, num_epochs, num_data_loader_workers, cache_location,
             lr, log_validation):
    ############ 3.1 TODO: set up transform and image encoder
    transform = None
    image_encoder = None
    ############
    question_word_list_length = 5746
    answer_list_length = 1000
    train_dataset = VqaDataset(
        image_dir=train_image_dir,
        question_json_file_path=train_question_path,
        annotation_json_file_path=train_annotation_path,
        image_filename_pattern="COCO_train2014_{}.jpg",
        transform=transform,
        question_word_list_length=question_word_list_length,
        answer_list_length=answer_list_length,
        cache_location=os.path.join(cache_location, "tmp_train"),
        ############ 3.1 TODO: fill in the arguments
        question_word_to_id_map='change this argument',
        answer_to_id_map='change this argument',
        ############
        pre_encoder=image_encoder)
    val_dataset = VqaDataset(
        image_dir=test_image_dir,
        question_json_file_path=test_question_path,
        annotation_json_file_path=test_annotation_path,
        image_filename_pattern="COCO_val2014_{}.jpg",
        transform=transform,
        question_word_list_length=question_word_list_length,
        answer_list_length=answer_list_length,
        cache_location=os.path.join(cache_location, "tmp_val"),
        ############ 3.1 TODO: fill in the arguments
        question_word_to_id_map='change this argument',
        answer_to_id_map='change this argument',
        ############
        pre_encoder=image_encoder)
    self._model = CoattentionNet()
    super().__init__(train_dataset, val_dataset, self._model, batch_size,
                     num_epochs,
                     num_data_loader_workers=num_data_loader_workers,
                     log_validation=log_validation)  # honor the constructor argument
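# --- Hedged sketch for the 3.1 TODO (one possible choice, not the required one)
# Mirrors the feature-extraction setup used elsewhere in this repo
# (save_features): 448x448 inputs with ImageNet normalization, and a
# ResNet-152 truncated before the average pool so spatial features survive
# for co-attention.
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms

def make_transform_and_encoder():
    transform = transforms.Compose([
        transforms.Resize((448, 448)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    resnet = models.resnet152(pretrained=True)
    encoder = nn.Sequential(*list(resnet.children())[:-2]).eval()  # keep spatial map
    return transform, encoder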
def __init__(self, train_image_dir, train_question_path, train_annotation_path,
             test_image_dir, test_question_path, test_annotation_path,
             batch_size, num_epochs, num_data_loader_workers):
    self.vqa_loader = VQA(annotation_file=train_annotation_path,
                          question_file=train_question_path)
    self.entries = self.vqa_loader.qqa
    self.qa = self.vqa_loader.qa
    bag_word_question = self.get_bag_of_word_question()
    bag_word_answer = self.get_bag_of_word_answer()
    train_dataset = VqaDataset(
        image_dir=train_image_dir,
        question_json_file_path=train_question_path,
        annotation_json_file_path=train_annotation_path,
        image_filename_pattern="COCO_train2014_{}.jpg",
        bag_word_question=bag_word_question,
        bag_word_answer=bag_word_answer,
        img_dir="./data_val/train.hdf5")
    val_dataset = VqaDataset(
        image_dir=test_image_dir,
        question_json_file_path=test_question_path,
        annotation_json_file_path=test_annotation_path,
        image_filename_pattern="COCO_val2014_{}.jpg",
        bag_word_question=bag_word_question,
        bag_word_answer=bag_word_answer,
        img_dir="./data_val/val.hdf5")
    num_question = train_dataset.bag_size_question
    num_answer = train_dataset.bag_size_answer
    max_len_train = train_dataset.max_len
    max_len_val = val_dataset.max_len
    print('max_len for train: {}, max_len for val: {}'.format(
        max_len_train, max_len_val))
    self._model = CoattentionNet(num_question, num_answer, 26)
    # self.optimizer = torch.optim.SGD(self._model.parameters(), lr=0.001, momentum=0.9)
    self.optimizer = torch.optim.Adam(self._model.parameters(), lr=4e-4, eps=1e-8)
    self.criterion = nn.CrossEntropyLoss().cuda()
    super().__init__(train_dataset, val_dataset, self._model, batch_size,
                     num_epochs, num_data_loader_workers=num_data_loader_workers)
def __init__(self, train_image_dir, train_question_path, train_annotation_path,
             test_image_dir, test_question_path, test_annotation_path,
             batch_size, num_epochs, num_data_loader_workers):
    train_image_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    val_image_transform = transforms.Compose([
        transforms.Resize(256),  # resize the shorter side so the crop below always fits
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    train_dataset = VqaDataset(
        image_dir=train_image_dir,
        question_json_file_path=train_question_path,
        annotation_json_file_path=train_annotation_path,
        image_filename_pattern="COCO_train2014_{}.jpg",
        is_training=True,
        transform=train_image_transform)
    val_dataset = VqaDataset(
        image_dir=test_image_dir,
        question_json_file_path=test_question_path,
        annotation_json_file_path=test_annotation_path,
        image_filename_pattern="COCO_val2014_{}.jpg",
        is_training=False,
        transform=val_image_transform)
    model = SimpleBaselineNet(len(train_dataset.dictionary),
                              len(train_dataset.answers))
    super().__init__(train_dataset, val_dataset, model, batch_size, num_epochs,
                     num_data_loader_workers)
    self.criterion = nn.CrossEntropyLoss()
    self.optimizer = optim.SGD(
        [{'params': model.ques_feat.parameters(), 'lr': 0.8},
         {'params': model.fc.parameters()}],  # uses the default lr below
        lr=0.01,
        momentum=0.9)
def test_load_dataset(self):
    """
    This method gives you a quick way to run your dataset to make sure it
    loads files correctly. It doesn't assert a particular result from
    indexing the dataset; that will depend on your design. Feel free to fill
    in more asserts here to validate your design.
    """
    # Arrange
    current_dir = os.path.dirname(__file__)
    question_file = os.path.join(current_dir, "test_questions.json")
    annotation_file = os.path.join(current_dir, "test_annotations.json")
    vqa_dataset = VqaDataset(
        question_json_file_path=question_file,
        annotation_json_file_path=annotation_file,
        image_dir=current_dir,
        image_filename_pattern="COCO_train2014_{}.jpg",
        img_features_dir="features/img_train",
        vocab_json_filename="features/vocab.json")

    # Act
    vqa_len = len(vqa_dataset)
    dataset_item = vqa_dataset[0]

    # Assert
    self.assertEqual(vqa_len, 2)
    self.assertTrue(isinstance(dataset_item, dict))
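# --- Hedged example -----------------------------------------------------------
# The two JSON fixtures above follow the VQA annotation format; a minimal
# test_questions.json covering the two samples this test expects could look
# roughly like this (the ids and question text are made up for illustration):
_EXAMPLE_QUESTIONS_JSON = {
    "questions": [
        {"question_id": 1, "image_id": 100, "question": "What is on the table?"},
        {"question_id": 2, "image_id": 101, "question": "How many dogs are there?"},
    ]
}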
def test_use_dataset_loader(self):
    """
    Verify that the dataset can be successfully loaded using PyTorch's
    DataLoader class.
    """
    # Arrange
    current_dir = os.path.dirname(__file__)
    question_file = os.path.join(current_dir, "test_questions.json")
    annotation_file = os.path.join(current_dir, "test_annotations.json")
    vqa_dataset = VqaDataset(
        question_json_file_path=question_file,
        annotation_json_file_path=annotation_file,
        image_dir=current_dir,
        image_filename_pattern="COCO_train2014_{}.jpg",
        ques_thres=0,
        ans_thres=0,
        seq_len=20)
    dataset_loader = DataLoader(vqa_dataset, batch_size=2)

    # Act & Assert - the test will fail if iterating through the data loader fails
    for batch_id, data in enumerate(dataset_loader):
        # Visually inspect the second image in the batch (convert the CHW
        # tensor back to HWC for matplotlib).
        img = data['images'][1, :, :, :]
        img = img.numpy().transpose((1, 2, 0))
        plt.imshow(img)
        plt.show()

        # Inspect the BoW question representation and the ground-truth answer.
        print("questions ", data['questions'])
        print("questions size", data['questions'].shape)
        print("gt_answer ", data['gt_answer'])
def __init__(self, train_image_dir, train_question_path, train_annotation_path,
             test_image_dir, test_question_path, test_annotation_path,
             batch_size, num_epochs, num_data_loader_workers, preprocessing):
    train_h5_path = "./features/train_feat_googlenet.h5"
    test_h5_path = "./features/test_feat_googlenet.h5"
    embedding_size = 1024
    train_dataset = VqaDataset(
        image_dir=train_image_dir,
        question_json_file_path=train_question_path,
        annotation_json_file_path=train_annotation_path,
        image_filename_pattern="COCO_train2014_{}.jpg",
        existing_format=None,
        prepro=preprocessing,
        prepro_path=train_h5_path)
    val_dataset = VqaDataset(
        image_dir=test_image_dir,
        question_json_file_path=test_question_path,
        annotation_json_file_path=test_annotation_path,
        image_filename_pattern="COCO_val2014_{}.jpg",
        existing_format=train_dataset,
        prepro=preprocessing,
        prepro_path=test_h5_path)
    model = SimpleBaselineNet(vocab_size=train_dataset.quesVecSize,
                              embedding_size=embedding_size,
                              ans_size=train_dataset.ansVecSize)
    super().__init__(train_dataset, val_dataset, model, batch_size, num_epochs,
                     num_data_loader_workers, preprocessing)
    self.optimizer = torch.optim.SGD(
        [{'params': model.embedding.parameters(), 'lr': 0.8},
         {'params': model.softmax.parameters()},
         {'params': model.linear.parameters()}],
        lr=1e-2,
        momentum=0.9)
def __init__(self, train_image_dir, train_question_path, train_annotation_path,
             train_img_feat_path, test_image_dir, test_question_path,
             test_annotation_path, test_img_feat_path, vocab_path, batch_size,
             num_epochs, num_data_loader_workers):
    train_dataset = VqaDataset(
        image_dir=train_image_dir,
        question_json_file_path=train_question_path,
        annotation_json_file_path=train_annotation_path,
        image_filename_pattern="COCO_train2014_{}.jpg",
        img_features_dir=train_img_feat_path,
        vocab_json_filename=vocab_path)
    val_dataset = VqaDataset(
        image_dir=test_image_dir,
        question_json_file_path=test_question_path,
        annotation_json_file_path=test_annotation_path,
        image_filename_pattern="COCO_val2014_{}.jpg",
        img_features_dir=test_img_feat_path,
        vocab_json_filename=vocab_path)
    img_feat_size = 1024  # TODO: better way to do this
    ques_embedding_lr = 0.8
    classifier_lr = 0.01
    q_vocab_size = train_dataset.q_vocab_size
    a_vocab_size = train_dataset.a_vocab_size
    model = SimpleBaselineNet(img_feat_size, q_vocab_size, a_vocab_size)
    self.optimizer = torch.optim.SGD(
        [{'params': model.fc_ques.parameters(), 'lr': ques_embedding_lr},
         {'params': model.classifier.parameters(), 'lr': classifier_lr}])
    self.criterion = torch.nn.CrossEntropyLoss()
    super().__init__(train_dataset, val_dataset, model, batch_size, num_epochs,
                     num_data_loader_workers)
def __init__(self, train_image_dir, train_question_path, train_annotation_path,
             train_img_feat_path, test_image_dir, test_question_path,
             test_annotation_path, test_img_feat_path, vocab_path, batch_size,
             num_epochs, num_data_loader_workers):
    train_dataset = VqaDataset(
        image_dir=train_image_dir,
        question_json_file_path=train_question_path,
        annotation_json_file_path=train_annotation_path,
        image_filename_pattern="COCO_train2014_{}.jpg",
        img_features_dir=train_img_feat_path,
        vocab_json_filename=vocab_path)
    val_dataset = VqaDataset(
        image_dir=test_image_dir,
        question_json_file_path=test_question_path,
        annotation_json_file_path=test_annotation_path,
        image_filename_pattern="COCO_val2014_{}.jpg",
        img_features_dir=test_img_feat_path,
        vocab_json_filename=vocab_path)
    img_feat_size = 512  # TODO: better way to do this
    embedding_size = 512
    q_vocab_size = train_dataset.q_vocab_size
    a_vocab_size = train_dataset.a_vocab_size
    self._model = CoattentionNet(img_feat_size, embedding_size, q_vocab_size,
                                 a_vocab_size).cuda()
    params = self._model.parameters()
    # self.optimizer = torch.optim.RMSprop(params=params, lr=4e-4, weight_decay=1e-8, momentum=0.99)
    self.optimizer = torch.optim.Adam(params=params, lr=1e-4)
    self.criterion = torch.nn.CrossEntropyLoss().cuda()
    super().__init__(train_dataset, val_dataset, self._model, batch_size,
                     num_epochs, num_data_loader_workers=num_data_loader_workers)
def save_features(args):
    transform = transforms.Compose([
        transforms.Resize((448, 448)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    train_dataset = VqaDataset(
        image_dir=args.train_image_dir,
        question_json_file_path=args.train_question_path,
        annotation_json_file_path=args.train_annotation_path,
        image_filename_pattern="COCO_train2014_{}.jpg",
        is_training=True,
        transform=transform)
    val_dataset = VqaDataset(
        image_dir=args.test_image_dir,
        question_json_file_path=args.test_question_path,
        annotation_json_file_path=args.test_annotation_path,
        image_filename_pattern="COCO_val2014_{}.jpg",
        is_training=False,
        transform=transform)
    train_data = DataLoader(train_dataset, batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.num_data_loader_workers)
    val_data = DataLoader(val_dataset, batch_size=args.batch_size,
                          shuffle=False,
                          num_workers=args.num_data_loader_workers)

    # ResNet-152 truncated before the average pool, so the saved features
    # keep their spatial layout.
    model = models.resnet152(pretrained=True)
    model = nn.Sequential(*list(model.children())[:-2])
    if torch.cuda.is_available():
        model = model.cuda()
    model.eval()

    with torch.no_grad():  # feature extraction only; no gradients needed
        for batch_id, batch_data in enumerate(train_data):
            print('Training data {}/{}'.format(batch_id, len(train_data)))
            extract_features(model, batch_data,
                             os.path.join(args.output_path, 'train2014'))
        for batch_id, batch_data in enumerate(val_data):
            print('Validation data {}/{}'.format(batch_id, len(val_data)))
            extract_features(model, batch_data,
                             os.path.join(args.output_path, 'val2014'))
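# --- Hedged sketch -------------------------------------------------------------
# extract_features() used above is defined elsewhere; a minimal version
# consistent with this loop might encode the batch and save one .npy file per
# image. The batch keys ('images', 'image_ids') and the zero-padded COCO
# filenames are assumptions for illustration, not the actual interface.
import numpy as np

def extract_features_sketch(model, batch_data, output_dir):
    images = batch_data['images']
    if torch.cuda.is_available():
        images = images.cuda()
    feats = model(images).cpu().numpy()  # (B, 2048, 14, 14) for 448x448 inputs
    os.makedirs(output_dir, exist_ok=True)
    for feat, image_id in zip(feats, batch_data['image_ids']):
        fname = "COCO_{}_{:012d}_resnet_feature.npy".format(
            os.path.basename(output_dir), int(image_id))
        np.save(os.path.join(output_dir, fname), feat)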
def __init__(self, train_image_dir, train_question_path, train_annotation_path,
             test_image_dir, test_question_path, test_annotation_path,
             batch_size, num_epochs, num_data_loader_workers, cache_location,
             lr, log_validation):
    ############ 2.3 TODO: set up transform
    transform = None
    ############
    train_dataset = VqaDataset(
        image_dir=train_image_dir,
        question_json_file_path=train_question_path,
        annotation_json_file_path=train_annotation_path,
        image_filename_pattern="COCO_train2014_{}.jpg",
        transform=transform,
        ############ 2.4 TODO: fill in the arguments
        question_word_to_id_map='change this argument',
        answer_to_id_map='change this argument',
        ############
    )
    val_dataset = VqaDataset(
        image_dir=test_image_dir,
        question_json_file_path=test_question_path,
        annotation_json_file_path=test_annotation_path,
        image_filename_pattern="COCO_val2014_{}.jpg",
        transform=transform,
        ############ 2.4 TODO: fill in the arguments
        question_word_to_id_map='change this argument',
        answer_to_id_map='change this argument',
        ############
    )
    model = SimpleBaselineNet()
    super().__init__(train_dataset, val_dataset, model, batch_size, num_epochs,
                     num_data_loader_workers)
def test_use_dataset_loader(self):
    """
    Verify that the dataset can be successfully loaded using PyTorch's
    DataLoader class.
    """
    # Arrange
    current_dir = os.path.dirname(__file__)
    question_file = os.path.join(current_dir, "test_questions.json")
    annotation_file = os.path.join(current_dir, "test_annotations.json")
    vqa_dataset = VqaDataset(
        question_json_file_path=question_file,
        annotation_json_file_path=annotation_file,
        image_dir=current_dir,
        image_filename_pattern="COCO_train2014_{}.jpg")
    dataset_loader = DataLoader(vqa_dataset, batch_size=2)

    # Act & Assert - the test will fail if iterating through the data loader fails
    for batch_id, data in enumerate(dataset_loader):
        # Not doing anything here. Feel free to fill this in, if you like.
        pass