def main():
    """Train a semantic<->visual encoder/decoder pair with a triplet loss in
    visual space and evaluate zero-shot classification on the test split.

    sys.argv[1] is the directory containing 'train_set' and 'test_set';
    all remaining hyper-parameters come from parse_args().
    """
    torch.manual_seed(1)  # fixed seed for reproducibility
    data_path = sys.argv[1]
    options, _ = parse_args()

    train_path = data_path + 'train_set'
    test_path = data_path + 'test_set'
    trainset = ZSLDataset(train_path, use_irevnet=not options.use_resnet)
    testset = ZSLDataset(test_path, use_irevnet=not options.use_resnet)

    num_classes = trainset.classes.shape[0]
    classes_enum = torch.tensor(np.array(range(num_classes),
                                         dtype=np.int64)).cuda()
    dim_semantic = trainset[0]['class_embedding'].shape[0]
    dim_visual = trainset[0]['image_embedding'].shape[0]
    dim_attributes = trainset[0]['class_predicates'].shape[0]

    all_class_embeddings = torch.tensor(
        np.array(trainset.class_embeddings)).float().cuda()
    all_class_predicates = torch.tensor(
        np.array(trainset.class_predicates)).float().cuda()
    all_train_image_embeddings = torch.tensor(
        np.array(trainset.image_embeddings)).cuda().float()
    # Dataset class ids are 1-based; shift to 0-based labels.
    all_train_labels = torch.tensor(
        trainset.labels['class_id'].values).cuda() - 1

    # Center the visual embeddings, then collapse each class to the
    # per-dimension median of its (centered) image embeddings.
    image_mean = all_train_image_embeddings.mean(0)
    all_train_image_embeddings = all_train_image_embeddings - image_mean
    mask = all_train_labels.unsqueeze(0) == classes_enum.unsqueeze(1)
    # FIX: iterate labels in sorted order so the stacked medians line up
    # with all_train_labels.unique() below, which returns *sorted* values.
    # The original iterated a raw set, whose order is not guaranteed.
    all_train_image_embeddings = torch.stack([
        torch.median(all_train_image_embeddings[mask[i]], dim=0)[0]
        for i in sorted(set(all_train_labels.tolist()))
    ])
    # Having reduced all image embeddings of a class to a single embedding,
    # we remove all duplicate labels here (unique() returns them sorted).
    all_train_labels = all_train_labels.unique()

    # Mask of classes that actually occur in the test set; at evaluation
    # time predictions are restricted to these classes.
    query_ids = set([testset[i]['class_id'] for i in range(len(testset))])
    ids = list(i - 1 for i in query_ids)
    query_mask = np.zeros(num_classes)
    query_mask[ids] = 1
    query_mask = torch.tensor(query_mask, dtype=torch.int64).cuda()

    v_to_s = DecoderAttributes(dim_source=dim_visual,
                               dim_target1=dim_attributes,
                               dim_target2=dim_semantic,
                               width=512).cuda()
    s_to_v = EncoderAttributes(dim_source1=dim_semantic,
                               dim_source2=dim_attributes,
                               dim_target=dim_visual,
                               width=512).cuda()

    if options.optimizer == 'adam':
        optimizer = torch.optim.Adam(list(v_to_s.parameters()) +
                                     list(s_to_v.parameters()),
                                     lr=options.learning_rate,
                                     betas=(0.9, 0.999),
                                     weight_decay=options.weight_decay)
    else:
        optimizer = torch.optim.SGD(list(v_to_s.parameters()) +
                                    list(s_to_v.parameters()),
                                    lr=options.learning_rate,
                                    momentum=options.momentum,
                                    weight_decay=options.weight_decay,
                                    nesterov=True)

    positive_part = torch.nn.ReLU().cuda()  # hinge for the triplet loss

    trainloader = DataLoader(trainset,
                             batch_size=options.batch_size,
                             shuffle=True,
                             num_workers=4,
                             pin_memory=True,
                             drop_last=True)
    # NOTE(review): shuffle/drop_last on the *test* loader means accuracy is
    # computed on a random subset whenever len(testset) % batch_size != 0.
    testloader = DataLoader(testset,
                            batch_size=options.batch_size,
                            shuffle=True,
                            num_workers=4,
                            pin_memory=True,
                            drop_last=True)

    alpha1 = options.alphas[0]  # triplet
    alpha2 = options.alphas[1]  # surjection
    alpha3 = options.alphas[2]  # l2 regularization
    gamma = options.gamma
    margin = options.margin

    # Main Loop
    for e in range(options.n_epochs):
        v_to_s = v_to_s.train()
        s_to_v = s_to_v.train()
        running_loss = 0
        for i, sample in enumerate(trainloader):
            optimizer.zero_grad()
            batch_classes = sample['class_id'].cuda() - 1
            batch_semantic = sample['class_embedding'].cuda().float()
            batch_predicates = sample['class_predicates'].cuda().float()

            # Surjection (round-trip reconstruction) loss: semantic ->
            # visual -> semantic should recover both the class embeddings
            # (e_hat[1]) and the class predicates (e_hat[0]).
            e_hat = v_to_s(s_to_v(all_class_embeddings, all_class_predicates))
            delta = (e_hat[1] - all_class_embeddings)
            surjection_loss = (delta * delta).sum(dim=-1).mean()
            delta = (e_hat[0] - all_class_predicates)
            surjection_loss = (1 - gamma) * surjection_loss + gamma * (
                delta * delta).sum(dim=-1).mean()

            # Triplet loss in visual space
            same_class = all_train_labels.unsqueeze(
                0) == batch_classes.unsqueeze(1)
            same_class = same_class.detach()
            v_out = s_to_v(batch_semantic, batch_predicates)
            d_matrix_v = dist_cos_matrix(v_out, all_train_image_embeddings)
            # The large additive constant pushes same-class entries out of
            # the min when searching for the hardest negative.
            closest_negative, _ = (d_matrix_v +
                                   same_class.float() * 1e6).min(dim=-1)
            furthest_positive, _ = (d_matrix_v *
                                    same_class.float()).max(dim=-1)
            l2_loss = (v_out * v_out).sum(dim=-1).mean()
            loss = positive_part(furthest_positive - closest_negative + margin)
            loss = alpha1 * loss.mean(
            ) + alpha2 * surjection_loss + alpha3 * l2_loss
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print('Training Loss epoch {0}: {1}'.format(
            e + 1, running_loss / len(trainloader)))

        # Step decay of the learning rate every 50 epochs.
        if (e + 1) % 50 == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] = param_group['lr'] * 0.7

        # NOTE(review): reconstructed as a per-epoch evaluation; the
        # original formatting was lost, so confirm this was not intended
        # to run once after the training loop instead.
        print('-----------------------------')
        print('\nEvaluation on test data: \n')
        avg_accuracy = 0.
        n = 0
        v_to_s = v_to_s.eval()
        s_to_v = s_to_v.eval()
        with torch.no_grad():
            # Project every class into visual space once per evaluation.
            # FIX: moved inside no_grad so no autograd graph is retained.
            v_out = s_to_v(all_class_embeddings, all_class_predicates)
            for i, sample in enumerate(testloader):
                n += 1
                batch_classes = sample['class_id'].cuda() - 1
                batch_visual = sample['image_embedding'].cuda().float(
                ) - image_mean
                d_matrix_v = dist_cos_matrix(batch_visual, v_out)
                # Nearest class prototype, restricted to test-set classes.
                # (Removed a dead 'same_class' computation the original
                # performed here without ever using it.)
                c_hat = (d_matrix_v +
                         (1 - query_mask).float() * 1e9).argmin(dim=-1)
                avg_accuracy += (c_hat == batch_classes).float().mean().item()
        avg_accuracy /= n
        print('Accuracy: {0}'.format(avg_accuracy))
        print('-----------------------------')
def main():
    """Run 3-fold cross-validation of the visual->semantic projection model.

    sys.argv[1] is the directory holding 'train_set_{1,2,3}' and
    'val_set_{1,2,3}'.  A fresh model/optimizer pair is trained per split
    and evaluated on the matching validation split every 5 epochs.
    """
    torch.manual_seed(1)  # fixed seed for reproducibility
    data_path = sys.argv[1]
    options, _ = parse_args()

    train_path = data_path + 'train_set'
    validation_path = data_path + 'val_set'
    train_sets = [
        ZSLDataset(train_path + suffix, use_irevnet=not options.use_resnet)
        for suffix in ('_1', '_2', '_3')
    ]
    validation_sets = [
        ZSLDataset(validation_path + suffix,
                   use_irevnet=not options.use_resnet)
        for suffix in ('_1', '_2', '_3')
    ]

    for split in range(3):
        trainloader = DataLoader(train_sets[split],
                                 batch_size=options.batch_size,
                                 shuffle=True,
                                 num_workers=4,
                                 pin_memory=True,
                                 drop_last=True)
        # NOTE(review): shuffle/drop_last on the validation loader means the
        # metrics are computed on a random subset whenever the split size is
        # not a multiple of batch_size.
        testloader = DataLoader(validation_sets[split],
                                batch_size=options.batch_size,
                                shuffle=True,
                                num_workers=4,
                                pin_memory=True,
                                drop_last=True)

        num_classes = train_sets[split].classes.shape[0]
        classes_enum = torch.tensor(
            np.array(range(num_classes), dtype=np.int64)).cuda()
        dim_semantic = train_sets[split][0]['class_embedding'].shape[0]
        dim_visual = train_sets[split][0]['image_embedding'].shape[0]
        dim_attributes = train_sets[split][0]['class_predicates'].shape[0]

        all_class_embeddings = torch.tensor(
            np.array(train_sets[split].class_embeddings)).float().cuda()
        all_class_predicates = torch.tensor(
            np.array(train_sets[split].class_predicates)).float().cuda()

        # Mask of classes present in this validation split; predictions are
        # restricted to them at evaluation time (class ids are 1-based).
        query_ids = set([
            validation_sets[split][i]['class_id']
            for i in range(len(validation_sets[split]))
        ])
        ids = list(i - 1 for i in query_ids)
        query_mask = np.zeros(num_classes)
        query_mask[ids] = 1
        query_mask = torch.tensor(query_mask, dtype=torch.int64).cuda()

        gamma = options.gamma
        alpha1 = options.alphas[0]  # triplet
        alpha2 = options.alphas[1]  # surjection
        alpha3 = options.alphas[2]  # l2 regularization

        v_to_s = DecoderAttributes(dim_source=dim_visual,
                                   dim_target1=dim_attributes,
                                   dim_target2=dim_semantic,
                                   width=512).cuda()
        s_to_v = EncoderAttributes(dim_source1=dim_semantic,
                                   dim_source2=dim_attributes,
                                   dim_target=dim_visual,
                                   width=512).cuda()
        if options.optimizer == 'adam':
            optimizer = torch.optim.Adam(list(v_to_s.parameters()) +
                                         list(s_to_v.parameters()),
                                         lr=options.learning_rate,
                                         betas=(0.9, 0.999),
                                         weight_decay=options.weight_decay)
        else:
            optimizer = torch.optim.SGD(list(v_to_s.parameters()) +
                                        list(s_to_v.parameters()),
                                        lr=options.learning_rate,
                                        momentum=options.momentum,
                                        weight_decay=options.weight_decay,
                                        nesterov=True)

        for e in range(options.n_epochs):
            v_to_s = v_to_s.train()
            s_to_v = s_to_v.train()
            running_loss = 0
            for i, sample in enumerate(trainloader):
                optimizer.zero_grad()
                batch_visual = sample['image_embedding'].cuda().float()
                batch_classes = sample['class_id'].cuda() - 1

                # Surjection (round-trip reconstruction) loss over all class
                # embeddings/predicates through the s->v->s cycle.
                e_hat = v_to_s(
                    s_to_v(all_class_embeddings, all_class_predicates))
                delta = (e_hat[1] - all_class_embeddings)
                surjection_loss = (delta * delta).sum(dim=-1).mean()
                delta = (e_hat[0] - all_class_predicates)
                surjection_loss = (1 - gamma) * surjection_loss + gamma * (
                    delta * delta).sum(dim=-1).mean()

                # Distance between projected images and the class prototypes;
                # gamma blends word-embedding space vs. attribute space.
                s_out = v_to_s(batch_visual)
                s_attr, s_word = s_out
                same_class = classes_enum.unsqueeze(
                    0) == batch_classes.unsqueeze(1)
                same_class = same_class.detach()
                d_matrix = (1 - gamma) * dist_matrix(
                    s_word, all_class_embeddings) + gamma * dist_matrix(
                        s_attr, all_class_predicates)
                l2_dist_loss = (d_matrix * same_class.float()).mean()
                l2_loss = (1 - gamma) * (s_word * s_word).sum(dim=-1).mean(
                ) + gamma * (s_attr * s_attr).sum(dim=-1).mean()
                loss = alpha1 * l2_dist_loss + alpha2 * surjection_loss \
                    + alpha3 * l2_loss
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
            print('Training Loss epoch {0}: {1}'.format(
                e + 1, running_loss / len(trainloader)))

            # Step decay of the learning rate every 50 epochs.
            if (e + 1) % 50 == 0:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = param_group['lr'] * 0.7

            # Evaluate on the held-out split every 5 epochs.
            if (e + 1) % 5 == 0:
                print('\n\n- Evaluation on epoch {}'.format(e + 1))
                avg_accuracy = 0.
                avg_loss = 0.
                n = 0
                v_to_s = v_to_s.eval()
                s_to_v = s_to_v.eval()
                with torch.no_grad():
                    for i, sample in enumerate(testloader):
                        n += 1
                        batch_visual = sample['image_embedding'].float().cuda()
                        batch_classes = sample['class_id'].cuda() - 1
                        s_out = v_to_s(batch_visual)
                        s_attr, s_word = s_out
                        same_class = classes_enum.unsqueeze(
                            0) == batch_classes.unsqueeze(1)
                        same_class = same_class.detach()
                        d_matrix = (1 - gamma) * dist_matrix(
                            s_word,
                            all_class_embeddings) + gamma * dist_matrix(
                                s_attr, all_class_predicates)
                        # Nearest class, restricted to validation classes.
                        c_hat = (d_matrix + (1 - query_mask).float() *
                                 1e9).argmin(dim=-1)
                        # FIX: removed a dead 'closest_negative' min the
                        # original computed here without ever using it.
                        furthest_positive, _ = (
                            d_matrix * same_class.float()).max(dim=-1)
                        loss = alpha1 * furthest_positive.mean()
                        avg_loss += loss.item()
                        avg_accuracy += (
                            c_hat == batch_classes).float().mean().item()
                avg_accuracy /= n
                avg_loss /= n
                print('Average acc.: {}, Average loss:{}\n\n'.format(
                    avg_accuracy, avg_loss))
        print('Split {0} done.'.format(split + 1))
def main():
    """Train the visual->semantic projection on AwA2 and track test accuracy.

    Paths are chosen by options.leonhard (cluster vs. local layout).  The
    model is evaluated every 5 epochs; accuracies recorded after epoch 50
    are averaged into the final 'Mean Accuracy' report.
    """
    options = parse_args()

    # Load Data
    if options.leonhard:
        train_path = 'ZSL_Data/AwA2_train'
        test_path = 'ZSL_Data/AwA2_test'
    else:
        train_path = 'Data/AwA2/train_set'
        test_path = 'Data/AwA2/test_set'
    trainset = ZSLDataset(train_path,
                          use_predicates=True,
                          use_irevnet=options.use_irevnet)
    testset = ZSLDataset(test_path,
                         use_predicates=True,
                         use_irevnet=options.use_irevnet)

    num_classes = trainset.classes.shape[0]
    dim_semantic = trainset[0]['class_embedding'].shape[0]
    dim_visual = trainset[0]['image_embedding'].shape[0]
    dim_attributes = trainset[0]['class_predicates'].shape[0]

    all_class_embeddings = torch.tensor(
        np.array(trainset.class_embeddings)).float().cuda()
    all_class_predicates = torch.tensor(
        np.array(trainset.class_predicates)).float().cuda()
    classes_enum = torch.tensor(np.array(range(num_classes),
                                         dtype=np.int64)).cuda()

    # Mask of classes present in the test set; predictions are restricted
    # to them at evaluation time (class ids are 1-based).
    query_ids = set([testset[i]['class_id'] for i in range(len(testset))])
    ids = list(i - 1 for i in query_ids)
    query_mask = np.zeros(num_classes)
    query_mask[ids] = 1
    query_mask = torch.tensor(query_mask, dtype=torch.int64).cuda()

    v_to_s = DecoderAttributes(dim_source=dim_visual,
                               dim_target1=dim_attributes,
                               dim_target2=dim_semantic,
                               width=512).cuda()
    s_to_v = EncoderAttributes(dim_source1=dim_semantic,
                               dim_source2=dim_attributes,
                               dim_target=dim_visual,
                               width=512).cuda()

    # NOTE(review): weight decay is hard-coded here (3e-3 Adam, 5e-3 SGD),
    # unlike the sibling scripts that read options.weight_decay — confirm
    # this is intentional.
    if options.optimizer == 'adam':
        optimizer = torch.optim.Adam(list(v_to_s.parameters()) +
                                     list(s_to_v.parameters()),
                                     lr=options.learning_rate,
                                     betas=(0.9, 0.999),
                                     weight_decay=3e-3)
    else:
        optimizer = torch.optim.SGD(list(v_to_s.parameters()) +
                                    list(s_to_v.parameters()),
                                    lr=options.learning_rate,
                                    momentum=options.momentum,
                                    weight_decay=5e-3,
                                    nesterov=True)

    trainloader = DataLoader(trainset,
                             batch_size=options.batch_size,
                             shuffle=True,
                             num_workers=4,
                             pin_memory=True,
                             drop_last=True)
    # NOTE(review): shuffle/drop_last on the *test* loader means accuracy is
    # computed on a random subset whenever len(testset) % batch_size != 0.
    testloader = DataLoader(testset,
                            batch_size=options.batch_size,
                            shuffle=True,
                            num_workers=4,
                            pin_memory=True,
                            drop_last=True)

    gamma = options.gamma
    alpha1 = options.alphas[0]  # l2
    alpha2 = options.alphas[1]  # surjection
    alpha3 = options.alphas[2]  # l2 regularization

    validation_accuracy = []
    for e in range(options.n_epochs):
        v_to_s = v_to_s.train()
        s_to_v = s_to_v.train()
        running_loss = 0
        for i, sample in enumerate(trainloader):
            optimizer.zero_grad()
            batch_visual = sample['image_embedding'].cuda().float()
            batch_classes = sample['class_id'].cuda() - 1

            # Surjection (round-trip reconstruction) loss over all class
            # embeddings/predicates through the s->v->s cycle.
            e_hat = v_to_s(s_to_v(all_class_embeddings, all_class_predicates))
            delta = (e_hat[1] - all_class_embeddings)
            surjection_loss = (delta * delta).sum(dim=-1).mean()
            delta = (e_hat[0] - all_class_predicates)
            surjection_loss = (1 - gamma) * surjection_loss + gamma * (
                delta * delta).sum(dim=-1).mean()

            # Distance between projected images and the class prototypes;
            # gamma blends word-embedding space vs. attribute space.
            s_out = v_to_s(batch_visual)
            s_attr, s_word = s_out
            same_class = classes_enum.unsqueeze(0) == batch_classes.unsqueeze(
                1)
            same_class = same_class.detach()
            d_matrix = (1 - gamma) * dist_matrix(
                s_word, all_class_embeddings) + gamma * dist_matrix(
                    s_attr, all_class_predicates)
            l2_dist_loss = (d_matrix * same_class.float()).mean()
            l2_loss = (1 - gamma) * (s_word * s_word).sum(
                dim=-1).mean() + gamma * (s_attr * s_attr).sum(dim=-1).mean()
            loss = alpha1 * l2_dist_loss + alpha2 * surjection_loss \
                + alpha3 * l2_loss
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        # FIX: report 1-based epochs like the sibling scripts (the original
        # printed the 0-based index while scheduling on e + 1).
        print('Training Loss epoch {0}: {1}'.format(
            e + 1, running_loss / len(trainloader)))

        # Step decay of the learning rate every 70 epochs.
        if (e + 1) % 70 == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] = param_group['lr'] * 0.7

        # Evaluate on the test set every 5 epochs.
        if (e + 1) % 5 == 0:
            print('\n\n- Evaluation on epoch {}'.format(e + 1))
            avg_accuracy = 0.
            avg_loss = 0.
            n = 0
            v_to_s = v_to_s.eval()
            s_to_v = s_to_v.eval()
            with torch.no_grad():
                for i, sample in enumerate(testloader):
                    n += 1
                    batch_visual = sample['image_embedding'].cuda().float()
                    batch_classes = sample['class_id'].cuda() - 1
                    s_out = v_to_s(batch_visual)
                    s_attr, s_word = s_out
                    same_class = classes_enum.unsqueeze(
                        0) == batch_classes.unsqueeze(1)
                    same_class = same_class.detach()
                    d_matrix = (1 - gamma) * dist_matrix(
                        s_word, all_class_embeddings) + gamma * dist_matrix(
                            s_attr, all_class_predicates)
                    # Nearest class, restricted to test-set classes.
                    c_hat = (d_matrix +
                             (1 - query_mask).float() * 1e9).argmin(dim=-1)
                    l2_dist_loss = (d_matrix * same_class.float()).mean()
                    loss = alpha1 * l2_dist_loss
                    avg_loss += loss.item()
                    avg_accuracy += (
                        c_hat == batch_classes).float().mean().item()
            avg_accuracy /= n
            avg_loss /= n
            # Only accuracies from the (roughly converged) tail are kept
            # for the final mean.
            if e > 50:
                validation_accuracy.append(avg_accuracy)
            print('Average acc.: {}, Average loss:{}\n\n'.format(
                avg_accuracy, avg_loss))
    # FIX: guard the empty case — np.mean([]) returns nan with a
    # RuntimeWarning; print nan directly instead when nothing was recorded.
    if validation_accuracy:
        print('Mean Accuracy: {0}'.format(np.mean(validation_accuracy)))
    else:
        print('Mean Accuracy: {0}'.format(float('nan')))