    # Tail of the split-learning global-round loop (the enclosing function
    # definition is not shown in this excerpt): each client trains in turn on
    # the client-side weights passed along from the previous client.
    for client in clients:
        print(f"| Global Round: {epoch} | client index: {index} |")
        client.train(client_w)
        client_w = client.get_weight()
        index += 1
    return client_w, server.get_weight()


if __name__ == "__main__":
    args = parse_args()

    data_name = 'cifar10' if not args.cifar100 else 'cifar100'
    num_classes = 10 if not args.cifar100 else 100
    TAG = 'multi-mixsl-mixsum' + str(args.mix_num) + '-' + data_name + '-' + args.name
    print(f'{TAG}: training start....')

    setup_seed(args.seed, args.gpu > -1)

    logs = []

    if args.cifar100:
        train_dataset, test_dataset = get_cifar100(args.balanced)
    else:
        train_dataset, test_dataset = get_cifar10(args.balanced)

    user_groups = random_avg_strategy(train_dataset, args.num_users)
    cls_num_per_clients = count_class_num_per_client(train_dataset, user_groups, 100)

    logs_file = TAG

    # Split the model: feature extractor on the clients, classifier on the server.
    client_part = ResNet18_Extractor()
    server_part = ResNet18_Classifer(num_classes)
    # client_part = ResNet34_Extractor()
    # server_part = ResNet34_Classifer(num_classes)
    # client_part, server_part = get_split_vgg16(num_classes)
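
# --- Hedged example (assumption, not part of the original repo) ---
# `random_avg_strategy(train_dataset, args.num_users)` is called above but not
# defined in this excerpt. A minimal sketch of what such an IID partitioner
# typically looks like, assuming it returns a dict mapping each client index to
# a set of dataset indices:
import numpy as np

def random_avg_strategy_sketch(dataset, num_users):
    """Randomly split dataset indices into `num_users` equally sized shards."""
    num_items = len(dataset) // num_users
    all_idxs = np.random.permutation(len(dataset))
    return {i: set(all_idxs[i * num_items:(i + 1) * num_items])
            for i in range(num_users)}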
def train(local_rank, args):
    setup_seed(args.seed)

    # Global rank of this process: node index * GPUs per node + local GPU index.
    rank = args.nr * args.gpus + local_rank

    saved_model_dir, _ = os.path.split(args.checkpoint)
    if not os.path.isdir(saved_model_dir):
        os.makedirs(saved_model_dir)

    src_data, src_vocab = load_corpus_data(args.src_path, args.src_language, args.start_token,
                                           args.end_token, args.mask_token, args.src_vocab_path,
                                           args.rebuild_vocab, args.unk, args.threshold)
    tgt_data, tgt_vocab = load_corpus_data(args.tgt_path, args.tgt_language, args.start_token,
                                           args.end_token, args.mask_token, args.tgt_vocab_path,
                                           args.rebuild_vocab, args.unk, args.threshold)

    args.src_vocab_size = len(src_vocab)
    args.tgt_vocab_size = len(tgt_vocab)

    logging.info("Source language vocab size: {}".format(len(src_vocab)))
    logging.info("Target language vocab size: {}".format(len(tgt_vocab)))

    assert len(src_data) == len(tgt_data)

    if args.sort_sentence_by_length:
        src_data, tgt_data = sort_src_sentence_by_length(list(zip(src_data, tgt_data)))

    logging.info("Transformer")

    max_src_len = max(len(line) for line in src_data)
    max_tgt_len = max(len(line) for line in tgt_data)
    args.max_src_len = max_src_len
    args.max_tgt_len = max_tgt_len

    padding_value = src_vocab.get_index(args.mask_token)
    assert padding_value == tgt_vocab.get_index(args.mask_token)
    args.padding_value = padding_value

    logging.info("Multi GPU training")
    dist.init_process_group(backend="nccl", init_method=args.init_method, rank=rank,
                            world_size=args.world_size)

    device = torch.device("cuda", local_rank)
    torch.cuda.set_device(device)

    if args.load:
        logging.info("Load existing model from {}".format(args.load))
        s2s, optimizer_state_dict = load_transformer(args, training=True, device=device)
        s2s = nn.parallel.DistributedDataParallel(s2s, device_ids=[local_rank])
        optimizer = get_optimizer(s2s.parameters(), args)
        optimizer.load_state_dict(optimizer_state_dict)
    else:
        logging.info("New model")
        s2s = build_transformer(args, device)
        s2s.init_parameters()
        s2s = nn.parallel.DistributedDataParallel(s2s, device_ids=[local_rank])
        optimizer = get_optimizer(s2s.parameters(), args)

    s2s.train()

    if args.label_smoothing:
        logging.info("Label Smoothing!")
        criterion = LabelSmoothingLoss(args.label_smoothing, padding_value)
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=padding_value)

    train_data = NMTDataset(src_data, tgt_data)

    # Release CPU memory held by the raw token lists.
    del src_data
    del tgt_data

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_data, num_replicas=args.world_size, rank=rank)
    train_loader = DataLoader(train_data, args.batch_size, shuffle=False, sampler=train_sampler,
                              drop_last=True, pin_memory=True,
                              collate_fn=lambda batch: collate(batch, padding_value,
                                                               batch_first=True))

    for i in range(args.start_epoch, args.end_epoch):
        # Re-seed the sampler so each epoch sees a different shard ordering.
        train_sampler.set_epoch(i)

        epoch_loss = 0.0
        start_time = time.time()
        steps = 0

        for j, (input_batch, target_batch) in enumerate(train_loader):
            # Gradient accumulation: only step the optimizer every update_freq batches.
            need_update = (args.update_freq == 1) or ((j + 1) % args.update_freq == 0)

            input_batch = input_batch.to(device, non_blocking=True)
            target_batch = target_batch.to(device, non_blocking=True)

            # Teacher forcing: feed the target shifted right, predict it shifted left.
            output = s2s(input_batch, target_batch[:, :-1])
            del input_batch

            output = output.view(-1, output.size(-1))
            target_batch = target_batch[:, 1:].contiguous().view(-1)

            batch_loss = criterion(output, target_batch)
            del target_batch
            del output

            # Synchronize all processes. Gradient synchronization communications
            # take place during the backward pass and overlap with the backward
            # computation. When backward() returns, param.grad already contains
            # the synchronized gradient tensor.
            dist.barrier()
            batch_loss.backward()

            if need_update:
                optimizer.step()
                optimizer.zero_grad()

            batch_loss = batch_loss.item()
            epoch_loss += batch_loss
            steps += 1

        # Apply any gradients still accumulated after the last full update_freq
        # cycle (the original checked (steps + 1) % update_freq, which misfires
        # by one batch; steps % update_freq is the intended leftover condition).
        if steps % args.update_freq != 0:
            optimizer.step()
            optimizer.zero_grad()

        epoch_loss /= steps
        epoch_ppl = math.exp(epoch_loss)

        logging.info("Epoch: {}, time: {} seconds, loss: {}, perplexity: {}, local rank: {}"
                     .format(i, time.time() - start_time, epoch_loss, epoch_ppl, local_rank))

        if local_rank == 0:
            torch.save(save_transformer(s2s, optimizer, args),
                       "{}_{}_{}".format(args.checkpoint, i, steps))

    torch.save(save_transformer(s2s, optimizer, args),
               args.checkpoint + "_rank{}".format(local_rank))
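
# --- Hedged usage sketch (assumption, not part of the original file) ---
# `train(local_rank, args)` computes rank = args.nr * args.gpus + local_rank,
# which matches the standard one-process-per-GPU launch via mp.spawn. A minimal
# launcher, assuming the argument parser provides args.gpus, args.nr,
# args.world_size, and args.init_method as used above:
import torch.multiprocessing as mp

def launch_distributed_training(args):
    # Spawn one training process per local GPU; each process receives its
    # local_rank as the first positional argument.
    mp.spawn(train, nprocs=args.gpus, args=(args,))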
def train(args):
    setup_seed(args.seed)

    saved_model_dir, _ = os.path.split(args.checkpoint)
    if not os.path.isdir(saved_model_dir):
        os.makedirs(saved_model_dir)

    device = args.device
    torch.cuda.set_device(device)

    src_data, src_vocab = load_corpus_data(args.src_path, args.src_language, args.start_token,
                                           args.end_token, args.mask_token, args.src_vocab_path,
                                           args.rebuild_vocab, args.unk, args.threshold)
    tgt_data, tgt_vocab = load_corpus_data(args.tgt_path, args.tgt_language, args.start_token,
                                           args.end_token, args.mask_token, args.tgt_vocab_path,
                                           args.rebuild_vocab, args.unk, args.threshold)

    args.src_vocab_size = len(src_vocab)
    args.tgt_vocab_size = len(tgt_vocab)

    logging.info("Source language vocab size: {}".format(len(src_vocab)))
    logging.info("Target language vocab size: {}".format(len(tgt_vocab)))

    assert len(src_data) == len(tgt_data)

    if args.sort_sentence_by_length:
        src_data, tgt_data = sort_src_sentence_by_length(list(zip(src_data, tgt_data)))

    logging.info("Transformer")

    max_src_len = max(len(line) for line in src_data)
    max_tgt_len = max(len(line) for line in tgt_data)
    args.max_src_len = max_src_len
    args.max_tgt_len = max_tgt_len

    padding_value = src_vocab.get_index(args.mask_token)
    assert padding_value == tgt_vocab.get_index(args.mask_token)
    args.padding_value = padding_value

    if args.load:
        logging.info("Load existing model from {}".format(args.load))
        s2s, optimizer_state_dict = load_transformer(args, training=True, device=device)
        optimizer = get_optimizer(s2s.parameters(), args)
        optimizer.load_state_dict(optimizer_state_dict)
    else:
        logging.info("New model")
        s2s = build_transformer(args, device)
        s2s.init_parameters()
        optimizer = get_optimizer(s2s.parameters(), args)

    s2s.train()

    if args.label_smoothing:
        criterion = LabelSmoothingLoss(args.label_smoothing, padding_value)
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=padding_value)

    train_data = NMTDataset(src_data, tgt_data)

    # Release CPU memory held by the raw token lists.
    del src_data
    del tgt_data

    train_loader = DataLoader(train_data, args.batch_size, shuffle=True, pin_memory=True,
                              collate_fn=lambda batch: collate(batch, padding_value,
                                                               batch_first=True))

    for i in range(args.start_epoch, args.end_epoch):
        epoch_loss = 0.0
        start_time = time.time()
        steps = 0

        for j, (input_batch, target_batch) in enumerate(train_loader):
            # train_batch handles gradient accumulation internally via
            # the batch index j and args.update_freq.
            batch_loss = s2s.train_batch(input_batch.to(device, non_blocking=True),
                                         target_batch.to(device, non_blocking=True),
                                         criterion, optimizer, j, args.update_freq)
            epoch_loss += batch_loss
            steps += 1

        # Apply any gradients still accumulated after the last full update_freq
        # cycle (the original (steps + 1) % update_freq check misfires by one batch).
        if steps % args.update_freq != 0:
            optimizer.step()
            optimizer.zero_grad()

        epoch_loss /= steps
        epoch_ppl = math.exp(epoch_loss)

        torch.save(save_transformer(s2s, optimizer, args),
                   "{}_{}_{}".format(args.checkpoint, i, steps))
        logging.info("Epoch: {}, time: {} seconds, loss: {}, perplexity: {}".format(
            i, time.time() - start_time, epoch_loss, epoch_ppl))
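
# --- Hedged example (assumption; the repo ships its own collate) ---
# The DataLoaders in these scripts rely on a collate(batch, padding_value,
# batch_first) helper. A minimal sketch that pads a batch of (src, tgt) token-id
# sequences to uniform length with torch.nn.utils.rnn.pad_sequence:
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_sketch(batch, padding_value, batch_first=False):
    """Pad source and target sequences of a batch to equal length."""
    src_seqs = [torch.as_tensor(src, dtype=torch.long) for src, _ in batch]
    tgt_seqs = [torch.as_tensor(tgt, dtype=torch.long) for _, tgt in batch]
    src_padded = pad_sequence(src_seqs, batch_first=batch_first, padding_value=padding_value)
    tgt_padded = pad_sequence(tgt_seqs, batch_first=batch_first, padding_value=padding_value)
    return src_padded, tgt_padded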
def train(args):
    setup_seed(args.seed)

    device = torch.device(args.device)
    torch.cuda.set_device(device)

    src_data, src_vocab = load_corpus_data(args.src_path, args.src_language, args.start_token,
                                           args.end_token, args.mask_token, args.src_vocab_path,
                                           args.rebuild_vocab, args.unk, args.threshold)
    tgt_data, tgt_vocab = load_corpus_data(args.tgt_path, args.tgt_language, args.start_token,
                                           args.end_token, args.mask_token, args.tgt_vocab_path,
                                           args.rebuild_vocab, args.unk, args.threshold)

    logging.info("Source language vocab size: {}".format(len(src_vocab)))
    logging.info("Target language vocab size: {}".format(len(tgt_vocab)))

    assert len(src_data) == len(tgt_data)

    if args.sort_sentence_by_length:
        src_data, tgt_data = sort_src_sentence_by_length(list(zip(src_data, tgt_data)))

    if args.load:
        logging.info("Load existing model from {}".format(args.load))
        s2s, optimizer_state_dict = load_model(args.load, training=True, device=device)
        optimizer = torch.optim.Adam(s2s.parameters(), args.learning_rate)
        optimizer.load_state_dict(optimizer_state_dict)
    else:
        # The encoder output size doubles when the encoder RNN is bidirectional.
        encoder_output_size = 2 * args.hidden_size if args.bidirectional else args.hidden_size
        if args.attention_size:
            logging.info("Attention Model")
            encoder = S2S_attention.Encoder(args.rnn_type, len(src_vocab), args.embedding_size,
                                            args.hidden_size, args.num_layers, args.dropout,
                                            args.bidirectional)
            attention = S2S_attention.BahdanauAttention(encoder_output_size,
                                                        args.num_layers * encoder_output_size,
                                                        args.attention_size)
            decoder = S2S_attention.AttentionDecoder(args.rnn_type, len(tgt_vocab),
                                                     args.embedding_size,
                                                     args.embedding_size + encoder_output_size,
                                                     encoder_output_size, args.num_layers,
                                                     attention, args.dropout)
            s2s = S2S_attention.S2S(encoder, decoder).to(device)
        else:
            logging.info("Basic Model")
            encoder = S2S_basic.Encoder(args.rnn_type, len(src_vocab), args.embedding_size,
                                        args.hidden_size, args.num_layers, args.dropout,
                                        args.bidirectional)
            decoder = S2S_basic.Decoder(args.rnn_type, len(tgt_vocab), args.embedding_size,
                                        encoder_output_size, args.num_layers, args.dropout)
            s2s = S2S_basic.S2S(encoder, decoder).to(device)
        optimizer = torch.optim.Adam(s2s.parameters(), args.learning_rate)

    s2s.train()

    padding_value = src_vocab.get_index(args.mask_token)
    assert padding_value == tgt_vocab.get_index(args.mask_token)

    if args.label_smoothing:
        criterion = LabelSmoothingLoss(args.label_smoothing, padding_value)
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=padding_value)

    train_data = NMTDataset(src_data, tgt_data)

    # Release CPU memory held by the raw token lists.
    del src_data
    del tgt_data

    train_loader = DataLoader(train_data, args.batch_size, shuffle=True, pin_memory=True,
                              collate_fn=lambda batch: collate(batch, padding_value))

    for i in range(args.start_epoch, args.end_epoch):
        epoch_loss = 0.0
        start_time = time.time()
        steps = 0

        for j, (input_batch, target_batch) in enumerate(train_loader):
            batch_loss = s2s.train_batch(input_batch.to(device, non_blocking=True),
                                         target_batch.to(device, non_blocking=True),
                                         criterion, optimizer)
            epoch_loss += batch_loss
            steps += 1

        epoch_loss /= steps

        torch.save(save_model(s2s, optimizer, args),
                   "{}_{}_{}".format(args.checkpoint, i, steps))
        logging.info("Epoch: {}, time: {} seconds, loss: {}".format(
            i, time.time() - start_time, epoch_loss))
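
# --- Hedged example (assumption; the repo defines its own LabelSmoothingLoss) ---
# All the training loops above optionally use LabelSmoothingLoss(smoothing,
# padding_value). A common implementation spreads `smoothing` probability mass
# over the non-target, non-padding classes and ignores padding positions; the
# (n_classes - 2) denominator follows the usual "exclude gold token and pad"
# convention and is an assumption here:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LabelSmoothingLossSketch(nn.Module):
    def __init__(self, smoothing, ignore_index):
        super().__init__()
        self.smoothing = smoothing
        self.ignore_index = ignore_index

    def forward(self, logits, target):
        # logits: (N, vocab_size), target: (N,)
        log_probs = F.log_softmax(logits, dim=-1)
        n_classes = logits.size(-1)
        with torch.no_grad():
            # Uniform mass over all classes except the gold token and padding.
            smooth = torch.full_like(log_probs, self.smoothing / (n_classes - 2))
            smooth.scatter_(1, target.unsqueeze(1), 1.0 - self.smoothing)
            smooth[:, self.ignore_index] = 0.0
            pad_mask = target.eq(self.ignore_index)
        loss = -(smooth * log_probs).sum(dim=1).masked_fill(pad_mask, 0.0)
        # Average over non-padding tokens only.
        return loss.sum() / (~pad_mask).sum().clamp(min=1)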
def train(local_rank, args):
    setup_seed(args.seed)

    # Global rank of this process: node index * GPUs per node + local GPU index.
    rank = args.nr * args.gpus + local_rank

    src_data, src_vocab = load_corpus_data(args.src_path, args.src_language, args.start_token,
                                           args.end_token, args.mask_token, args.src_vocab_path,
                                           args.rebuild_vocab, args.unk, args.threshold)
    tgt_data, tgt_vocab = load_corpus_data(args.tgt_path, args.tgt_language, args.start_token,
                                           args.end_token, args.mask_token, args.tgt_vocab_path,
                                           args.rebuild_vocab, args.unk, args.threshold)

    assert len(src_data) == len(tgt_data)

    if args.sort_sentence_by_length:
        src_data, tgt_data = sort_src_sentence_by_length(list(zip(src_data, tgt_data)))

    logging.info("Source language vocab size: {}".format(len(src_vocab)))
    logging.info("Target language vocab size: {}".format(len(tgt_vocab)))

    torch.distributed.init_process_group(backend="nccl", init_method=args.init_method,
                                         rank=rank, world_size=args.world_size)

    device = torch.device("cuda", local_rank)
    torch.cuda.set_device(device)

    if args.load:
        logging.info("Load existing model from {}".format(args.load))
        s2s, optimizer_state_dict = load_model(args.load, training=True, device=device)
        s2s = nn.parallel.DistributedDataParallel(s2s, device_ids=[local_rank])
        optimizer = torch.optim.Adam(s2s.parameters(), args.learning_rate)
        optimizer.load_state_dict(optimizer_state_dict)
    else:
        # The encoder output size doubles when the encoder RNN is bidirectional.
        encoder_output_size = 2 * args.hidden_size if args.bidirectional else args.hidden_size
        if args.attention_size:
            logging.info("Attention Model")
            encoder = S2S_attention.Encoder(args.rnn_type, len(src_vocab), args.embedding_size,
                                            args.hidden_size, args.num_layers, args.dropout,
                                            args.bidirectional)
            attention = S2S_attention.BahdanauAttention(encoder_output_size,
                                                        args.num_layers * encoder_output_size,
                                                        args.attention_size)
            decoder = S2S_attention.AttentionDecoder(args.rnn_type, len(tgt_vocab),
                                                     args.embedding_size,
                                                     args.embedding_size + encoder_output_size,
                                                     encoder_output_size, args.num_layers,
                                                     attention, args.dropout)
            s2s = S2S_attention.S2S(encoder, decoder).to(device)
        else:
            logging.info("Basic Model")
            encoder = S2S_basic.Encoder(args.rnn_type, len(src_vocab), args.embedding_size,
                                        args.hidden_size, args.num_layers, args.dropout,
                                        args.bidirectional)
            decoder = S2S_basic.Decoder(args.rnn_type, len(tgt_vocab), args.embedding_size,
                                        encoder_output_size, args.num_layers, args.dropout)
            s2s = S2S_basic.S2S(encoder, decoder).to(device)
        s2s = nn.parallel.DistributedDataParallel(s2s, device_ids=[local_rank])
        optimizer = torch.optim.Adam(s2s.parameters(), args.learning_rate)

    s2s.train()
    logging.info("Multi Gpu training: {}".format(local_rank))

    padding_value = src_vocab.get_index(args.mask_token)
    assert padding_value == tgt_vocab.get_index(args.mask_token)

    if args.label_smoothing:
        criterion = LabelSmoothingLoss(args.label_smoothing, padding_value)
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=padding_value)

    train_data = NMTDataset(src_data, tgt_data)

    # Release CPU memory held by the raw token lists.
    del src_data
    del tgt_data

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_data, num_replicas=args.world_size, rank=rank)
    train_loader = DataLoader(train_data, args.batch_size, shuffle=False, sampler=train_sampler,
                              pin_memory=True,
                              collate_fn=lambda batch: collate(batch, padding_value),
                              drop_last=True)

    for i in range(args.start_epoch, args.end_epoch):
        # Re-seed the sampler so each epoch sees a different shard ordering.
        train_sampler.set_epoch(i)

        epoch_loss = 0.0
        start_time = time.time()
        steps = 0

        for j, (input_batch, target_batch) in enumerate(train_loader):
            input_batch = input_batch.to(device, non_blocking=True)
            target_batch = target_batch.to(device, non_blocking=True)

            output = s2s(input_batch, target_batch)
            del input_batch

            # output: (input_length - 1, batch_size, vocab_size)
            output = torch.stack(output, dim=0)
            # output: ((input_length - 1) * batch_size, vocab_size)
            output = output.view(-1, output.size(-1))
            # target_batch[1:]: (input_length - 1, batch_size), flattened to match.
            target_batch = target_batch[1:].contiguous().view(-1)

            batch_loss = criterion(output, target_batch)
            del output
            del target_batch

            optimizer.zero_grad()
            # Synchronize all processes; DDP overlaps the gradient all-reduce
            # with the backward computation.
            dist.barrier()
            batch_loss.backward()
            optimizer.step()

            batch_loss = batch_loss.item()
            epoch_loss += batch_loss
            steps += 1

        epoch_loss /= steps

        if local_rank == 0:
            torch.save(save_model(s2s, optimizer, args),
                       "{}_{}_{}".format(args.checkpoint, i, steps))

        logging.info("Epoch: {}, time: {} seconds, loss: {}, local rank: {}".format(
            i, time.time() - start_time, epoch_loss, local_rank))

    torch.save(save_model(s2s, optimizer, args),
               args.checkpoint + "_rank{}".format(local_rank))
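
# --- Hedged example (assumption; the repo defines its own setup_seed) ---
# Every entry point above starts with setup_seed(seed) (the CIFAR script also
# passes a CUDA flag). A typical implementation pins all RNGs used by the
# training code:
import random
import numpy as np
import torch

def setup_seed_sketch(seed, cuda=True):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda and torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # Trade speed for reproducibility on cuDNN kernels.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False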
def main(seed, args):
    '''main function'''
    setup_seed(seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)

    mode = args.mode
    dataset_name = args.dataset_name
    labelIdxStart0or1 = args.labelIdxStart0or1
    root_dir = args.root_dir
    graph_datadir = args.graph_datadir
    all_visualFea_label_file = args.all_visualFea_label_file
    auxiliary_file = args.auxiliary_file
    batch_size = args.batch_size
    weight_decay = args.weight_decay
    use_pca = args.use_pca
    reduced_dim_pca = args.reduced_dim_pca

    zsl_dataset = ZSL_Dataset(root_dir, dataset_name, mode, all_visualFea_label_file,
                              auxiliary_file, use_pca, reduced_dim_pca)
    zsl_dataloader = data.DataLoader(zsl_dataset, batch_size=batch_size, shuffle=True,
                                     num_workers=4)
    print('data is ready!')

    vi_fea_dim = zsl_dataset.vis_fea_dim
    se_fea_dim = zsl_dataset.sem_fea_dim
    n_tr_class = zsl_dataset.n_tr_class

    te_data_unseen, te_data_seen = zsl_dataset.get_testData()
    (te_vis_fea_unseen, te_sem_fea_unseen, te_label_unseen, te_labelID_unseen,
     te_sem_fea_pro_unseen) = te_data_unseen
    (te_vis_fea_seen, te_sem_fea_seen, te_label_seen, te_labelID_seen,
     te_sem_fea_pro_seen) = te_data_seen
    tr_vis_fea, tr_sem_fea, all_tr_label, tr_labelID, tr_sem_fea_pro = zsl_dataset.get_trainData()

    # for debugging
    all_labels = zsl_dataset.all_labels

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    save_subdir = dataset_name
    save_dir = os.path.join(args.save_dir, save_subdir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    adj_matrix = load_graph(dataset_name, graph_datadir)
    weight_threshold = args.weight_threshold
    adj_lists = adjMatrix2adjLists(adj_matrix, weight_threshold)

    vi_fea_dim = zsl_dataset.vis_fea_dim
    se_fea_dim = zsl_dataset.sem_fea_dim

    use_z = args.use_z.lower()
    z_dim = args.z_dim
    if use_z == 'true':
        netG = _netG(se_fea_dim, vi_fea_dim, z_dim).to(device)
    else:
        netG = _netG2(se_fea_dim, vi_fea_dim).to(device)

    gan_checkpoint_dir = args.gan_checkpoint_dir
    gan_checkpoint_name = args.gan_checkpoint
    sem_fea_pro = zsl_dataset.all_prototype_semantic_feature
    unseen_classes = te_labelID_unseen
    n_gene_perC = args.n_gene_perC

    # Use the pretrained generator to synthesize visual features for unseen classes.
    generated_vis_fea_dict = get_fake_unseen_visual_feat(netG, dataset_name, gan_checkpoint_dir,
                                                         gan_checkpoint_name, use_z, z_dim,
                                                         sem_fea_pro, unseen_classes,
                                                         n_gene_perC=n_gene_perC)
    # Append the synthesized unseen-class features to the training set.
    for k in generated_vis_fea_dict.keys():
        gen_vis_fea_list = generated_vis_fea_dict[k]
        gen_vis_fea = np.vstack(gen_vis_fea_list)
        n_fake_instances = len(gen_vis_fea_list)
        assert gen_vis_fea.shape == (n_fake_instances, vi_fea_dim)
        tr_vis_fea = np.vstack((tr_vis_fea, gen_vis_fea))
        gen_labels = np.array([k for _ in range(n_fake_instances)])
        all_tr_label = np.hstack((all_tr_label, gen_labels))
    assert len(tr_vis_fea) == len(all_tr_label)

    print('building dicts...')
    instanceIdx2classIdx = dict()
    classIdx2instanceIdx = defaultdict(set)
    for instanceIdx, classIdx in enumerate(all_labels):
        instanceIdx2classIdx[instanceIdx] = classIdx
        classIdx2instanceIdx[classIdx].add(instanceIdx)

    instanceIdx2classIdx_zsl_train = dict()
    classIdx2instanceIdx_zsl_train = defaultdict(set)
    for instanceIdx, classIdx in enumerate(all_tr_label):
        instanceIdx2classIdx_zsl_train[instanceIdx] = classIdx
        classIdx2instanceIdx_zsl_train[classIdx].add(instanceIdx)

    instanceIdx2classIdx_zsl_test_seen = dict()
    classIdx2instanceIdx_zsl_test_seen = defaultdict(set)
    for instanceIdx, classIdx in enumerate(te_label_seen):
        instanceIdx2classIdx_zsl_test_seen[instanceIdx] = classIdx
        classIdx2instanceIdx_zsl_test_seen[classIdx].add(instanceIdx)

    instanceIdx2classIdx_zsl_test_unseen = dict()
    classIdx2instanceIdx_zsl_test_unseen = defaultdict(set)
    for instanceIdx, classIdx in enumerate(te_label_unseen):
        instanceIdx2classIdx_zsl_test_unseen[instanceIdx] = classIdx
        classIdx2instanceIdx_zsl_test_unseen[classIdx].add(instanceIdx)
    print('build done!')

    # Use visual features as the initial node input.
    firstHop_featureFunc = zsl_dataset.get_firstHop_featureFunc_visual_zsl_train()
    agg1 = MeanAggregator(firstHop_featureFunc).to(device)
    enc1 = Encoder(firstHop_featureFunc, vi_fea_dim, 128, adj_lists, agg1,
                   instanceIdx2classIdx_zsl_train, classIdx2instanceIdx_zsl_train,
                   instanceIdx2classIdx_zsl_test_seen, classIdx2instanceIdx_zsl_test_seen,
                   instanceIdx2classIdx_zsl_test_unseen, classIdx2instanceIdx_zsl_test_unseen,
                   classIdx2instanceIdx, generated_vis_fea_dict, mode='train',
                   seen_labelID_set=tr_labelID, gcn_style=True).to(device)
    agg2 = MeanAggregator(lambda nodes: enc1(nodes).t()).to(device)
    enc2 = Encoder(lambda nodes: enc1(nodes).t(), enc1.embed_dim, 128, adj_lists, agg2,
                   instanceIdx2classIdx_zsl_train, classIdx2instanceIdx_zsl_train,
                   instanceIdx2classIdx_zsl_test_seen, classIdx2instanceIdx_zsl_test_seen,
                   instanceIdx2classIdx_zsl_test_unseen, classIdx2instanceIdx_zsl_test_unseen,
                   classIdx2instanceIdx, generated_vis_fea_dict, mode='train',
                   seen_labelID_set=tr_labelID, base_model=enc1, gcn_style=True).to(device)
    enc1.num_samples = 10
    enc2.num_samples = 10
    nets = [agg1, enc1, agg2, enc2]

    n_classes = zsl_dataset.n_classes
    graphsage = SupervisedGraphSage(n_classes, enc1, enc1.embed_dim).to(device)
    nets_weights_init([graphsage])

    lr = args.lr
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, graphsage.parameters()), lr=lr)
    lr_maker = lr_scheduler.StepLR(optimizer=optimizer, step_size=1000, gamma=0.9)

    print('start training...')
    best_acc_test_unseen = 0
    best_acc_test_seen = 0
    print_every = args.print_every
    eval_every = args.eval_every

    tr_rand_indices = np.random.permutation(len(all_tr_label))
    te_seen_indices = np.array(list(range(len(te_label_seen))))
    te_unseen_indices = np.array(list(range(len(te_label_unseen))))
    eps = np.finfo(float).eps

    for batch in range(args.n_iteration):
        graphsage.train()
        lr_maker.step()

        batch_nodes = tr_rand_indices[:batch_size]
        random.shuffle(tr_rand_indices)

        # "cg" for "class-level graph"
        cg_loss = graphsage.loss(batch_nodes,
                                 torch.LongTensor(all_tr_label[np.array(batch_nodes)]).to(device))

        batch_labels = all_tr_label[np.array(batch_nodes)]
        _, batch_embeddings, batch_sigma = graphsage(batch_nodes)
        batch_embeddings = torch.t(batch_embeddings)

        N = batch_embeddings.size(0)
        emb_dim = batch_embeddings.size(-1)

        # Split the batch into a labeled support set and a query set for
        # label propagation.
        support_ratio = 0.5
        n_support = int(N * support_ratio)
        n_query = N - n_support
        s_labels = torch.from_numpy(batch_labels[:n_support]).long()
        q_labels = torch.from_numpy(batch_labels[n_support:]).long()

        total_n_classes = zsl_dataset.total_n_classes
        s_labels_onehot = torch.zeros(n_support, total_n_classes).scatter_(
            1, s_labels.view(-1, 1), 1).to(device)
        q_labels_onehot = torch.zeros(n_query, total_n_classes).scatter_(
            1, q_labels.view(-1, 1), 1).to(device)

        F, Fq = LP(batch_embeddings, batch_sigma, args.top_k, s_labels_onehot, total_n_classes)

        ce = nn.CrossEntropyLoss().to(device)
        gt = torch.argmax(torch.cat((s_labels_onehot, q_labels_onehot), 0), 1)
        # dual LP: combine the loss of both support and query
        meta_loss = ce(F, gt)

        optimizer.zero_grad()
        total_loss = cg_loss + args.lambda_lploss * meta_loss
        total_loss.backward()
        optimizer.step()

        # calculate acc
        predq = torch.argmax(Fq, 1)
        gtq = torch.argmax(q_labels_onehot, 1)
        correct = (predq == gtq).sum()
        total = n_query
        acc = 1.0 * correct.float() / float(total)

        if batch % print_every == 0 and batch > 0:
            print('iter: {:4d}/{} cg_loss: {:.6f} meta_loss: {:.6f} acc: {:.6f} lr: {:.8f}'.format(
                batch, args.n_iteration, cg_loss, meta_loss, acc,
                optimizer.param_groups[0]['lr']))

        if batch % eval_every == 0 and batch > 0:
            graphsage.eval()

            ## test_seen
            graphsage.encoder.mode = 'test_seen'
            enc1.mode = 'test_seen'
            agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_test_seen()
            te_seen_output, test_seen_embeddings, _ = graphsage.forward(te_seen_indices)
            te_seen_output = te_seen_output.cpu()
            te_seen_acc = cls_acc(te_seen_output.data.numpy(), te_label_seen)
            cprint('te_seen_acc: {:.6f}'.format(te_seen_acc), 'yellow')
            if te_seen_acc > best_acc_test_seen:
                best_acc_test_seen = te_seen_acc

            ## test_unseen
            graphsage.encoder.mode = 'test_unseen'
            enc1.mode = 'test_unseen'
            agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_train()
            te_unseen_output, test_unseen_embeddings, _ = graphsage.forward(te_unseen_indices)
            te_unseen_output = te_unseen_output.cpu()
            te_unseen_acc = cls_acc(te_unseen_output.data.numpy(), te_label_unseen)
            cprint('te_unseen_acc: {:.6f}'.format(te_unseen_acc), 'yellow')
            if te_unseen_acc > best_acc_test_unseen:
                best_acc_test_unseen = te_unseen_acc
                save_dict = {
                    'iteration': (batch + 1),
                    'state_dict': graphsage.state_dict(),
                    'acc_unseen': best_acc_test_unseen,
                    'acc_seen': te_seen_acc,
                }
                checkpoint_name = 'checkpoint_' + dataset_name + '_iter' + str(batch + 1) \
                    + '_accUnseen%.4lf_accSeen%.4lf.pkl' % (best_acc_test_unseen, te_seen_acc)
                checkpoint_path = os.path.join(save_dir, checkpoint_name)
                cprint('saving ' + checkpoint_name + ' in ' + save_dir + '...', 'green')
                torch.save(save_dict, checkpoint_path)

            if args.use_LP_eval == 'true':
                ## support
                graphsage.encoder.mode = 'train'
                enc1.mode = 'train'
                agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_train()
                expanded_tr_labels = all_tr_label
                n_expanded_tr = expanded_tr_labels.shape[0]
                expanded_tr_labels = torch.from_numpy(expanded_tr_labels)
                expanded_tr_nodes = np.array(list(range(len(expanded_tr_labels))))
                _, expanded_tr_embeddings, expanded_tr_sigma = graphsage(expanded_tr_nodes)
                expanded_tr_embeddings = torch.t(expanded_tr_embeddings)

                # The more support instances the better, as far as GPU memory allows
                # (upper bound: the number of expanded training instances).
                n_support = batch_size * 4
                eval_batch_size = int(batch_size / 2)
                n_query = eval_batch_size
                shuffled_expanded_tr_idxs = np.random.permutation(n_expanded_tr)
                shuffled_idxs = shuffled_expanded_tr_idxs[:n_support]
                eval_s_labels = expanded_tr_labels[shuffled_idxs]
                eval_s_embeds = expanded_tr_embeddings[shuffled_idxs]
                s_labels_onehot = torch.zeros(n_support, total_n_classes).scatter_(
                    1, eval_s_labels.view(-1, 1), 1).to(device)

                ## test seen
                graphsage.encoder.mode = 'test_seen'
                enc1.mode = 'test_seen'
                agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_test_seen()
                n_te_seen = len(te_label_seen)
                eval_times__te_seen = n_te_seen // n_query
                te_seen_correct = 0
                te_seen_total = 0
                test_seen_embeddings = torch.t(test_seen_embeddings)
                for k in range(eval_times__te_seen):
                    te_seen_q_embeds = test_seen_embeddings[(k * n_query):((k + 1) * n_query)]
                    te_seen_q_labels = te_label_seen[(k * n_query):((k + 1) * n_query)]
                    _embeds = torch.cat((eval_s_embeds, te_seen_q_embeds), 0)
                    _sigma = graphsage.cal_sigma(_embeds)
                    F, Fq = LP(_embeds, _sigma, args.top_k, s_labels_onehot, total_n_classes)
                    te_seen_q_labels = torch.from_numpy(te_seen_q_labels).long()
                    te_seen_q_labels_onehot = torch.zeros(n_query, total_n_classes).scatter_(
                        1, te_seen_q_labels.view(-1, 1), 1).to(device)
                    # calculate acc
                    predq = torch.argmax(Fq, 1)
                    gtq = torch.argmax(te_seen_q_labels_onehot, 1)
                    te_seen_correct += (predq == gtq).sum()
                    te_seen_total += n_query
                te_seen_acc_LP = 1.0 * te_seen_correct.float() / float(te_seen_total)
                cprint('[LP] te_seen_acc: {:.6f}'.format(te_seen_acc_LP), 'yellow')

                ## test unseen
                graphsage.encoder.mode = 'test_unseen'
                enc1.mode = 'test_unseen'
                agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_train()
                n_te_unseen = len(te_label_unseen)
                eval_times__te_unseen = n_te_unseen // n_query
                te_unseen_correct = 0
                te_unseen_total = 0
                test_unseen_embeddings = torch.t(test_unseen_embeddings)
                for k in range(eval_times__te_unseen):
                    te_unseen_q_embeds = test_unseen_embeddings[(k * n_query):((k + 1) * n_query)]
                    te_unseen_q_labels = te_label_unseen[(k * n_query):((k + 1) * n_query)]
                    _embeds = torch.cat((eval_s_embeds, te_unseen_q_embeds), 0)
                    _sigma = graphsage.cal_sigma(_embeds)
                    F, Fq = LP(_embeds, _sigma, args.top_k, s_labels_onehot, total_n_classes)
                    te_unseen_q_labels = torch.from_numpy(te_unseen_q_labels).long()
                    te_unseen_q_labels_onehot = torch.zeros(n_query, total_n_classes).scatter_(
                        1, te_unseen_q_labels.view(-1, 1), 1).to(device)
                    # calculate acc
                    predq = torch.argmax(Fq, 1)
                    gtq = torch.argmax(te_unseen_q_labels_onehot, 1)
                    te_unseen_correct += (predq == gtq).sum()
                    te_unseen_total += n_query
                te_unseen_acc_LP = 1.0 * te_unseen_correct.float() / float(te_unseen_total)
                cprint('[LP] te_unseen_acc: {:.6f}'.format(te_unseen_acc_LP), 'red')

            # recover to the training mode
            graphsage.encoder.mode = 'train'
            enc1.mode = 'train'
            agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_train()

    print('Final:' + '%' * 50)
    graphsage.encoder.mode = 'test_seen'
    enc1.mode = 'test_seen'
    agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_test_seen()
    te_seen_output, embeddings, _ = graphsage.forward(te_seen_indices)
    te_seen_output = te_seen_output.cpu()
    te_seen_acc = cls_acc(te_seen_output.data.numpy(), te_label_seen)
    print('test_seen_acc: {:.6f}'.format(te_seen_acc))
    if te_seen_acc > best_acc_test_seen:
        best_acc_test_seen = te_seen_acc
    print('best acc of test_seen data: {:.6f}'.format(best_acc_test_seen))

    graphsage.encoder.mode = 'test_unseen'
    enc1.mode = 'test_unseen'
    agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_train()
    te_unseen_output, embeddings, _ = graphsage.forward(te_unseen_indices)
    te_unseen_output = te_unseen_output.cpu()
    te_unseen_acc = cls_acc(te_unseen_output.data.numpy(), te_label_unseen)
    print('test_UNseen_acc: {:.6f}'.format(te_unseen_acc))
    if te_unseen_acc > best_acc_test_unseen:
        best_acc_test_unseen = te_unseen_acc
    print('best acc of test_UNseen data: {:.6f}'.format(best_acc_test_unseen))
    print('%' * 56)
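
# --- Hedged example (assumption; the repo provides its own LP) ---
# The training and evaluation loops above call
# F, Fq = LP(embeddings, sigma, top_k, s_labels_onehot, n_classes), where the
# first n_support rows of `embeddings` are the support set. A sketch of
# transductive label propagation in the TPN style (closed-form
# F = (I - alpha*S)^-1 Y over a top-k Gaussian affinity graph); the alpha value,
# the per-example interpretation of `sigma`, and the normalization details are
# assumptions:
import torch

def LP_sketch(embeddings, sigma, top_k, s_labels_onehot, n_classes, alpha=0.99):
    n = embeddings.size(0)
    n_support = s_labels_onehot.size(0)
    device = embeddings.device

    # Length-scaled pairwise squared distances -> Gaussian affinities
    # (assumes one sigma per example, broadcast over feature dims).
    scaled = embeddings / (sigma.view(n, -1) + 1e-8)
    W = torch.exp(-torch.cdist(scaled, scaled) ** 2 / 2)

    # Keep only the top-k neighbors per node, symmetrize, zero the diagonal.
    _, topk_idx = torch.topk(W, top_k, dim=-1)
    mask = torch.zeros_like(W).scatter_(1, topk_idx, 1)
    W = W * mask
    W = (W + W.t()) / 2
    W = W * (1 - torch.eye(n, device=device))

    # Symmetric normalization S = D^-1/2 W D^-1/2.
    d_inv_sqrt = torch.diag(W.sum(dim=1).clamp(min=1e-8).rsqrt())
    S = d_inv_sqrt @ W @ d_inv_sqrt

    # Labels: one-hot rows for the support set, zeros for the query set.
    Y = torch.zeros(n, n_classes, device=device)
    Y[:n_support] = s_labels_onehot

    # Closed-form propagation; returns scores for all nodes and query nodes.
    F_all = torch.linalg.solve(torch.eye(n, device=device) - alpha * S, Y)
    return F_all, F_all[n_support:]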