Example #1
# Fragment: the enclosing function signature below is a reconstruction
# (hypothetical) so that the snippet is self-contained.
def run_global_round(epoch, clients, server, client_w):
    for index, client in enumerate(clients):
        print(f"| Global Round: {epoch} | client index: {index} |")
        client.train(client_w)
        client_w = client.get_weight()
    return client_w, server.get_weight()
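
# The loop above assumes client objects exposing train() and get_weight().
# A minimal, hypothetical sketch of that interface (the real Client class is
# not shown in this example):
class Client:
    def __init__(self, model, data_loader):
        self.model = model
        self.data_loader = data_loader

    def train(self, weights):
        # load the weights handed over from the previous client, then
        # run local training on this client's data (omitted here)
        self.model.load_state_dict(weights)

    def get_weight(self):
        return self.model.state_dict()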


if __name__ == "__main__":
    args = parse_args()
    data_name = 'cifar100' if args.cifar100 else 'cifar10'
    num_classes = 100 if args.cifar100 else 10
    TAG = f'multi-mixsl-mixsum{args.mix_num}-{data_name}-{args.name}'
    print(f'{TAG}: training start....')
    setup_seed(args.seed, args.gpu > -1)
    logs = []
    if args.cifar100:
        train_dataset, test_dataset = get_cifar100(args.balanced)
    else:
        train_dataset, test_dataset = get_cifar10(args.balanced)
    user_groups = random_avg_strategy(train_dataset, args.num_users)
    cls_num_per_clients = count_class_num_per_client(train_dataset,
                                                     user_groups, 100)
    logs_file = TAG

    client_part = ResNet18_Extractor()
    server_part = ResNet18_Classifer(num_classes)
    # Alternative split models:
    # client_part = ResNet34_Extractor()
    # server_part = ResNet34_Classifer(num_classes)
    # client_part, server_part = get_split_vgg16(num_classes)

Example #2
def train(local_rank, args):

    setup_seed(args.seed)

    rank = args.nr * args.gpus + local_rank

    saved_model_dir, _ = os.path.split(args.checkpoint)

    if not os.path.isdir(saved_model_dir):
        os.makedirs(saved_model_dir)

    src_data, src_vocab = load_corpus_data(args.src_path, args.src_language,
                                           args.start_token, args.end_token,
                                           args.mask_token,
                                           args.src_vocab_path,
                                           args.rebuild_vocab, args.unk,
                                           args.threshold)

    tgt_data, tgt_vocab = load_corpus_data(args.tgt_path, args.tgt_language,
                                           args.start_token, args.end_token,
                                           args.mask_token,
                                           args.tgt_vocab_path,
                                           args.rebuild_vocab, args.unk,
                                           args.threshold)

    args.src_vocab_size = len(src_vocab)
    args.tgt_vocab_size = len(tgt_vocab)

    logging.info("Source language vocab size: {}".format(len(src_vocab)))
    logging.info("Target language vocab size: {}".format(len(tgt_vocab)))

    assert len(src_data) == len(tgt_data)

    if args.sort_sentence_by_length:
        src_data, tgt_data = sort_src_sentence_by_length(
            list(zip(src_data, tgt_data)))

    logging.info("Transformer")

    max_src_len = max(len(line) for line in src_data)
    max_tgt_len = max(len(line) for line in tgt_data)

    args.max_src_len = max_src_len
    args.max_tgt_len = max_tgt_len

    padding_value = src_vocab.get_index(args.mask_token)

    assert padding_value == tgt_vocab.get_index(args.mask_token)
    args.padding_value = padding_value

    logging.info("Multi GPU training")

    dist.init_process_group(backend="nccl",
                            init_method=args.init_method,
                            rank=rank,
                            world_size=args.world_size)

    device = torch.device("cuda", local_rank)

    torch.cuda.set_device(device)

    if args.load:

        logging.info("Load existing model from {}".format(args.load))
        s2s, optimizer_state_dict = load_transformer(args,
                                                     training=True,
                                                     device=device)
        s2s = nn.parallel.DistributedDataParallel(s2s, device_ids=[local_rank])
        optimizer = get_optimizer(s2s.parameters(), args)
        optimizer.load_state_dict(optimizer_state_dict)

    else:
        logging.info("New model")
        s2s = build_transformer(args, device)
        s2s.init_parameters()
        s2s = nn.parallel.DistributedDataParallel(s2s, device_ids=[local_rank])
        optimizer = get_optimizer(s2s.parameters(), args)

    s2s.train()

    if args.label_smoothing:
        logging.info("Label Smoothing!")
        criterion = LabelSmoothingLoss(args.label_smoothing, padding_value)
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=padding_value)

    train_data = NMTDataset(src_data, tgt_data)

    # release cpu memory
    del src_data
    del tgt_data

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_data, num_replicas=args.world_size, rank=rank)
    train_loader = DataLoader(train_data,
                              args.batch_size,
                              shuffle=False,
                              sampler=train_sampler,
                              drop_last=True,
                              pin_memory=True,
                              collate_fn=lambda batch: collate(
                                  batch, padding_value, batch_first=True))

    for i in range(args.start_epoch, args.end_epoch):

        train_sampler.set_epoch(i)

        epoch_loss = 0.0

        start_time = time.time()

        steps = 0

        for j, (input_batch, target_batch) in enumerate(train_loader):

            # update parameters every args.update_freq batches
            need_update = (j + 1) % args.update_freq == 0

            input_batch = input_batch.to(device, non_blocking=True)
            target_batch = target_batch.to(device, non_blocking=True)

            output = s2s(input_batch, target_batch[:, :-1])
            del input_batch
            output = output.view(-1, output.size(-1))
            target_batch = target_batch[:, 1:].contiguous().view(-1)

            batch_loss = criterion(output, target_batch)
            del target_batch
            del output

            # synchronize all processes
            # Gradient synchronization communications take place during the backward pass and overlap
            # with the backward computation. When the backward() returns, param.grad already contains
            # the synchronized gradient tensor.
            dist.barrier()
            batch_loss.backward()

            if need_update:
                optimizer.step()
                optimizer.zero_grad()

            batch_loss = batch_loss.item()

            epoch_loss += batch_loss

            steps += 1

        # flush gradients left over from an incomplete accumulation window
        if steps % args.update_freq != 0:
            optimizer.step()
            optimizer.zero_grad()

        epoch_loss /= steps

        epoch_ppl = math.exp(epoch_loss)

        logging.info(
            "Epoch: {}, time: {} seconds, loss: {}, perplexity: {}, local rank: {}"
            .format(i,
                    time.time() - start_time, epoch_loss, epoch_ppl,
                    local_rank))
        if local_rank == 0:
            torch.save(save_transformer(s2s, optimizer, args),
                       "{}_{}_{}".format(args.checkpoint, i, steps))

    torch.save(save_transformer(s2s, optimizer, args),
               args.checkpoint + "_rank{}".format(local_rank))
Example #3
def train(args):

    setup_seed(args.seed)

    saved_model_dir, _ = os.path.split(args.checkpoint)

    if not os.path.isdir(saved_model_dir):
        os.makedirs(saved_model_dir)

    device = args.device

    torch.cuda.set_device(device)

    src_data, src_vocab = load_corpus_data(args.src_path, args.src_language,
                                           args.start_token, args.end_token,
                                           args.mask_token,
                                           args.src_vocab_path,
                                           args.rebuild_vocab, args.unk,
                                           args.threshold)

    tgt_data, tgt_vocab = load_corpus_data(args.tgt_path, args.tgt_language,
                                           args.start_token, args.end_token,
                                           args.mask_token,
                                           args.tgt_vocab_path,
                                           args.rebuild_vocab, args.unk,
                                           args.threshold)

    args.src_vocab_size = len(src_vocab)
    args.tgt_vocab_size = len(tgt_vocab)

    logging.info("Source language vocab size: {}".format(len(src_vocab)))
    logging.info("Target language vocab size: {}".format(len(tgt_vocab)))

    assert len(src_data) == len(tgt_data)

    if args.sort_sentence_by_length:
        src_data, tgt_data = sort_src_sentence_by_length(
            list(zip(src_data, tgt_data)))

    logging.info("Transformer")

    max_src_len = max(len(line) for line in src_data)
    max_tgt_len = max(len(line) for line in tgt_data)

    args.max_src_len = max_src_len
    args.max_tgt_len = max_tgt_len

    padding_value = src_vocab.get_index(args.mask_token)

    assert padding_value == tgt_vocab.get_index(args.mask_token)
    args.padding_value = padding_value

    if args.load:

        logging.info("Load existing model from {}".format(args.load))
        s2s, optimizer_state_dict = load_transformer(args,
                                                     training=True,
                                                     device=device)
        optimizer = get_optimizer(s2s.parameters(), args)
        optimizer.load_state_dict(optimizer_state_dict)

    else:
        logging.info("New model")
        s2s = build_transformer(args, device)
        s2s.init_parameters()
        optimizer = get_optimizer(s2s.parameters(), args)

    s2s.train()

    if args.label_smoothing:
        criterion = LabelSmoothingLoss(args.label_smoothing, padding_value)
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=padding_value)

    train_data = NMTDataset(src_data, tgt_data)

    # release cpu memory
    del src_data
    del tgt_data

    train_loader = DataLoader(train_data,
                              args.batch_size,
                              shuffle=True,
                              pin_memory=True,
                              collate_fn=lambda batch: collate(
                                  batch, padding_value, batch_first=True))

    for i in range(args.start_epoch, args.end_epoch):

        epoch_loss = 0.0

        start_time = time.time()

        steps = 0

        for j, (input_batch, target_batch) in enumerate(train_loader):

            batch_loss = s2s.train_batch(
                input_batch.to(device, non_blocking=True),
                target_batch.to(device, non_blocking=True), criterion,
                optimizer, j, args.update_freq)

            epoch_loss += batch_loss

            steps += 1

        # flush gradients left over from an incomplete accumulation window
        if steps % args.update_freq != 0:
            optimizer.step()
            optimizer.zero_grad()

        epoch_loss /= steps

        epoch_ppl = math.exp(epoch_loss)

        torch.save(save_transformer(s2s, optimizer, args),
                   "{}_{}_{}".format(args.checkpoint, i, steps))
        logging.info(
            "Epoch: {}, time: {} seconds, loss: {}, perplexity: {}".format(
                i,
                time.time() - start_time, epoch_loss, epoch_ppl))
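
# The DataLoader above relies on a collate function that is not shown.
# A plausible sketch, assuming NMTDataset yields (src_tensor, tgt_tensor)
# pairs of token-index tensors (hypothetical; the real collate may differ):
from torch.nn.utils.rnn import pad_sequence

def collate(batch, padding_value, batch_first=False):
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, batch_first=batch_first,
                             padding_value=padding_value)
    tgt_batch = pad_sequence(tgt_batch, batch_first=batch_first,
                             padding_value=padding_value)
    return src_batch, tgt_batch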
Example #4
def train(args):

    setup_seed(args.seed)

    device = torch.device(args.device)

    torch.cuda.set_device(device)

    src_data, src_vocab = load_corpus_data(args.src_path, args.src_language,
                                           args.start_token, args.end_token,
                                           args.mask_token,
                                           args.src_vocab_path,
                                           args.rebuild_vocab, args.unk,
                                           args.threshold)

    tgt_data, tgt_vocab = load_corpus_data(args.tgt_path, args.tgt_language,
                                           args.start_token, args.end_token,
                                           args.mask_token,
                                           args.tgt_vocab_path,
                                           args.rebuild_vocab, args.unk,
                                           args.threshold)

    logging.info("Source language vocab size: {}".format(len(src_vocab)))
    logging.info("Target language vocab size: {}".format(len(tgt_vocab)))

    assert len(src_data) == len(tgt_data)

    if args.sort_sentence_by_length:
        src_data, tgt_data = sort_src_sentence_by_length(
            list(zip(src_data, tgt_data)))

    if args.load:
        logging.info("Load existing model from {}".format(args.load))
        s2s, optimizer_state_dict = load_model(args.load,
                                               training=True,
                                               device=device)
        optimizer = torch.optim.Adam(s2s.parameters(), args.learning_rate)
        optimizer.load_state_dict(optimizer_state_dict)

    else:
        if args.attention_size:

            logging.info("Attention Model")
            encoder = S2S_attention.Encoder(args.rnn_type, len(src_vocab),
                                            args.embedding_size,
                                            args.hidden_size, args.num_layers,
                                            args.dropout, args.bidirectional)
            # decoder input/hidden dims depend on encoder bidirectionality
            encoder_out_dim = (2 * args.hidden_size
                               if args.bidirectional else args.hidden_size)
            attention = S2S_attention.BahdanauAttention(
                encoder_out_dim, args.num_layers * encoder_out_dim,
                args.attention_size)
            decoder = S2S_attention.AttentionDecoder(
                args.rnn_type, len(tgt_vocab), args.embedding_size,
                args.embedding_size + encoder_out_dim, encoder_out_dim,
                args.num_layers, attention, args.dropout)
            s2s = S2S_attention.S2S(encoder, decoder).to(device)

        else:
            logging.info("Basic Model")
            encoder = S2S_basic.Encoder(args.rnn_type, len(src_vocab),
                                        args.embedding_size, args.hidden_size,
                                        args.num_layers, args.dropout,
                                        args.bidirectional)

            encoder_out_dim = (2 * args.hidden_size
                               if args.bidirectional else args.hidden_size)
            decoder = S2S_basic.Decoder(
                args.rnn_type, len(tgt_vocab), args.embedding_size,
                encoder_out_dim, args.num_layers, args.dropout)

            s2s = S2S_basic.S2S(encoder, decoder).to(device)

        optimizer = torch.optim.Adam(s2s.parameters(), args.learning_rate)

    s2s.train()

    padding_value = src_vocab.get_index(args.mask_token)

    assert padding_value == tgt_vocab.get_index(args.mask_token)

    if args.label_smoothing:
        criterion = LabelSmoothingLoss(args.label_smoothing, padding_value)
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=padding_value)

    train_data = NMTDataset(src_data, tgt_data)

    # release cpu memory
    del src_data
    del tgt_data

    train_loader = DataLoader(
        train_data,
        args.batch_size,
        shuffle=True,
        pin_memory=True,
        collate_fn=lambda batch: collate(batch, padding_value))

    for i in range(args.start_epoch, args.end_epoch):

        epoch_loss = 0.0

        start_time = time.time()

        steps = 0

        for j, (input_batch, target_batch) in enumerate(train_loader):

            batch_loss = s2s.train_batch(
                input_batch.to(device, non_blocking=True),
                target_batch.to(device, non_blocking=True), criterion,
                optimizer)

            epoch_loss += batch_loss

            steps += 1

        epoch_loss /= steps

        torch.save(save_model(s2s, optimizer, args),
                   "{}_{}_{}".format(args.checkpoint, i, steps))
        logging.info("Epoch: {}, time: {} seconds, loss: {}".format(
            i,
            time.time() - start_time, epoch_loss))
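
# LabelSmoothingLoss is used throughout these examples but never defined.
# One possible implementation (hypothetical sketch; the original class is
# not shown), matching the call LabelSmoothingLoss(smoothing, padding_value):
import torch
import torch.nn as nn

class LabelSmoothingLoss(nn.Module):
    def __init__(self, smoothing, ignore_index):
        super().__init__()
        self.smoothing = smoothing
        self.ignore_index = ignore_index

    def forward(self, logits, target):
        # logits: (N, vocab_size), target: (N,)
        log_probs = torch.log_softmax(logits, dim=-1)
        n_classes = logits.size(-1)
        # distribute the smoothing mass over the non-target classes
        true_dist = torch.full_like(log_probs, self.smoothing / (n_classes - 1))
        true_dist.scatter_(1, target.unsqueeze(1), 1.0 - self.smoothing)
        # average the loss over non-padding positions only
        mask = target.ne(self.ignore_index)
        return -(true_dist * log_probs).sum(dim=-1)[mask].mean()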
Example #5
def train(local_rank, args):

    setup_seed(args.seed)

    rank = args.nr * args.gpus + local_rank

    src_data, src_vocab = load_corpus_data(args.src_path, args.src_language,
                                           args.start_token, args.end_token,
                                           args.mask_token,
                                           args.src_vocab_path,
                                           args.rebuild_vocab, args.unk,
                                           args.threshold)

    tgt_data, tgt_vocab = load_corpus_data(args.tgt_path, args.tgt_language,
                                           args.start_token, args.end_token,
                                           args.mask_token,
                                           args.tgt_vocab_path,
                                           args.rebuild_vocab, args.unk,
                                           args.threshold)

    assert len(src_data) == len(tgt_data)

    if args.sort_sentence_by_length:
        src_data, tgt_data = sort_src_sentence_by_length(
            list(zip(src_data, tgt_data)))

    logging.info("Source language vocab size: {}".format(len(src_vocab)))
    logging.info("Target language vocab size: {}".format(len(tgt_vocab)))

    torch.distributed.init_process_group(backend="nccl",
                                         init_method=args.init_method,
                                         rank=rank,
                                         world_size=args.world_size)

    device = torch.device("cuda", local_rank)

    torch.cuda.set_device(device)

    if args.load:

        logging.info("Load existing model from {}".format(args.load))
        s2s, optimizer_state_dict = load_model(args.load,
                                               training=True,
                                               device=device)
        s2s = nn.parallel.DistributedDataParallel(s2s, device_ids=[local_rank])
        optimizer = torch.optim.Adam(s2s.parameters(), args.learning_rate)
        optimizer.load_state_dict(optimizer_state_dict)

    else:

        if args.attention_size:

            logging.info("Attention Model")
            encoder = S2S_attention.Encoder(args.rnn_type, len(src_vocab),
                                            args.embedding_size,
                                            args.hidden_size, args.num_layers,
                                            args.dropout, args.bidirectional)
            # decoder input/hidden dims depend on encoder bidirectionality
            encoder_out_dim = (2 * args.hidden_size
                               if args.bidirectional else args.hidden_size)
            attention = S2S_attention.BahdanauAttention(
                encoder_out_dim, args.num_layers * encoder_out_dim,
                args.attention_size)
            decoder = S2S_attention.AttentionDecoder(
                args.rnn_type, len(tgt_vocab), args.embedding_size,
                args.embedding_size + encoder_out_dim, encoder_out_dim,
                args.num_layers, attention, args.dropout)
            s2s = S2S_attention.S2S(encoder, decoder).to(device)

        else:
            logging.info("Basic Model")
            encoder = S2S_basic.Encoder(args.rnn_type, len(src_vocab),
                                        args.embedding_size, args.hidden_size,
                                        args.num_layers, args.dropout,
                                        args.bidirectional)

            encoder_out_dim = (2 * args.hidden_size
                               if args.bidirectional else args.hidden_size)
            decoder = S2S_basic.Decoder(
                args.rnn_type, len(tgt_vocab), args.embedding_size,
                encoder_out_dim, args.num_layers, args.dropout)

            s2s = S2S_basic.S2S(encoder, decoder).to(device)

        s2s = nn.parallel.DistributedDataParallel(s2s, device_ids=[local_rank])
        optimizer = torch.optim.Adam(s2s.parameters(), args.learning_rate)

    s2s.train()

    logging.info("Multi Gpu training: {}".format(local_rank))

    padding_value = src_vocab.get_index(args.mask_token)

    assert padding_value == tgt_vocab.get_index(args.mask_token)

    if args.label_smoothing:
        criterion = LabelSmoothingLoss(args.label_smoothing, padding_value)
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=padding_value)

    train_data = NMTDataset(src_data, tgt_data)

    # release cpu memory
    del src_data
    del tgt_data

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_data, num_replicas=args.world_size, rank=rank)

    train_loader = DataLoader(
        train_data,
        args.batch_size,
        shuffle=False,
        sampler=train_sampler,
        pin_memory=True,
        collate_fn=lambda batch: collate(batch, padding_value),
        drop_last=True)

    for i in range(args.start_epoch, args.end_epoch):

        train_sampler.set_epoch(i)

        epoch_loss = 0.0

        start_time = time.time()

        steps = 0

        for j, (input_batch, target_batch) in enumerate(train_loader):

            input_batch = input_batch.to(device, non_blocking=True)
            target_batch = target_batch.to(device, non_blocking=True)

            output = s2s(input_batch, target_batch)
            del input_batch

            # output: (target_length - 1, batch_size, vocab_size)
            output = torch.stack(output, dim=0)

            # output: ((target_length - 1) * batch_size, vocab_size)
            output = output.view(-1, output.size(-1))

            # target_batch: ((target_length - 1) * batch_size,)
            target_batch = target_batch[1:].contiguous().view(-1)
            batch_loss = criterion(output, target_batch)
            del output
            del target_batch

            optimizer.zero_grad()
            # synchronize all processes
            dist.barrier()

            batch_loss.backward()
            optimizer.step()

            batch_loss = batch_loss.item()

            epoch_loss += batch_loss
            steps += 1

        epoch_loss /= steps

        if local_rank == 0:
            torch.save(save_model(s2s, optimizer, args),
                       "{}_{}_{}".format(args.checkpoint, i, steps))
        logging.info(
            "Epoch: {}, time: {} seconds, loss: {}, local rank: {}".format(
                i,
                time.time() - start_time, epoch_loss, local_rank))

    torch.save(save_model(s2s, optimizer, args),
               args.checkpoint + "_rank{}".format(local_rank))
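
# The distributed train(local_rank, args) functions above expect one process
# per GPU; rank = args.nr * args.gpus + local_rank implies args.nr is the node
# index. A hypothetical launcher sketch (parse_args and args.nodes are
# assumptions, not part of the original code):
import torch.multiprocessing as mp

if __name__ == "__main__":
    args = parse_args()                       # assumed CLI parser
    args.world_size = args.gpus * args.nodes  # one process per GPU per node
    mp.spawn(train, nprocs=args.gpus, args=(args,))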
Example #6
def main(seed, args):
    '''main function'''
    setup_seed(seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
    mode = args.mode
    dataset_name = args.dataset_name
    labelIdxStart0or1 = args.labelIdxStart0or1
    root_dir = args.root_dir
    graph_datadir = args.graph_datadir
    all_visualFea_label_file = args.all_visualFea_label_file
    auxiliary_file = args.auxiliary_file
    batch_size = args.batch_size
    weight_decay = args.weight_decay
    use_pca = args.use_pca
    reduced_dim_pca = args.reduced_dim_pca

    zsl_dataset = ZSL_Dataset(root_dir, dataset_name, mode, all_visualFea_label_file, auxiliary_file, use_pca, reduced_dim_pca)
    zsl_dataloader = data.DataLoader(zsl_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    print('data is ready!')

    vi_fea_dim = zsl_dataset.vis_fea_dim
    se_fea_dim = zsl_dataset.sem_fea_dim
    n_tr_class = zsl_dataset.n_tr_class

    te_data_unseen, te_data_seen = zsl_dataset.get_testData()
    te_vis_fea_unseen, te_sem_fea_unseen, te_label_unseen, te_labelID_unseen, te_sem_fea_pro_unseen = te_data_unseen
    te_vis_fea_seen, te_sem_fea_seen, te_label_seen, te_labelID_seen, te_sem_fea_pro_seen = te_data_seen
    tr_vis_fea, tr_sem_fea, all_tr_label, tr_labelID, tr_sem_fea_pro = zsl_dataset.get_trainData()    # for debugging
    all_labels = zsl_dataset.all_labels

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    save_subdir = dataset_name
    save_dir = os.path.join(args.save_dir, save_subdir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    adj_matrix = load_graph(dataset_name, graph_datadir)
    weight_threshold = args.weight_threshold
    adj_lists = adjMatrix2adjLists(adj_matrix, weight_threshold)

    use_z = args.use_z.lower()
    z_dim = args.z_dim
    if use_z == 'true':
        netG = _netG(se_fea_dim, vi_fea_dim, z_dim).to(device)
    else:
        netG = _netG2(se_fea_dim, vi_fea_dim).to(device)

    gan_checkpoint_dir = args.gan_checkpoint_dir
    gan_checkpoint_name = args.gan_checkpoint
    sem_fea_pro = zsl_dataset.all_prototype_semantic_feature
    unseen_classes = te_labelID_unseen
    n_gene_perC = args.n_gene_perC
    generated_vis_fea_dict = get_fake_unseen_visual_feat(netG, dataset_name, gan_checkpoint_dir, gan_checkpoint_name,
                                                         use_z, z_dim, sem_fea_pro, unseen_classes, n_gene_perC=n_gene_perC)
    
    for k in generated_vis_fea_dict.keys():
        gen_vis_fea_list = generated_vis_fea_dict[k]
        gen_vis_fea = np.vstack(gen_vis_fea_list)
        n_fake_instances = len(gen_vis_fea_list)
        assert gen_vis_fea.shape == (n_fake_instances, vi_fea_dim)
        tr_vis_fea = np.vstack((tr_vis_fea, gen_vis_fea))
        gen_labels = [k for _ in range(n_fake_instances)]
        gen_labels = np.array(gen_labels)
        all_tr_label = np.hstack((all_tr_label, gen_labels))

    assert len(tr_vis_fea) == len(all_tr_label)
    
    print('building dicts...')
    instanceIdx2classIdx = dict()
    classIdx2instanceIdx = defaultdict(set)
    for instanceIdx, classIdx in enumerate(all_labels):
        instanceIdx2classIdx[instanceIdx] = classIdx
        classIdx2instanceIdx[classIdx].add(instanceIdx)
    
    instanceIdx2classIdx_zsl_train = dict()
    classIdx2instanceIdx_zsl_train = defaultdict(set)
    for instanceIdx, classIdx in enumerate(all_tr_label):
        instanceIdx2classIdx_zsl_train[instanceIdx] = classIdx
        classIdx2instanceIdx_zsl_train[classIdx].add(instanceIdx)

    instanceIdx2classIdx_zsl_test_seen = dict()
    classIdx2instanceIdx_zsl_test_seen = defaultdict(set)
    for instanceIdx, classIdx in enumerate(te_label_seen):
        instanceIdx2classIdx_zsl_test_seen[instanceIdx] = classIdx
        classIdx2instanceIdx_zsl_test_seen[classIdx].add(instanceIdx)

    instanceIdx2classIdx_zsl_test_unseen = dict()
    classIdx2instanceIdx_zsl_test_unseen = defaultdict(set)
    for instanceIdx, classIdx in enumerate(te_label_unseen):
        instanceIdx2classIdx_zsl_test_unseen[instanceIdx] = classIdx
        classIdx2instanceIdx_zsl_test_unseen[classIdx].add(instanceIdx)
    print('build done!')

    # use visual feature as initial input
    firstHop_featureFunc = zsl_dataset.get_firstHop_featureFunc_visual_zsl_train()
    agg1 = MeanAggregator(firstHop_featureFunc).to(device)
    enc1 = Encoder(firstHop_featureFunc, vi_fea_dim, 128, adj_lists, agg1,
                   instanceIdx2classIdx_zsl_train, classIdx2instanceIdx_zsl_train,
                   instanceIdx2classIdx_zsl_test_seen, classIdx2instanceIdx_zsl_test_seen,
                   instanceIdx2classIdx_zsl_test_unseen, classIdx2instanceIdx_zsl_test_unseen,
                   classIdx2instanceIdx,
                   generated_vis_fea_dict,
                   mode='train', seen_labelID_set=tr_labelID,
                   gcn_style=True).to(device)
    agg2 = MeanAggregator(lambda nodes : enc1(nodes).t()).to(device)
    enc2 = Encoder(lambda nodes : enc1(nodes).t(), enc1.embed_dim, 128, adj_lists, agg2,
                   instanceIdx2classIdx_zsl_train, classIdx2instanceIdx_zsl_train,
                   instanceIdx2classIdx_zsl_test_seen, classIdx2instanceIdx_zsl_test_seen,
                   instanceIdx2classIdx_zsl_test_unseen, classIdx2instanceIdx_zsl_test_unseen,
                   classIdx2instanceIdx,
                   generated_vis_fea_dict,
                   mode='train', seen_labelID_set=tr_labelID,
                   base_model=enc1, gcn_style=True).to(device)
    enc1.num_samples = 10
    enc2.num_samples = 10
    nets = [agg1, enc1, agg2, enc2]
    n_classes = zsl_dataset.n_classes
    graphsage = SupervisedGraphSage(n_classes, enc1, enc1.embed_dim).to(device)
    nets_weights_init([graphsage])
    lr = args.lr
    optimizer = torch.optim.Adam(filter(lambda p : p.requires_grad, graphsage.parameters()), lr=lr)
    lr_maker  = lr_scheduler.StepLR(optimizer=optimizer, step_size=1000, gamma=0.9)

    print('start training...')
    best_acc_test_unseen = 0
    best_acc_test_seen = 0
    print_every = args.print_every
    eval_every = args.eval_every
    
    tr_rand_indices = np.random.permutation(len(all_tr_label))
    te_seen_indices = np.array(list(range(len(te_label_seen))))
    te_unseen_indices = np.array(list(range(len(te_label_unseen))))

    eps = np.finfo(float).eps
    for batch in range(args.n_iteration):
        graphsage.train()
        batch_nodes = tr_rand_indices[:batch_size]
        random.shuffle(tr_rand_indices)
        
        # "cg" for "class-level graph"
        cg_loss = graphsage.loss(batch_nodes, torch.LongTensor(all_tr_label[np.array(batch_nodes)]).to(device))
        batch_labels = all_tr_label[np.array(batch_nodes)]
        _, batch_embeddings, batch_sigma = graphsage(batch_nodes)
        batch_embeddings = torch.t(batch_embeddings)
        N = batch_embeddings.size(0)
        emb_dim = batch_embeddings.size(-1)

        support_ratio = 0.5
        n_support = int(N * support_ratio)
        n_query = N - n_support
        s_labels = batch_labels[:n_support]
        q_labels = batch_labels[n_support:]
        s_labels = torch.from_numpy(s_labels).long()
        q_labels = torch.from_numpy(q_labels).long()
        total_n_classes = zsl_dataset.total_n_classes
        s_labels_onehot = torch.zeros(n_support, total_n_classes).scatter_(1, s_labels.view(-1, 1), 1).to(device)
        q_labels_onehot = torch.zeros(n_query,   total_n_classes).scatter_(1, q_labels.view(-1, 1), 1).to(device)

        F, Fq = LP(batch_embeddings, batch_sigma, args.top_k, s_labels_onehot, total_n_classes)

        ce = nn.CrossEntropyLoss().to(device)
        
        gt = torch.argmax(torch.cat((s_labels_onehot, q_labels_onehot), 0), 1)
        meta_loss = ce(F, gt)    # dual LP: combine the loss of both support and query
        optimizer.zero_grad()
        total_loss = cg_loss + args.lambda_lploss * meta_loss
        total_loss.backward()
        optimizer.step()
        lr_maker.step()  # advance the LR schedule after the optimizer step

        # calculate acc
        predq = torch.argmax(Fq, 1)
        gtq   = torch.argmax(q_labels_onehot, 1)
        correct = (predq == gtq).sum()
        total   = n_query
        acc = 1.0 * correct.float() / float(total)
        if batch % print_every == 0 and batch > 0:
            print('iter: {:4d}/{}  cg_loss: {:.6f}  meta_loss: {:.6f}  acc: {:.6f}  lr: {:.8f}'.format(batch, args.n_iteration, cg_loss, meta_loss, acc, optimizer.param_groups[0]['lr']))

        if batch % eval_every == 0 and batch > 0:
            graphsage.eval()

            ## test_seen
            graphsage.encoder.mode = 'test_seen'
            enc1.mode = 'test_seen'
            agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_test_seen()
            te_seen_output, test_seen_embeddings, _ = graphsage.forward(te_seen_indices)
            te_seen_output = te_seen_output.cpu()
            te_seen_acc = cls_acc(te_seen_output.data.numpy(), te_label_seen)
            cprint('te_seen_acc: {:.6f}'.format(te_seen_acc), 'yellow')
            if te_seen_acc > best_acc_test_seen:
                best_acc_test_seen = te_seen_acc

            ## test_unseen
            graphsage.encoder.mode = 'test_unseen'
            enc1.mode = 'test_unseen'
            agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_train()
            te_unseen_output, test_unseen_embeddings, _ = graphsage.forward(te_unseen_indices)
            te_unseen_output = te_unseen_output.cpu()
            te_unseen_acc = cls_acc(te_unseen_output.data.numpy(), te_label_unseen)
            cprint('te_unseen_acc: {:.6f}'.format(te_unseen_acc), 'yellow')
            if te_unseen_acc > best_acc_test_unseen:
                best_acc_test_unseen = te_unseen_acc
                save_dict = {
                    'iteration' : (batch + 1),
                    'state_dict': graphsage.state_dict(),
                    'acc_unseen': best_acc_test_unseen,
                    'acc_seen'  : te_seen_acc,
                }
                checkpoint_name = 'checkpoint_' + dataset_name + '_iter' + str(batch + 1) + '_accUnseen%.4lf_accSeen%.4lf.pkl' % (best_acc_test_unseen, te_seen_acc)
                checkpoint_path = os.path.join(save_dir, checkpoint_name)
                cprint('saving ' + checkpoint_name + ' in ' + save_dir + '...', 'green')
                torch.save(save_dict, checkpoint_path)

            if args.use_LP_eval == 'true':
                ## support
                graphsage.encoder.mode = 'train'
                enc1.mode = 'train'
                agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_train()
                
                expanded_tr_labels = all_tr_label
                n_expanded_tr = expanded_tr_labels.shape[0]
                expanded_tr_labels = torch.from_numpy(expanded_tr_labels)
                expanded_tr_nodes  = list(range(len(expanded_tr_labels)))
                expanded_tr_nodes  = np.array(expanded_tr_nodes)
                _, expanded_tr_embeddings, expanded_tr_sigma = graphsage(expanded_tr_nodes)
                expanded_tr_embeddings = torch.t(expanded_tr_embeddings)

                n_support = batch_size * 4  # larger is better, as far as GPU memory allows (upper bound: the number of expanded training instances)
                eval_batch_size = int(batch_size / 2)
                n_query = eval_batch_size
                shuffled_expanded_tr_idxs = np.random.permutation(n_expanded_tr)
                shuffled_idxs = shuffled_expanded_tr_idxs[:n_support]
                eval_s_labels = expanded_tr_labels[shuffled_idxs]
                eval_s_embeds = expanded_tr_embeddings[shuffled_idxs]
                s_labels_onehot = torch.zeros(n_support, total_n_classes).scatter_(1, eval_s_labels.view(-1, 1), 1).to(device)

                ## test seen
                graphsage.encoder.mode = 'test_seen'
                enc1.mode = 'test_seen'
                agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_test_seen()

                n_te_seen = len(te_label_seen)
                eval_times__te_seen = n_te_seen // n_query
                te_seen_correct = 0
                te_seen_total = 0
                test_seen_embeddings = torch.t(test_seen_embeddings)
                for k in range(eval_times__te_seen):
                    te_seen_q_embeds = test_seen_embeddings[(k * n_query) : ((k + 1) * n_query)]
                    te_seen_q_labels = te_label_seen[(k * n_query) : ((k + 1) * n_query)]
                    _embeds = torch.cat((eval_s_embeds, te_seen_q_embeds), 0)
                    _sigma = graphsage.cal_sigma(_embeds)
                    F, Fq = LP(_embeds, _sigma, args.top_k, s_labels_onehot, total_n_classes)

                    te_seen_q_labels = torch.from_numpy(te_seen_q_labels).long()
                    te_seen_q_labels_onehot = torch.zeros(n_query, total_n_classes).scatter_(1, te_seen_q_labels.view(-1, 1), 1).to(device)

                    # calculate acc
                    predq = torch.argmax(Fq, 1)
                    gtq   = torch.argmax(te_seen_q_labels_onehot, 1)
                    te_seen_correct += (predq == gtq).sum()
                    te_seen_total   += n_query

                te_seen_acc_LP = 1.0 * te_seen_correct.float() / float(te_seen_total)
                cprint('[LP] te_seen_acc: {:.6f}'.format(te_seen_acc_LP), 'yellow')


                ## test unseen
                graphsage.encoder.mode = 'test_unseen'
                enc1.mode = 'test_unseen'
                agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_train()

                n_te_unseen = len(te_label_unseen)
                eval_times__te_unseen = n_te_unseen // n_query
                te_unseen_correct = 0
                te_unseen_total = 0
                test_unseen_embeddings = torch.t(test_unseen_embeddings)
                for k in range(eval_times__te_unseen):
                    te_unseen_q_embeds = test_unseen_embeddings[(k * n_query) : ((k + 1) * n_query)]
                    te_unseen_q_labels = te_label_unseen[(k * n_query) : ((k + 1) * n_query)]
                    _embeds = torch.cat((eval_s_embeds, te_unseen_q_embeds), 0)
                    _sigma = graphsage.cal_sigma(_embeds)
                    F, Fq = LP(_embeds, _sigma, args.top_k, s_labels_onehot, total_n_classes)
                    
                    te_unseen_q_labels = torch.from_numpy(te_unseen_q_labels).long()
                    te_unseen_q_labels_onehot = torch.zeros(n_query, total_n_classes).scatter_(1, te_unseen_q_labels.view(-1, 1), 1).to(device)

                    # calculate acc
                    predq = torch.argmax(Fq, 1)
                    gtq   = torch.argmax(te_unseen_q_labels_onehot, 1)
                    te_unseen_correct += (predq == gtq).sum()
                    te_unseen_total   += n_query

                te_unseen_acc_LP = 1.0 * te_unseen_correct.float() / float(te_unseen_total)
                cprint('[LP] te_unseen_acc: {:.6f}'.format(te_unseen_acc_LP), 'red')

            # recover to the training mode
            graphsage.encoder.mode = 'train'
            enc1.mode = 'train'
            agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_train()

    print('Final:' + '%' * 50)
    graphsage.encoder.mode = 'test_seen'
    enc1.mode = 'test_seen'
    agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_test_seen()
    te_seen_output, embeddings, _ = graphsage.forward(te_seen_indices)
    te_seen_output = te_seen_output.cpu()
    te_seen_acc = cls_acc(te_seen_output.data.numpy(), te_label_seen)
    print('test_seen_acc: {:.6f}'.format(te_seen_acc))
    if te_seen_acc > best_acc_test_seen:
        best_acc_test_seen = te_seen_acc
    print('best acc of test_seen data: {:.6f}'.format(best_acc_test_seen))

    graphsage.encoder.mode = 'test_unseen'
    enc1.mode = 'test_unseen'
    agg1.features_func = zsl_dataset.get_firstHop_featureFunc_visual_zsl_train()
    te_unseen_output, embeddings, _ = graphsage.forward(te_unseen_indices)
    te_unseen_output = te_unseen_output.cpu()
    te_unseen_acc = cls_acc(te_unseen_output.data.numpy(), te_label_unseen)
    print('test_UNseen_acc: {:.6f}'.format(te_unseen_acc))
    if te_unseen_acc > best_acc_test_unseen:
        best_acc_test_unseen = te_unseen_acc
    print('best acc of test_UNseen data: {:.6f}'.format(best_acc_test_unseen))
    print('%' * 56)
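
# adjMatrix2adjLists above is not shown; a plausible sketch (hypothetical)
# that thresholds a weighted adjacency matrix into GraphSAGE-style neighbor
# sets (node index -> set of neighbor indices):
from collections import defaultdict
import numpy as np

def adjMatrix2adjLists(adj_matrix, weight_threshold):
    adj_lists = defaultdict(set)
    rows, cols = np.where(adj_matrix > weight_threshold)
    for i, j in zip(rows, cols):
        if i != j:  # drop self-loops
            adj_lists[int(i)].add(int(j))
    return adj_lists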