Ejemplo n.º 1
0
                model=copy.deepcopy(global_model), global_round=epoch)
            local_weights.append(copy.deepcopy(w))
            local_losses.append(copy.deepcopy(loss))

        # update global weights
        global_weights = average_weights(local_weights)

        # update global weights
        global_model.load_state_dict(global_weights)

        loss_avg = sum(local_losses) / len(local_losses)
        train_loss.append(loss_avg)

        # Calculate avg training accuracy over all users at every epoch
        list_acc, list_loss = [], []
        global_model.eval()
        for c in range(args.num_users):
            local_model = LocalUpdate(args=args,
                                      dataset=train_dataset,
                                      idxs=user_groups[idx],
                                      logger=logger)
            acc, loss = local_model.inference(model=global_model)
            list_acc.append(acc)
            list_loss.append(loss)
        train_accuracy.append(sum(list_acc) / len(list_acc))

        # print global training loss after every 'i' rounds
        if (epoch + 1) % print_every == 0:
            print(f' \nAvg Training Stats after {epoch+1} global rounds:')
            print(f'Training Loss : {np.mean(np.array(train_loss))}')
            print('Train Accuracy: {:.2f}% \n'.format(100 *
Ejemplo n.º 2
0
def main():
    start_time = time.time()

    # define paths
    path_project = os.path.abspath('..')
    logger = SummaryWriter('../logs')

    args = args_parser()
    exp_details(args)

    if args.gpu:
        torch.cuda.set_device(0)
    device = 'cuda' if args.gpu else 'cpu'

    # load dataset and user groups
    train_dataset, test_dataset, user_groups = get_dataset(args)

    args.num_users = len(user_groups)

    # BUILD MODEL
    if args.model == 'cnn':
        # Convolutional neural netork
        if args.dataset == 'mnist':
            global_model = CNNMnist(args=args)
        elif args.dataset == 'fmnist':
            global_model = CNNFashion_Mnist(args=args)
        elif args.dataset == 'cifar':
            global_model = CNNCifar(args=args)

    elif args.model == 'mlp':
        # Multi-layer preceptron
        img_size = train_dataset[0][0].shape
        len_in = 1
        for x in img_size:
            len_in *= x
            global_model = MLP(dim_in=len_in,
                               dim_hidden=64,
                               dim_out=args.num_classes)
    else:
        exit('Error: unrecognized model')

    # Set the model to train and send it to device.
    global_model.to(device)
    global_model.train()

    # copy weights
    global_weights = global_model.state_dict()

    # Training
    train_loss, train_accuracy = [], []
    val_acc_list, net_list = [], []
    cv_loss, cv_acc = [], []
    print_every = 2
    val_loss_pre, counter = 0, 0

    #Beolvassuk, hogy éppen mely résztvevők vesznek részt a tanításban (0 jelentése, hogy benne van, 1 az hogy nincs)
    users = []
    fp = open('users.txt', "r")
    x = fp.readline().split(' ')
    for i in x:
        if i != '':
            users.append(int(i))
    fp.close()

    #for epoch in tqdm(range(args.epochs)):
    for epoch in range(args.epochs):
        local_weights, local_losses = [], []
        #print(f'\n | Global Training Round : {epoch+1} |\n')

        global_model.train()
        m = max(int(args.frac * args.num_users), 1)
        idxs_users = np.random.choice(range(args.num_users), m, replace=False)

        for idx in idxs_users:
            local_model = LocalUpdate(args=args,
                                      dataset=train_dataset,
                                      idxs=user_groups[idx],
                                      logger=logger)
            w, loss = local_model.update_weights(
                model=copy.deepcopy(global_model), global_round=epoch)
            local_weights.append(copy.deepcopy(w))
            local_losses.append(copy.deepcopy(loss))

        global_weights = average_weights(local_weights)

        # update global weights
        global_model.load_state_dict(global_weights)

        loss_avg = sum(local_losses) / len(local_losses)
        train_loss.append(loss_avg)

        # Calculate avg training accuracy over all users at every epoch
        list_acc, list_loss = [], []
        global_model.eval()
        for c in range(args.num_users):
            local_model = LocalUpdate(args=args,
                                      dataset=train_dataset,
                                      idxs=user_groups[idx],
                                      logger=logger)
            acc, loss = local_model.inference(model=global_model)
            list_acc.append(acc)
            list_loss.append(loss)
        train_accuracy.append(sum(list_acc) / len(list_acc))

        # print global training loss after every 'i' rounds
        '''if (epoch+1) % print_every == 0:
            print(f' \nAvg Training Stats after {epoch+1} global rounds:')
            print(f'Training Loss : {np.mean(np.array(train_loss))}')
            print('Train Accuracy: {:.2f}% \n'.format(100*train_accuracy[-1]))'''

    # Test inference after completion of training

    #Beolvassuk hogy mely résztvevőnek mely labeleket osztottuk ki.
    ftrain = open('traindataset.txt')
    testlabels = []
    line = ftrain.readline()
    while line != "":
        sor = line.split(' ')
        array = []
        for i in sor:
            array.append(int(i))
        testlabels.append(array)
        line = ftrain.readline()
    ftrain.close()

    print("USERS LABELS")
    print(testlabels)

    #Minden lehetséges koalícióra lefut a tesztelés
    for j in range((2**args.num_users) - 1):
        binary = numberToBinary(j, len(users))

        test_acc, test_loss = test_inference(args, global_model, test_dataset,
                                             testlabels, binary, len(binary))

        #Teszt eredmények kiírása
        print("RESZTVEVOK")
        print(users)
        print("TEST NUMBER")
        print(j)
        print("TEST BINARY")
        print(binary)
        print("TEST LABELS")
        print(testlabels)
        print("Test Accuracy")
        print("{:.2f}%".format(100 * test_acc))
        print()

    # Saving the objects train_loss and train_accuracy:
    '''file_name = '../save/objects/{}_{}_{}_C[{}]_iid[{}]_E[{}]_B[{}].pkl'.\
Ejemplo n.º 3
0
def main_test(args):
    start_time = time.time()
    now = datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')
    # define paths

    logger = SummaryWriter('../logs')

    # easydict 사용하는 경우 주석처리
    # args = args_parser()

    # checkpoint 생성위치
    args.save_path = os.path.join(args.save_path, args.exp_folder)
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    save_path_tmp = os.path.join(args.save_path, 'tmp_{}'.format(now))
    if not os.path.exists(save_path_tmp):
        os.makedirs(save_path_tmp)
    SAVE_PATH = os.path.join(args.save_path, '{}_{}_T[{}]_C[{}]_iid[{}]_E[{}]_B[{}]'.
                             format(args.dataset, args.model, args.epochs, args.frac, args.iid,
                                    args.local_ep, args.local_bs))

    # 시드 고정
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)



#    torch.cuda.set_device(0)
    device = torch.device("cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu")
    cpu_device = torch.device('cpu')
    # log 파일 생성
    log_path = os.path.join('../logs', args.exp_folder)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    loggertxt = get_logger(
        os.path.join(log_path, '{}_{}_{}_{}.log'.format(args.model, args.optimizer, args.norm, now)))
    logging.info(args)
    # csv
    csv_save = '../csv/' + now
    csv_path = os.path.join(csv_save, 'accuracy.csv')
    csv_logger_keys = ['train_loss', 'accuracy']
    csvlogger = CSVLogger(csv_path, csv_logger_keys)

    # load dataset and user groups
    train_dataset, test_dataset, client_loader_dict = get_dataset(args)

    # cifar-100의 경우 자동 설정
    if args.dataset == 'cifar100':
        args.num_classes = 100
    # BUILD MODEL
    if args.model == 'cnn':
        # Convolutional neural network
        if args.dataset == 'mnist':
            global_model = CNNMnist(args=args)
        elif args.dataset == 'fmnist':
            global_model = CNNFashion_Mnist(args=args)
        elif args.dataset == 'cifar':
            global_model = CNNCifar(args=args)
        elif args.dataset == 'cifar100':
            global_model = CNNCifar(args=args)

    elif args.model == 'mlp':
        # Multi-layer preceptron
        img_size = train_dataset[0][0].shape
        len_in = 1
        for x in img_size:
            len_in *= x
            global_model = MLP(dim_in=len_in, dim_hidden=64,
                               dim_out=args.num_classes)
    elif args.model == 'cnn_vc':
        global_model = CNNCifar_fedVC(args=args)
    elif args.model == 'cnn_vcbn':
        global_model = CNNCifar_VCBN(args=args)
    elif args.model == 'cnn_vcgn':
        global_model = CNNCifar_VCGN(args=args)
    elif args.model == 'resnet18_ws':
        global_model = resnet18(num_classes=args.num_classes, weight_stand=1)
    elif args.model == 'resnet18':
        global_model = resnet18(num_classes=args.num_classes, weight_stand=0)
    elif args.model == 'resnet32':
        global_model = ResNet32_test(num_classes=args.num_classes)
    elif args.model == 'resnet18_mabn':
        global_model = resnet18_mabn(num_classes=args.num_classes)
    elif args.model == 'vgg':
        global_model = vgg11()
    elif args.model == 'cnn_ws':
        global_model = CNNCifar_WS(args=args)


    else:
        exit('Error: unrecognized model')

    # Set the model to train and send it to device.
    loggertxt.info(global_model)
    # fedBN처럼 gn no communication 용
    client_models = [copy.deepcopy(global_model) for idx in range(args.num_users)]

    # copy weights
    global_weights = global_model.state_dict()

    global_model.to(device)
    global_model.train()

    # Training
    train_loss, train_accuracy = [], []
    val_acc_list, net_list = [], []


    # how does help BN 확인용
    client_loss = [[] for i in range(args.num_users)]
    client_conv_grad = [[] for i in range(args.num_users)]
    client_fc_grad = [[] for i in range(args.num_users)]
    client_total_grad_norm = [[] for i in range(args.num_users)]
    # 전체 loss 추적용 -how does help BN

    # 재시작
    if args.resume:
        checkpoint = torch.load(SAVE_PATH)
        global_model.load_state_dict(checkpoint['global_model'])
        if args.hold_normalize:
            for client_idx in range(args.num_users):
                client_models[client_idx].load_state_dict(checkpoint['model_{}'.format(client_idx)])
        else:
            for client_idx in range(args.num_users):
                client_models[client_idx].load_state_dict(checkpoint['global_model'])
        resume_iter = int(checkpoint['a_iter']) + 1
        print('Resume trainig form epoch {}'.format(resume_iter))
    else:
        resume_iter = 0


    # learning rate scheduler
    #scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, gamma=0.1,step_size=500)

    # start training
    for epoch in tqdm(range(args.epochs)):
        local_weights, local_losses = [], []
        if args.verbose:
            print(f'\n | Global Training Round : {epoch + 1} |\n')

        global_model.train()
        m = max(int(args.frac * args.num_users), 1)
        idxs_users = np.random.choice(range(args.num_users), m, replace=False)


        for idx in idxs_users:
            """
            for key in global_model.state_dict().keys():
                if args.hold_normalize:
                    if 'bn' not in key:
                        client_models[idx].state_dict()[key].data.copy_(global_model.state_dict()[key])
                else:
                    client_models[idx].state_dict()[key].data.copy_(global_model.state_dict()[key])
            """
            torch.cuda.empty_cache()


            local_model = LocalUpdate(args=args, logger=logger, train_loader=client_loader_dict[idx], device=device)
            w, loss, batch_loss, conv_grad, fc_grad, total_gard_norm = local_model.update_weights(
                model=copy.deepcopy(global_model), global_round=epoch, idx_user=idx)
            local_weights.append(copy.deepcopy(w))
            # client의 1 epoch에서의 평균 loss값  ex)0.35(즉, batch loss들의 평균)
            local_losses.append(copy.deepcopy(loss))

            # 전체 round scheduler
          #  scheduler.step()
            # loss graph용 -> client당 loss값 진행 저장 -> 모두 client별로 저장.
            client_loss[idx].append(batch_loss)
            client_conv_grad[idx].append(conv_grad)
            client_fc_grad[idx].append(fc_grad)
            client_total_grad_norm[idx].append(total_gard_norm)

            # print(total_gard_norm)
            # gn, bn 복사
            # client_models[idx].load_state_dict(w)
            del local_model
            del w
        # update global weights
        global_weights = average_weights(local_weights, client_loader_dict, idxs_users)
        # update global weights
#        opt = OptRepo.name2cls('adam')(global_model.parameters(), lr=0.01, betas=(0.9, 0.99), eps=1e-3)
        opt = OptRepo.name2cls('sgd')(global_model.parameters(), lr=10, momentum=0.9)
        opt.zero_grad()
        opt_state = opt.state_dict()
        global_weights = aggregation(global_weights, global_model)
        global_model.load_state_dict(global_weights)
        opt = OptRepo.name2cls('sgd')(global_model.parameters(), lr=10, momentum=0.9)
#        opt = OptRepo.name2cls('adam')(global_model.parameters(), lr=0.01, betas=(0.9, 0.99), eps=1e-3)
        opt.load_state_dict(opt_state)
        opt.step()
        loss_avg = sum(local_losses) / len(local_losses)
        train_loss.append(loss_avg)

        global_model.eval()
        #        for c in range(args.num_users):
        #            local_model = LocalUpdate(args=args, dataset=train_dataset,
        #                                      idxs=user_groups[idx], logger=logger)
        #            acc, loss = local_model.inference(model=global_model)
        #            list_acc.append(acc)
        #            list_loss.append(loss)
        #        train_accuracy.append(sum(list_acc)/len(list_acc))
        train_accuracy = test_inference(args, global_model, test_dataset, device=device)
        val_acc_list.append(train_accuracy)
        # print global training loss after every 'i' rounds
        # if (epoch+1) % print_every == 0:
        loggertxt.info(f' \nAvg Training Stats after {epoch + 1} global rounds:')
        loggertxt.info(f'Training Loss : {loss_avg}')
        loggertxt.info('Train Accuracy: {:.2f}% \n'.format(100 * train_accuracy))
        csvlogger.write_row([loss_avg, 100 * train_accuracy])
        if (epoch + 1) % 100 == 0:
            tmp_save_path = os.path.join(save_path_tmp, 'tmp_{}.pt'.format(epoch+1))
            torch.save(global_model.state_dict(),tmp_save_path)
    # Test inference after completion of training
    test_acc = test_inference(args, global_model, test_dataset, device=device)

    print(' Saving checkpoints to {}...'.format(SAVE_PATH))
    if args.hold_normalize:
        client_dict = {}
        for idx, model in enumerate(client_models):
            client_dict['model_{}'.format(idx)] = model.state_dict()
        torch.save(client_dict, SAVE_PATH)
    else:
        torch.save({'global_model': global_model.state_dict()}, SAVE_PATH)

    loggertxt.info(f' \n Results after {args.epochs} global rounds of training:')
    # loggertxt.info("|---- Avg Train Accuracy: {:.2f}%".format(100*train_accuracy[-1]))
    loggertxt.info("|---- Test Accuracy: {:.2f}%".format(100 * test_acc))


    # frac이 1이 아닐경우 잘 작동하지않음.
    # batch_loss_list = np.array(client_loss).sum(axis=0) / args.num_users

    # conv_grad_list = np.array(client_conv_grad).sum(axis=0) / args.num_users
    # fc_grad_list = np.array(client_fc_grad).sum(axis=0) / args.num_users
    # total_grad_list = np.array(client_total_grad_norm).sum(axis=0) /args.num_users
    # client의 avg를 구하고 싶었으나 현재는 client 0만 확인
    # client마다 batch가 다를 경우 bug 예상
    return train_loss, val_acc_list, client_loss[0], client_conv_grad[0], client_fc_grad[0], client_total_grad_norm[0]
Ejemplo n.º 4
0
def main():
    start_time = time.time()

    # define paths
    path_project = os.path.abspath('..')
    logger = SummaryWriter('../logs')
    args = args_parser()
    args = adatok.arguments(args)
    exp_details(args)
    if args.gpu:
        torch.cuda.set_device(args.gpu)
    device = 'cuda' if args.gpu else 'cpu'

    # load dataset and user groups
    train_dataset, test_dataset, user_groups = get_dataset(args)

    if adatok.data.image_initialization == True:
        adatok.data.image_initialization = False
        return

    # BUILD MODEL
    if args.model == 'cnn':
        # Convolutional neural netork
        if args.dataset == 'mnist':
            global_model = CNNMnist(args=args)
        elif args.dataset == 'fmnist':
            global_model = CNNFashion_Mnist(args=args)
        elif args.dataset == 'cifar':
            global_model = CNNCifar(args=args)

    elif args.model == 'mlp':
        # Multi-layer preceptron
        img_size = train_dataset[0][0].shape
        len_in = 1
        for x in img_size:
            len_in *= x
            global_model = MLP(dim_in=len_in,
                               dim_hidden=64,
                               dim_out=args.num_classes)
    else:
        exit('Error: unrecognized model')

    # Set the model to train and send it to device.
    global_model.to(device)
    global_model.train()
    #print(global_model)

    # copy weights
    global_weights = global_model.state_dict()

    # Training
    train_loss, train_accuracy = [], []
    val_acc_list, net_list = [], []
    cv_loss, cv_acc = [], []
    print_every = 2
    val_loss_pre, counter = 0, 0

    for epoch in tqdm(range(args.epochs)):
        local_weights, local_losses = [], []
        #print(f'\n | Global Training Round : {epoch+1} |\n')

        global_model.train()
        m = max(int(args.frac * args.num_users), 1)
        idxs_users = np.random.choice(range(args.num_users), m, replace=False)

        for idx in idxs_users:
            local_model = LocalUpdate(args=args,
                                      dataset=train_dataset,
                                      idxs=user_groups[idx],
                                      logger=logger)
            w, loss = local_model.update_weights(
                model=copy.deepcopy(global_model), global_round=epoch)
            local_weights.append(copy.deepcopy(w))
            local_losses.append(copy.deepcopy(loss))

        # update global weights
        global_weights = average_weights(local_weights)

        # update global weights
        global_model.load_state_dict(global_weights)

        loss_avg = sum(local_losses) / len(local_losses)
        train_loss.append(loss_avg)

        # Calculate avg training accuracy over all users at every epoch
        list_acc, list_loss = [], []
        global_model.eval()
        for c in range(args.num_users):
            local_model = LocalUpdate(args=args,
                                      dataset=train_dataset,
                                      idxs=user_groups[idx],
                                      logger=logger)
            acc, loss = local_model.inference(model=global_model)
            list_acc.append(acc)
            list_loss.append(loss)
        train_accuracy.append(sum(list_acc) / len(list_acc))

        # print global training loss after every 'i' rounds
        '''if (epoch+1) % print_every == 0:
            print(f' \nAvg Training Stats after {epoch+1} global rounds:')
            print(f'Training Loss : {np.mean(np.array(train_loss))}')
            print('Train Accuracy: {:.2f}% \n'.format(100*train_accuracy[-1]))'''

        # Test inference after completion of training
        for i in adatok.data.test_groups_in_binary:
            adatok.data.actual_test_group_in_binary = i
            test_acc, test_loss = test_inference(args, global_model,
                                                 test_dataset)
            print("Resoults")
            print(epoch)
            print(adatok.data.actual_train_group_in_binary)
            print(adatok.data.actual_test_group_in_binary)
            print(test_acc)
            print(test_loss)
    '''