Example #1
def run(workers, models, save_path, train_data_list, test_data,
        iterations_epoch):
    workers_num = len(workers)
    print('Model received successfully!')
    optimizers_list = []
    if args.lr == 0.0:
        if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
            learning_rate = 0.1
        else:
            learning_rate = 0.01
    else:
        learning_rate = args.lr

    for i in workers:
        optimizer = MySGD(models[i].parameters(), lr=learning_rate)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 50
    else:
        decay_period = 1000

    print('Begin!')

    # store (train loss, energy, iterations)
    trainloss_file = './trainloss' + args.model + '.txt'
    if os.path.isfile(trainloss_file):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    log_file = args.model + 'log.txt'
    if os.path.isfile(log_file):
        os.remove(log_file)
    f_log = open(log_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i - 1]))

    epoch_train_loss = 0.0
    total_time = 0.0
    total_pulling_ratio = 0.0
    epoch_avg_pull_ratio = 0.0

    clock_epoch = 0
    test_loss = 0
    test_acc = 0
    for iteration in range(args.epochs * iterations_epoch):
        clock_epoch += 1
        iteration_loss = 0.0
        epoch = int((iteration + 1) / iterations_epoch)
        for i in workers:
            models[i].train()

        g_list = []
        for i in workers:
            try:
                data, target = next(train_data_iter_list[i - 1])
            except StopIteration:
                train_data_iter_list[i - 1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i - 1])
            data, target = Variable(data), Variable(target)
            optimizers_list[i - 1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizers_list[i - 1].get_delta_w()
            g_list.append(delta_ws)
            iteration_loss += loss.data.item() / workers_num
        epoch_train_loss += iteration_loss

        g_q_list = []
        for g in g_list:
            g_quantization, compression_ratio = quantization(g, args.bit)
            g_q_list.append(g_quantization)

        # synchronization step
        g_avg = []
        for p_idx, param in enumerate(models[0].parameters()):
            global_update_layer = torch.zeros_like(param.data)
            for w in workers:
                global_update_layer += g_q_list[w - 1][p_idx]
            tensor = global_update_layer / workers_num
            g_avg.append(tensor)
            param.data -= tensor
        pull_workers = 0
        pull_workers_list = pull_judge(workers_num, args.ratio)
        for w in workers:
            isPulling = w in pull_workers_list
            if isPulling:
                pull_workers += 1
            for p_idx, param in enumerate(models[0].parameters()):
                if isPulling:
                    list(models[w].parameters())[p_idx].data = param.data
                else:
                    list(models[w].parameters())[p_idx].data -= g_q_list[
                        w - 1][p_idx]

        print('Epoch {}, Loss: {}'.format(epoch, iteration_loss))
        total_pulling_ratio += pull_workers / workers_num
        epoch_avg_pull_ratio += pull_workers / workers_num
        f_log.write(
            str(args.this_rank) + "\t" + str(iteration_loss) + "\t" +
            str(epoch) + "\t" +
            str(pull_workers / workers_num) +  # the ratio of pulling workers
            "\t" + str(iteration) + "\t" + str(pull_workers_list) + '\n')
        f_log.flush()

        # train loss every epoch
        if iteration % iterations_epoch == 0:
            # run the test pass every other epoch
            if iteration % (2 * iterations_epoch) == 0:
                test_loss, test_acc = test_model(0,
                                                 models[0],
                                                 test_data,
                                                 criterion=criterion)
            f_trainloss.write(
                str(args.this_rank) + "\t" +
                str(epoch_train_loss / float(clock_epoch)) + "\t" +
                str(test_loss) + "\t" + str(test_acc) + "\t" +
                str(total_pulling_ratio)
                +  # accumulated pulling ratio of workers
                "\t" + str(epoch) + "\t" +
                str(epoch_avg_pull_ratio / clock_epoch)
                +  # the avg ratio of pulling workers in an epoch
                "\t" + str(iteration) + "\t" + str(total_time) +  # time
                '\n')
            f_trainloss.flush()
            epoch_train_loss = 0.0
            epoch_avg_pull_ratio = 0.0
            clock_epoch = 0
            for i in workers:
                if (epoch + 1) % decay_period == 0:
                    for param_group in optimizers_list[i - 1].param_groups:
                        param_group['lr'] *= 0.1
                        print('LR Decreased! Now: {}'.format(
                            param_group['lr']))

    f_log.close()
    f_trainloss.close()
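
Example #1 calls two helpers that this page does not show: quantization and pull_judge. The sketch below is only a guess at implementations consistent with the call sites above; the per-layer uniform min-max scheme, the bit/32 compression ratio, and the random worker selection are assumptions, not the original code.

import random

import torch


def quantization(g_list, bit):
    # ASSUMED implementation: snap each per-layer gradient tensor onto
    # 2**bit evenly spaced levels between its min and max, de-quantize
    # back to floats, and report bits kept per 32-bit float.
    levels = (1 << bit) - 1
    q_list = []
    for g in g_list:
        g_min = g.min()
        scale = (g.max() - g_min) / levels
        if scale.item() == 0.0:  # constant tensor: nothing to quantize
            q_list.append(g.clone())
        else:
            q_list.append(torch.round((g - g_min) / scale) * scale + g_min)
    return q_list, bit / 32.0


def pull_judge(workers_num, ratio):
    # ASSUMED implementation: pick round(ratio * workers_num) workers
    # (1-indexed) at random; only these pull the fresh global model.
    k = max(1, round(ratio * workers_num))
    return random.sample(range(1, workers_num + 1), k)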
Example #2
def run(workers, models, save_path, train_data_list, test_data,
        iterations_epoch):
    workers_num = len(workers)
    print('Model received successfully!')
    optimizers_list = []
    for i in workers:
        if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
            optimizer = MySGD(models[i].parameters(), lr=0.1)
        else:
            optimizer = MySGD(models[i].parameters(), lr=0.01)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 500
    else:
        decay_period = 1000000

    print('Begin!')

    global_g = [
        torch.zeros_like(param.data) for param in models[0].parameters()
    ]

    # store (train loss, energy, iterations)
    trainloss_file = './trainloss' + args.model + '_' + args.file_name + '_ec.txt'  # .txt file name
    if os.path.isfile(trainloss_file):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i - 1]))

    epoch_train_loss = 0.0
    global_clock = 0
    g_remain_list = []
    ratio = args.ratio
    threshold = 0.

    # compensation
    h_last_list = []  # h_{t}
    h_remain_list = []  # h_{t-1}
    alpha = args.alpha
    beta = args.beta
    print(alpha, " and ", beta)
    for iteration in range(args.epochs * iterations_epoch):
        iteration_loss = 0.0

        g_list = []
        g_change_average = [
            torch.zeros_like(param.data) for param in models[0].parameters()
        ]
        global_clock += 1
        for i in workers:
            try:
                data, target = next(train_data_iter_list[i - 1])
            except StopIteration:
                train_data_iter_list[i - 1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i - 1])
            data, target = Variable(data), Variable(target)
            optimizers_list[i - 1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizers_list[i - 1].get_delta_w()
            g_list.append(delta_ws)
            iteration_loss += loss.data.item() / workers_num

            if global_clock == 1:
                g_remain = [
                    torch.zeros_like(g_layer) + g_layer for g_layer in delta_ws
                ]
                g_remain_list.append(g_remain)

                h_remain = [torch.zeros_like(g_layer) for g_layer in delta_ws]
                h_remain_list.append(h_remain)

                h_last = [torch.zeros_like(g_layer) for g_layer in delta_ws]
                h_last_list.append(h_last)

                # placeholders until the first test pass runs
                test_loss = str(0)
                test_acc = str(0)
                # synchronous update
                # the gradient change in the first iteration is gradient itself
                for g_change_layer_idx, g_change_layer in enumerate(
                        g_change_average):
                    g_change_layer.data += delta_ws[
                        g_change_layer_idx].data / workers_num
                sparsification_ratio = 1.0
            else:
                # copy delta_ws so compensation does not modify it in place
                new_delta_ws = [
                    torch.zeros_like(g_layer) + g_layer for g_layer in delta_ws
                ]
                for idx, g_layer in enumerate(delta_ws):
                    # add the compensation term alpha * (h_t - h_{t-1})
                    new_delta_ws[idx] += alpha * (h_last_list[i - 1][idx] -
                                                  h_remain_list[i - 1][idx])
                print(ratio)
                g_remain, g_large_change, sparsification_ratio = get_upload(
                    g_remain_list[i - 1], new_delta_ws, ratio,
                    args.isCompensate, threshold)
                g_remain_list[i - 1] = g_remain
                # synchronous update
                for g_change_layer_idx, g_change_layer in enumerate(
                        g_change_average):
                    g_change_layer.data += g_large_change[
                        g_change_layer_idx].data / workers_num

                # update h
                h_last = [torch.zeros_like(g_layer) for g_layer in delta_ws]
                h_remain = h_last_list[i - 1]
                for idx, g_layer in enumerate(delta_ws):
                    h_last[idx] = h_remain[idx] * beta
                    if args.add == 1:
                        h_last[idx] += (delta_ws[idx] - g_remain[idx])
                    else:
                        h_last[idx] -= (delta_ws[idx] - g_remain[idx])
                h_remain_list[i - 1] = h_remain
                h_last_list[i - 1] = h_last

        # synchronization step
        g_square_sum = 0.0  # squared L2 norm of the update, for the threshold
        for p_idx, param in enumerate(models[0].parameters()):
            global_g[p_idx].data += g_change_average[p_idx].data
            param.data -= global_g[p_idx].data
            for w in workers:
                list(models[w].parameters()
                     )[p_idx].data = param.data + torch.zeros_like(param.data)

            g_square_sum += torch.sum(global_g[p_idx].data *
                                      global_g[p_idx].data)

        g_square_sum = torch.sqrt(g_square_sum)
        threshold = g_square_sum.data.item()

        epoch_train_loss += iteration_loss
        epoch = int(iteration / iterations_epoch)
        # print('Epoch {}, Loss:{}'.format(epoch, loss.data.item()))
        if True:  # log every iteration
            if (iteration + 1) % iterations_epoch == 0:
                # run the test pass at the end of each epoch
                test_loss, test_acc = test_model(0,
                                                 models[0],
                                                 test_data,
                                                 criterion=criterion)
                epoch_train_loss = 0.0
            f_trainloss.write(
                str(args.this_rank) + "\t" +
                str(epoch_train_loss / float(iterations_epoch)) + "\t" +
                str(iteration_loss) + "\t" + str(0) + "\t" + str(epoch) +
                "\t" + str(0) + "\t" + str(iteration) + "\t" +
                str(sparsification_ratio) +  # sparsification ratio
                "\t" + str(global_clock) +  # global clock
                "\t" + str(test_loss) +  # test_loss
                "\t" + str(test_acc) +  # test_acc
                '\n')
            f_trainloss.flush()

            # reduce the upload ratio at the specified epochs; note that
            # (epoch + 1) is never 0, so only the 1000 entry can fire
            if (epoch + 1) in [0, 1000]:
                ratio = ratio * 0.1
                print('--------------------------------')
                print(ratio)

            for i in workers:
                models[i].train()
                if (epoch + 1) % decay_period == 0:
                    for param_group in optimizers_list[i - 1].param_groups:
                        param_group['lr'] *= 0.1
                        print('LR Decreased! Now: {}'.format(
                            param_group['lr']))

    f_trainloss.close()
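
Example #2 (and Example #3 below) also calls get_upload, which is not defined on this page. Here is a minimal sketch, assuming magnitude-based top-k sparsification with a locally kept residual; the isCompensate and threshold arguments are accepted only for signature parity and ignored, which the real implementation presumably does not do.

import torch


def get_upload(g_remain, delta_ws, ratio, is_compensate, threshold):
    # ASSUMED implementation: accumulate the new update into the local
    # residual, upload only the largest-magnitude fraction `ratio` of
    # entries per layer, and keep the rest for later rounds.
    g_large_change = []
    uploaded, total = 0, 0
    for idx, g_layer in enumerate(delta_ws):
        acc = g_remain[idx] + g_layer  # residual accumulation
        k = max(1, int(ratio * acc.numel()))
        cutoff = torch.topk(acc.abs().flatten(), k).values[-1]
        mask = acc.abs() >= cutoff  # keep only the largest entries
        change = torch.where(mask, acc, torch.zeros_like(acc))
        g_large_change.append(change)  # the part that gets uploaded
        g_remain[idx] = acc - change  # the residue stays local
        uploaded += int(mask.sum())
        total += mask.numel()
    return g_remain, g_large_change, uploaded / total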
Example #3
def run(workers, models, save_path, train_data_list, test_data,
        iterations_epoch):
    workers_num = len(workers)
    print('Model received successfully!')
    optimizers_list = []
    for i in workers:
        if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
            optimizer = MySGD(models[i].parameters(), lr=0.1)
        else:
            optimizer = MySGD(models[i].parameters(), lr=0.01)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 500
    else:
        decay_period = 1000000

    print('Begin!')

    global_g = [
        torch.zeros_like(param.data) for param in models[0].parameters()
    ]

    # store (train loss, energy, iterations)
    trainloss_file = './trainloss_oldsimu' + args.model + '_w15r1lr0.1.txt'
    if os.path.isfile(trainloss_file):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i - 1]))

    epoch_train_loss = 0.0
    global_clock = 0
    g_remain_list = []
    ratio = args.ratio
    threshold = 0.
    print("Begin for")
    for iteration in range(args.epochs * iterations_epoch):
        iteration_loss = 0.0

        g_list = []
        g_change_average = [
            torch.zeros_like(param.data) for param in models[0].parameters()
        ]
        global_clock += 1
        for i in workers:  # local training step on each worker
            try:
                data, target = next(train_data_iter_list[i - 1])
            except StopIteration:
                train_data_iter_list[i - 1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i - 1])
            data, target = Variable(data), Variable(target)
            optimizers_list[i - 1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizers_list[i - 1].get_delta_w()  # lr * grad per layer
            g_list.append(delta_ws)
            iteration_loss += loss.data.item() / workers_num

            if global_clock == 1:  # first round: initialize the residuals
                g_remain = [
                    torch.zeros_like(g_layer) + g_layer for g_layer in delta_ws
                ]
                g_remain_list.append(g_remain)

                # placeholders until the first test pass runs
                test_loss = str(0)
                test_acc = str(0)

                # synchronous update
                # the gradient change in the first iteration is gradient itself
                for g_change_layer_idx, g_change_layer in enumerate(
                        g_change_average):
                    g_change_layer.data += delta_ws[
                        g_change_layer_idx].data / workers_num
                sparsification_ratio = 1.0
            else:
                g_remain, g_large_change, sparsification_ratio = get_upload(
                    g_remain_list[i - 1], delta_ws, ratio, args.isCompensate,
                    threshold)
                g_remain_list[i - 1] = g_remain  # update this worker's residual
                # synchronous update
                for g_change_layer_idx, g_change_layer in enumerate(
                        g_change_average):
                    g_change_layer.data += g_large_change[
                        g_change_layer_idx].data / workers_num

        # synchronization step
        g_square_sum = 0.0  # squared L2 norm of the update, for the threshold
        for p_idx, param in enumerate(models[0].parameters()):
            # accumulate the averaged sparse update into the global update
            global_g[p_idx].data += g_change_average[p_idx].data
            param.data -= global_g[p_idx].data
            for w in workers:
                # push the fresh global weights back to every worker
                list(models[w].parameters())[p_idx].data = \
                    param.data + torch.zeros_like(param.data)

            g_square_sum += torch.sum(global_g[p_idx].data *
                                      global_g[p_idx].data)

        g_square_sum = torch.sqrt(g_square_sum)
        threshold = g_square_sum.data.item()

        epoch_train_loss += iteration_loss
        epoch = int(iteration / iterations_epoch)
        print('Epoch {}, Loss: {}'.format(epoch, iteration_loss))
        if True:  # log every iteration
            if (iteration + 1) % iterations_epoch == 0:
                # run the test pass at the end of each epoch
                test_loss, test_acc = test_model(0,
                                                 models[0],
                                                 test_data,
                                                 criterion=criterion)
                epoch_train_loss = 0.0
            f_trainloss.write(
                str(args.this_rank) + "\t" +
                str(epoch_train_loss / float(iterations_epoch)) + "\t" +
                str(iteration_loss) + "\t" + str(0) + "\t" + str(epoch) +
                "\t" + str(0) + "\t" + str(iteration) + "\t" +
                str(sparsification_ratio) +  # sparsification ratio
                "\t" + str(global_clock) +  # global clock
                "\t" + str(test_loss) +  # test_loss
                "\t" + str(test_acc) +  # test_acc
                '\n')
            f_trainloss.flush()
            # reduce the upload ratio at the specified epochs; note that
            # (epoch + 1) is never 0, so only the 1000 entry can fire
            if (epoch + 1) in [0, 1000]:
                ratio = ratio * 0.1
                print('--------------------------------')
                print(ratio)

            for i in workers:
                models[i].train()
                if (epoch + 1) % decay_period == 0:
                    for param_group in optimizers_list[i - 1].param_groups:
                        param_group['lr'] *= 0.1
                        print('LR Decreased! Now: {}'.format(
                            param_group['lr']))

    f_trainloss.close()
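
All three examples depend on MySGD.get_delta_w and test_model, neither of which appears on this page. The sketches below are assumptions that merely match the call sites: get_delta_w hands back the would-be update lr * grad per layer without stepping, so the examples can aggregate updates and write parameters themselves, and test_model returns mean loss and top-1 accuracy.

import torch
from torch.optim import SGD


class MySGD(SGD):
    # ASSUMED sketch: plain SGD plus a helper that returns the would-be
    # update lr * grad per layer instead of applying it.
    def get_delta_w(self):
        delta_ws = []
        for group in self.param_groups:
            for param in group['params']:
                if param.grad is not None:
                    delta_ws.append(group['lr'] * param.grad.data.clone())
        return delta_ws


def test_model(rank, model, test_data, criterion):
    # ASSUMED sketch: mean loss and top-1 accuracy over the test loader;
    # the rank argument is unused here.
    model.eval()
    total_loss, correct, count = 0.0, 0, 0
    with torch.no_grad():
        for data, target in test_data:
            output = model(data)
            total_loss += criterion(output, target).item() * target.size(0)
            correct += output.argmax(dim=1).eq(target).sum().item()
            count += target.size(0)
    return total_loss / count, correct / count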