Example #1
0
def run(workers, models, save_path, train_data_list, test_data, iterations_epoch):
    workers_num = len(workers)
    print('Model recved successfully!')
    optimizers_list = []
    for i in workers:
        if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
            optimizer = MySGD(models[i].parameters(), lr=0.1)
        else:
            optimizer = MySGD(models[i].parameters(), lr=0.01)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 500
    else:
        decay_period = 1000000

    print('Begin!')

    global_g = [torch.zeros_like(param.data) for param in models[0].parameters()]

    # store (train loss, energy, iterations)
    trainloss_file = './trainloss' + args.model + '_' + args.file_name + '_ec.txt'
    if(os.path.isfile(trainloss_file)):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i-1]))

    epoch_train_loss = 0.0
    global_clock = 0
    g_remain_list = []
    ratio = args.ratio
    threshold = 0.

    # compensation
    h_last_list = []  # h_t
    h_remain_list = []  # h_t - 1
    alpha = args.alpha
    beta = args.beta
    print(alpha, " and ", beta)
    for iteration in range(args.epochs * iterations_epoch):
        iteration_loss = 0.0

        g_list = []
        g_change_average = [torch.zeros_like(param.data) for param in models[0].parameters()]
        global_clock += 1
        for i in workers:
            try:
                data, target = next(train_data_iter_list[i-1])
            except StopIteration:
                train_data_iter_list[i-1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i-1])
            data, target = Variable(data), Variable(target)
            optimizers_list[i-1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizers_list[i-1].get_delta_w()
            g_list.append(delta_ws)
            iteration_loss += loss.data.item()/workers_num

            if global_clock == 1:
                g_remain = [torch.zeros_like(g_layer)+g_layer for g_layer in delta_ws]
                g_remain_list.append(g_remain)

                h_remain = [torch.zeros_like(g_layer) for g_layer in delta_ws]
                h_remain_list.append(h_remain)

                h_last = [torch.zeros_like(g_layer) for g_layer in delta_ws]
                h_last_list.append(h_last)
                # synchronous update
                # the gradient change in the first iteration is gradient itself
                for g_change_layer_idx, g_change_layer in enumerate(g_change_average):
                    g_change_layer.data += delta_ws[g_change_layer_idx].data/workers_num
                sparsification_ratio = 1.0
            else:

                new_delta_ws = [torch.zeros_like(g_layer)+g_layer for g_layer in delta_ws]
                for idx, g_layer in enumerate(delta_ws):
                    # print(new_delta_ws[idx], " and ", alpha * (h_last_list[i-1][idx] - h_remain_list[i-1][idx]))
                    new_delta_ws[idx] += alpha * (h_last_list[i-1][idx] - h_remain_list[i-1][idx])
                print(ratio)
                g_remain, g_large_change, sparsification_ratio = get_upload(g_remain_list[i-1],new_delta_ws, ratio, args.isCompensate, threshold)
                g_remain_list[i-1] = g_remain
                # synchronous update
                for g_change_layer_idx, g_change_layer in enumerate(g_change_average):
                    g_change_layer.data += g_large_change[g_change_layer_idx].data/workers_num
                
                # update h
                h_last = [torch.zeros_like(g_layer) for g_layer in delta_ws]
                h_remain = h_last_list[i - 1]
                for idx, g_layer in enumerate(delta_ws):
                    h_last[idx] = h_remain[idx] * beta
                    if args.add == 1:
                        h_last[idx] += (delta_ws[idx] - g_remain[idx])
                    else:
                        h_last[idx] -= (delta_ws[idx] - g_remain[idx])
                h_remain_list[i - 1] = h_remain
                h_last_list[i - 1] = h_last
            
        # Synchronization
        g_quare_sum = 0.0   # for threshold
        for p_idx, param in enumerate(models[0].parameters()):
            global_g[p_idx].data += g_change_average[p_idx].data
            param.data -= global_g[p_idx].data
            for w in workers:
                list(models[w].parameters())[p_idx].data =  param.data + torch.zeros_like(param.data)

            g_quare_sum += torch.sum(global_g[p_idx].data * global_g[p_idx].data)

        g_quare_sum = torch.sqrt(g_quare_sum)
        threshold = g_quare_sum.data.item()

        epoch_train_loss += iteration_loss
        epoch = int(iteration / iterations_epoch)
        # print('Epoch {}, Loss:{}'.format(epoch, loss.data.item()))
        if (iteration+1) % iterations_epoch == 0:
            # test after this epoch of training
            # test_loss, test_acc = test_model(0, model, test_data, criterion=criterion)
            f_trainloss.write(str(args.this_rank) +
                              "\t" + str(epoch_train_loss / float(iterations_epoch)) +
                              "\t" + str(iteration_loss) +
                              "\t" + str(0) +
                              "\t" + str(epoch) +
                              "\t" + str(0) +
                              "\t" + str(iteration) +
                              "\t" + str(sparsification_ratio) +        # sparsification ratio
                              "\t" + str(global_clock) +        # global clock
                              '\n')
            f_trainloss.flush()
            epoch_train_loss = 0.0
            # Reduce the scaling factor (ratio) at the specified epochs (iterations)
            if (epoch + 1) in [0, 1000]:
                ratio = ratio * 0.1
                print('--------------------------------')
                print(ratio)

            for i in workers:
                models[i].train()
                if (epoch + 1) % decay_period == 0:
                    for param_group in optimizers_list[i - 1].param_groups:
                        param_group['lr'] *= 0.1
                        print('LR Decreased! Now: {}'.format(param_group['lr']))

    f_trainloss.close()
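
# --- Hedged sketch (not part of the example above) ---------------------------
# `get_upload(g_remain, delta_ws, ratio, isCompensate, threshold)` is called in
# the loop above but never defined in this section. The helper below is only a
# plausible top-k sparsification stand-in with the same return shape
# (g_remain, g_large_change, sparsification_ratio); the real implementation,
# including how `threshold` is used, may differ.
import torch

def get_upload_sketch(g_remain, delta_ws, ratio, is_compensate, threshold):
    # add the new update onto the residual kept from previous rounds
    accumulated = [r + d for r, d in zip(g_remain, delta_ws)]
    flat = torch.cat([layer.reshape(-1) for layer in accumulated])
    k = max(1, int(flat.numel() * ratio))
    kth = torch.topk(flat.abs(), k).values[-1]      # magnitude of the k-th largest entry
    g_large_change, new_remain, uploaded = [], [], 0
    for layer in accumulated:
        mask = layer.abs() >= kth                   # coordinates that are uploaded
        g_large_change.append(layer * mask)
        # with compensation, the small coordinates stay in the local residual
        new_remain.append(layer * (~mask) if is_compensate else torch.zeros_like(layer))
        uploaded += int(mask.sum().item())
    sparsification_ratio = uploaded / flat.numel()  # `threshold` is ignored in this sketch
    return new_remain, g_large_change, sparsification_ratio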
Example #2
0
def run(rank, model, train_data, test_data, queue, param_q, stop_flag):
    # Get the initial model from the server
    while True:
        if not param_q.empty():
            param_dict = param_q.get()
            tmp = OrderedDict(
                map(lambda item: (item[0], torch.from_numpy(item[1])),
                    param_dict.items()))
            model.load_state_dict(tmp)
            break
    print('Model recved successfully!')

    if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
        optimizer = MySGD(model.parameters(), lr=0.1)
    else:
        optimizer = MySGD(model.parameters(), lr=0.01)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 50
    else:
        decay_period = 100
    print('Begin!')

    time_logs = open("./record" + str(rank), 'w')

    for epoch in range(int(args.epochs)):
        model.train()
        # Decay the learning at the specific epoch
        #if args.model == 'AlexNet':
        if (epoch + 1) % decay_period == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.1
                print('LR Decreased! Now: {}'.format(param_group['lr']))
        epoch_train_loss = 0
        for batch_idx, (data, target) in enumerate(train_data):
            it_start = time.time()
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizer.get_delta_w()

            it_comp_end = time.time()
            # noinspection PyBroadException
            try:
                if delta_ws:
                    queue.put({
                        rank:
                        [loss.data.numpy(),
                         np.array(args.train_bsz), False]
                    })

                for delta in delta_ws:
                    dist.send(tensor=delta, dst=0)

                for idx, param in enumerate(model.parameters()):
                    tmp_tensor = torch.zeros_like(param.data)
                    dist.recv(tensor=tmp_tensor, src=0)
                    param.data = tmp_tensor

                #print('Rank {}, Epoch {}, Batch {}/{}, Loss:{}'
                #     .format(rank, epoch, batch_idx, len(train_data), loss.data[0]))
            except Exception as e:
                print(str(e))
                print('Should Stop: {}!'.format(stop_flag.value))
                break

            it_comm_end = time.time()
            it_duration = it_comm_end - it_start
            it_comp_duration = it_comp_end - it_start
            it_comm_duration = it_comm_end - it_comp_end
            time_logs.write(
                str(it_duration) + "\t" + str(it_comp_duration) + "\t" +
                str(it_comm_duration) + "\n")
            time_logs.flush()

        # test the model
        print("test Model:", epoch)
        # test_model(rank, model, test_data, criterion=criterion)
        if stop_flag.value:
            break
    queue.put({rank: [[], [], True]})
    time_logs.close()
    print("Worker {} has completed epoch {}!".format(args.this_rank, epoch))
Example #3
0
def run(rank, workers, model, save_path, train_data, test_data):
    # Get the initial model parameters sent from the parameter server
    _group = [w for w in workers] + [0]
    group = dist.new_group(_group)

    param_num = 0
    for p in model.parameters():
        tmp_p = torch.zeros_like(p)
        param_num += torch.numel(tmp_p)
        dist.scatter(tensor=tmp_p, src=0, group=group)
        p.data = tmp_p
    print('Model recved successfully!')

    compression_num = int(param_num * args.ratio)
    compression_num = compression_num if compression_num > 0 else 1
    dist.gather(torch.tensor([compression_num / param_num]),
                dst=0,
                group=group)

    if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
        learning_rate = 0.1
    else:
        learning_rate = args.lr
    optimizer = MySGD(model.parameters(), lr=learning_rate)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    elif args.model in ['Abalone', 'Bodyfat', 'Housing']:
        criterion = torch.nn.MSELoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 50
    elif args.model in [
            'LROnMnist', 'LROnCifar10', 'LROnCifar100', 'Abalone', 'Bodyfat',
            'Housing'
    ]:
        decay_period = 1000000  # learning rate is constant for LR (convex) models
    else:
        decay_period = 100

    print('Begin!')

    global_clock = 0
    g_remain = [torch.zeros_like(param.data) for param in model.parameters()]
    time_logs = open("./record" + str(rank), 'w')
    for epoch in range(args.epochs):
        batch_interval = 0.0
        batch_comp_interval = 0.0
        batch_comm_interval = 0.0
        s_time = time.time()
        model.train()

        # AlexNet: decay the learning rate LR at the specified epochs
        #if args.model == 'AlexNet':
        if (epoch + 1) % decay_period == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.1
                print('LR Decreased! Now: {}'.format(param_group['lr']))

        epoch_train_loss = 0
        for batch_idx, (data, target) in enumerate(train_data):
            batch_start_time = time.time()
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizer.get_delta_w()

            g_remain, g_large_change = get_upload(g_remain, delta_ws,
                                                  args.ratio,
                                                  args.isCompensate)

            batch_comp_time = time.time()
            # Synchronization
            # send epoch train loss firstly
            dist.gather(loss.data, dst=0, group=group)
            for idx, param in enumerate(model.parameters()):
                dist.gather(tensor=g_large_change[idx], dst=0, group=group)
                recv = torch.zeros_like(delta_ws[idx])
                dist.scatter(tensor=recv, src=0, group=group)
                param.data = recv

            epoch_train_loss += loss.data.item()
            batch_end_time = time.time()

            batch_interval += batch_end_time - batch_start_time
            batch_comp_interval += batch_comp_time - batch_start_time
            batch_comm_interval += batch_end_time - batch_comp_time

            logs = torch.tensor([
                0.0, batch_interval / (batch_idx + 1),
                batch_comp_interval / (batch_idx + 1),
                batch_comm_interval / (batch_idx + 1)
            ])
            time_logs.write(str(logs) + '\n')
            time_logs.flush()

        print('Rank {}, Epoch {}, Loss:{}'.format(rank, epoch,
                                                  loss.data.item()))

        e_time = time.time()
        #epoch_train_loss /= len(train_data)
        #epoch_train_loss = format(epoch_train_loss, '.4f')
        # test after training
        #test_loss, acc = test_model(rank, model, test_data, criterion=criterion)
        acc = 0.0
        batch_interval /= batch_idx + 1
        batch_comp_interval /= batch_idx + 1
        batch_comm_interval /= batch_idx + 1
        logs = torch.tensor(
            [acc, batch_interval, batch_comp_interval, batch_comm_interval])
        time_logs.write(str(logs) + '\n')
        time_logs.flush()
        #dist.gather(tensor=logs, dst = 0, group = group)
    time_logs.close()
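
# --- Hedged sketch (not part of the example above) ---------------------------
# The gather/scatter calls issued by the worker above must be matched by the
# parameter server (rank 0), whose code is not included in this section. The
# outline below only illustrates a matching per-batch server step; every detail
# (tensor shapes, averaging rule) is an assumption.
import torch
import torch.distributed as dist

def ps_batch_step_sketch(model, group, group_size):
    # gather one loss value per group member (the server contributes a zero)
    losses = [torch.tensor(0.0) for _ in range(group_size)]
    dist.gather(torch.tensor(0.0), gather_list=losses, dst=0, group=group)
    for param in model.parameters():
        recv = [torch.zeros_like(param.data) for _ in range(group_size)]
        dist.gather(torch.zeros_like(param.data), gather_list=recv, dst=0, group=group)
        # the server's own slot is zero, so dividing by (group_size - 1) averages the workers
        param.data -= sum(recv) / (group_size - 1)
        # send the updated parameter back to every worker via scatter
        dist.scatter(param.data,
                     scatter_list=[param.data.clone() for _ in range(group_size)],
                     src=0, group=group)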
Example #4
0
def run(rank, workers, model, save_path, train_data, test_data):
    # Get the initial model from the server
    _group = [w for w in workers] + [0]
    group = dist.new_group(_group)

    for p in model.parameters():
        tmp_p = torch.zeros_like(p)
        dist.scatter(tensor=tmp_p, src=0, group=group)
        p.data = tmp_p
    print('Model recved successfully!')

    if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
        optimizer = MySGD(model.parameters(), lr=0.1)
    else:
        optimizer = MySGD(model.parameters(), lr=0.01)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 50
    else:
        decay_period = 100

    print('Begin!')

    time_logs = open("./record" + str(rank), 'w')
    for epoch in range(args.epochs):
        batch_interval = 0.0
        batch_comp_interval = 0.0
        batch_comm_interval = 0.0
        s_time = time.time()
        model.train()

        # Reduce the learning rate LR in some specific epochs
        #if args.model == 'AlexNet':
        if (epoch + 1) % decay_period == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.1
                print('LR Decreased! Now: {}'.format(param_group['lr']))

        epoch_train_loss = 0
        for batch_idx, (data, target) in enumerate(train_data):
            batch_start_time = time.time()
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizer.get_delta_w()

            batch_comp_time = time.time()
            # Synchronization
            # send epoch train loss firstly
            dist.gather(loss.data, dst=0, group=group)
            for idx, param in enumerate(model.parameters()):
                dist.gather(tensor=delta_ws[idx], dst=0, group=group)
                recv = torch.zeros_like(delta_ws[idx])
                dist.scatter(tensor=recv, src=0, group=group)
                param.data = recv

            epoch_train_loss += loss.data.item()
            batch_end_time = time.time()

            batch_interval += batch_end_time - batch_start_time
            batch_comp_interval += batch_comp_time - batch_start_time
            batch_comm_interval += batch_end_time - batch_comp_time

            logs = torch.tensor([
                0.0, batch_interval / (batch_idx + 1),
                batch_comp_interval / (batch_idx + 1),
                batch_comm_interval / (batch_idx + 1)
            ])
            time_logs.write(str(logs) + '\n')
            time_logs.flush()

        print('Rank {}, Epoch {}, Loss:{}'.format(rank, epoch,
                                                  loss.data.item()))

        e_time = time.time()
        #epoch_train_loss /= len(train_data)
        #epoch_train_loss = format(epoch_train_loss, '.4f')
        # test the model
        #test_loss, acc = test_model(rank, model, test_data, criterion=criterion)
        acc = 0.0
        batch_interval /= batch_idx + 1
        batch_comp_interval /= batch_idx + 1
        batch_comm_interval /= batch_idx + 1
        logs = torch.tensor(
            [acc, batch_interval, batch_comp_interval, batch_comm_interval])
        time_logs.write(str(logs) + '\n')
        time_logs.flush()
        #dist.gather(tensor=logs, dst = 0, group = group)
    time_logs.close()
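
# --- Hedged sketch (not part of the example above) ---------------------------
# The run() workers in this file assume that torch.distributed is already
# initialised and that rank 0 acts as the parameter server. A hypothetical
# per-process entry point could look like the following; the backend, address,
# port and save path are placeholders, not values from the original code.
import torch.distributed as dist

def worker_entry_sketch(rank, world_size, model, train_data, test_data):
    dist.init_process_group(backend='gloo',
                            init_method='tcp://127.0.0.1:29500',
                            rank=rank,
                            world_size=world_size)
    workers = list(range(1, world_size))  # every rank except the server
    run(rank, workers, model, './save', train_data, test_data)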
Example #5
0
def run(workers, models, save_path, train_data_list, test_data,
        iterations_epoch):
    workers_num = len(workers)
    print('Model recved successfully!')
    optimizers_list = []
    if args.lr == 0.0:
        if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
            learning_rate = 0.1
        else:
            learning_rate = 0.01
    else:
        learning_rate = args.lr

    for i in workers:
        optimizer = MySGD(models[i].parameters(), lr=learning_rate)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 50
    else:
        decay_period = 1000

    print('Begin!')

    # store (train loss, energy, iterations)
    trainloss_file = './trainloss' + args.model + '.txt'
    if (os.path.isfile(trainloss_file)):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    log_file = args.model + 'log.txt'
    if (os.path.isfile(log_file)):
        os.remove(log_file)
    f_log = open(log_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i - 1]))

    epoch_train_loss = 0.0
    total_time = 0.0
    total_pulling_ratio = 0.0
    epoch_avg_pull_ratio = 0.0

    clock_epoch = 0
    test_loss = 0
    test_acc = 0
    for iteration in range(args.epochs * iterations_epoch):
        clock_epoch += 1
        iteration_loss = 0.0
        epoch = int((iteration + 1) / iterations_epoch)
        for i in workers:
            models[i].train()

        g_list = []
        for i in workers:
            try:
                data, target = next(train_data_iter_list[i - 1])
            except StopIteration:
                train_data_iter_list[i - 1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i - 1])
            data, target = Variable(data), Variable(target)
            optimizers_list[i - 1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizers_list[i - 1].get_delta_w()
            g_list.append(delta_ws)
            iteration_loss += loss.data.item() / workers_num
        epoch_train_loss += iteration_loss

        g_q_list = []
        for g in g_list:
            g_quantization, compression_ratio = quantization(g, args.bit)
            g_q_list.append(g_quantization)

        # Synchronization
        g_avg = []
        for p_idx, param in enumerate(models[0].parameters()):
            global_update_layer = torch.zeros_like(param.data)
            for w in workers:
                global_update_layer += g_q_list[w - 1][p_idx]
            tensor = global_update_layer / workers_num
            g_avg.append(tensor)
            param.data -= tensor
        pull_workers = 0
        pull_workers_list = pull_judge(workers_num, args.ratio)
        for w in workers:
            isPulling = w in pull_workers_list
            if isPulling:
                pull_workers += 1
            for p_idx, param in enumerate(models[0].parameters()):
                if isPulling:
                    list(models[w].parameters())[p_idx].data = param.data
                else:
                    list(models[w].parameters())[p_idx].data -= g_q_list[
                        w - 1][p_idx]

        print('Epoch {}, Loss:{}'.format(epoch, loss.data.item()))
        total_pulling_ratio += pull_workers / workers_num
        epoch_avg_pull_ratio += pull_workers / workers_num
        f_log.write(
            str(args.this_rank) + "\t" + str(iteration_loss) + "\t" +
            str(epoch) + "\t" +
            str(pull_workers / workers_num) +  # the ratio of pulling workers
            "\t" + str(iteration) + "\t" + str(pull_workers_list) + '\n')
        f_log.flush()

        # train loss every epoch
        if iteration % iterations_epoch == 0:
            # test after this epoch of training
            if iteration % (2 * iterations_epoch) == 0:
                test_loss, test_acc = test_model(0,
                                                 models[0],
                                                 test_data,
                                                 criterion=criterion)
            f_trainloss.write(
                str(args.this_rank) + "\t" +
                str(epoch_train_loss / float(clock_epoch)) + "\t" +
                str(test_loss) + "\t" + str(test_acc) + "\t" +
                str(total_pulling_ratio)
                +  # accumulated pulling ratio of workers
                "\t" + str(epoch) + "\t" +
                str(epoch_avg_pull_ratio / clock_epoch)
                +  # the avg ratio of pulling workers in an epoch
                "\t" + str(iteration) + "\t" + str(total_time) +  # time
                '\n')
            f_trainloss.flush()
            epoch_train_loss = 0.0
            epoch_avg_pull_ratio = 0.0
            clock_epoch = 0
            for i in workers:
                if (epoch + 1) % decay_period == 0:
                    for param_group in optimizers_list[i - 1].param_groups:
                        param_group['lr'] *= 0.1
                        print('LR Decreased! Now: {}'.format(
                            param_group['lr']))

    f_log.close()
    f_trainloss.close()
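
# --- Hedged sketch (not part of the example above) ---------------------------
# `quantization(g, args.bit)` and `pull_judge(workers_num, args.ratio)` are
# used above but defined elsewhere. The stand-ins below only mirror the call
# signatures: per-layer uniform b-bit quantization returning the quantized
# gradients plus a compression ratio, and a random choice of which workers
# pull the global model. The originals may behave differently.
import random
import torch

def quantization_sketch(g_layers, bit):
    levels = 2 ** bit - 1
    g_quantized = []
    for layer in g_layers:
        scale = layer.abs().max()
        if scale == 0:
            g_quantized.append(torch.zeros_like(layer))
        else:
            # round each coordinate to a uniform grid of step scale / levels
            g_quantized.append(torch.round(layer / scale * levels) / levels * scale)
    compression_ratio = bit / 32.0  # bits kept per 32-bit float coordinate
    return g_quantized, compression_ratio

def pull_judge_sketch(workers_num, ratio):
    # select at least one worker, 1-indexed like the `workers` list above
    k = max(1, int(workers_num * ratio))
    return random.sample(range(1, workers_num + 1), k)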
Example #6
0
def run(rank, workers, model, save_path, train_data, test_data, global_lr):
    # Get the initial model from the server
    print(workers)

    _group = [w for w in workers] + [0]
    group = dist.new_group(_group)

    for p in model.parameters():
        tmp_p = torch.zeros_like(p)
        dist.scatter(tensor=tmp_p, src=0, group=group)
        p.data = tmp_p
    print('Model recved successfully!')

    temp_lr = global_lr.get()

    optimizer = MySGD(model.parameters(), lr=temp_lr)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    print('Begin!')

    # the parameters that will be transferred to the thread
    model_cache = [p.data + 0.0 for p in model.parameters()]
    global_update = [torch.zeros_like(p) for p in model.parameters()]
    local_update = [torch.zeros_like(p) for p in model.parameters()]
    it_count = Value(c_float,
                     0.)  # count update times in an iteration by local worker
    data_lock = Lock()
    update_lock = Queue()
    update_lock.put(1)

    loss_t = torch.tensor(0.0)
    receive_end = Value(c_bool, False)
    batch_communication_interval = Value(c_float, 0.0)
    stale_in_iteration = Value(c_float, 0.)

    sender_td = Thread(target=sender,
                       args=(
                           model_cache,
                           global_update,
                           local_update,
                           it_count,
                           loss_t,
                           update_lock,
                           data_lock,
                           group,
                           receive_end,
                           batch_communication_interval,
                           stale_in_iteration,
                       ),
                       daemon=True)
    sender_td.start()

    time_logs = open("./record" + str(rank), 'w')
    osp_logs = open("./log" + str(rank), 'w')
    Stale_Threshold = args.stale_threshold
    for epoch in range(args.epochs):
        batch_interval = 0.0
        batch_comp_interval = 0.0
        s_time = time.time()
        model.train()

        # Decay the learning at the specific epoch
        # learning rate should be decreased on server due to unmatched updating speed between local worker and server
        if not global_lr.empty():
            g_lr = global_lr.get()
            if args.model == 'AlexNet':
                for param_group in optimizer.param_groups:
                    param_group['lr'] = g_lr
                    print('LR Decreased! Now: {}'.format(param_group['lr']))

        for batch_idx, (data, target) in enumerate(train_data):
            batch_start_time = time.time()
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizer.get_delta_w()

            optimizer.step()

            # Aggregate local update
            data_lock.acquire()
            # aggregate loss
            loss_t.data += loss.data
            it_count.value += 1
            for g_idx, update in enumerate(local_update):
                update.data += delta_ws[g_idx].data
            data_lock.release()

            batch_computation_time = time.time()

            # Open the lock once the local update has at least one gradient
            if it_count.value == 1:
                update_lock.put(1)
            while it_count.value >= Stale_Threshold:
                pass

            if receive_end.value:
                receive_end.value = False
                for idx, param in enumerate(model.parameters()):
                    param.data = model_cache[idx]  # without local update
                    # param.data = model_cache[idx] - global_update[idx] # with local update

            batch_end_time = time.time()
            batch_interval += batch_end_time - batch_start_time
            batch_comp_interval += batch_computation_time - batch_start_time
            osp_logs.write(
                str(batch_end_time - batch_start_time) + "\t" +
                str(batch_computation_time - batch_start_time) + "\n")
            osp_logs.flush()

        print('Rank {}, Epoch {}, Loss:{}'.format(rank, epoch,
                                                  loss.data.item()))

        e_time = time.time()
        # test after training
        #test_loss, acc = test_model(rank, model, test_data, criterion=criterion)
        acc = 0.0
        batch_interval /= batch_idx + 1
        batch_comp_interval /= batch_idx + 1
        logs = torch.tensor([
            acc, batch_interval, batch_comp_interval,
            batch_communication_interval.value, stale_in_iteration.value
        ])
        time_logs.write(str(logs) + '\n')
        time_logs.flush()
        # dist.gather(tensor=logs, dst = 0, group = group)
    time_logs.close()
    sender_td.join()
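
# --- Hedged sketch (not part of the example above) ---------------------------
# The background `sender` thread is started above but not defined in this
# section. The outline below only illustrates the role implied by the training
# loop: ship the accumulated local update to the server, pull the fresh global
# model into `model_cache`, and raise `receive_end`. The message layout and the
# use of dist.send/recv are assumptions (group and global_update are unused here).
import time
import torch.distributed as dist

def sender_sketch(model_cache, global_update, local_update, it_count, loss_t,
                  update_lock, data_lock, group, receive_end,
                  batch_communication_interval, stale_in_iteration):
    while True:
        update_lock.get()                      # block until at least one local step exists
        start = time.time()
        with data_lock:                        # snapshot and clear the local accumulator
            staleness = it_count.value
            to_send = [u.clone() for u in local_update]
            for u in local_update:
                u.zero_()
            it_count.value = 0.0
            loss_t.data.zero_()
        for layer in to_send:                  # push the update, then pull the new model
            dist.send(tensor=layer, dst=0)
        for cached in model_cache:
            dist.recv(tensor=cached, src=0)
        receive_end.value = True
        stale_in_iteration.value = staleness
        batch_communication_interval.value = time.time() - start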
Example #7
0
def run(workers, models, save_path, train_data_list, test_data, ntokens,
        train_batch_size):
    workers_num = len(workers)
    print('Model recved successfully!')
    optimizers_list = []
    if args.lr == 0.0:
        if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
            learning_rate = 0.1
        else:
            learning_rate = 0.01
    else:
        learning_rate = args.lr

    for i in workers:
        optimizer = MySGD(models[i].parameters(), lr=learning_rate)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 500
    else:
        decay_period = 200

    data_save = pd.DataFrame(columns=[
        'Training Round', 'Training Loss', 'Training Perplexity', 'Test Loss',
        'Test Perplexity'
    ])
    print('Begin!')

    # store (train loss, energy, iterations)
    trainloss_file = args.save_path + '/trainloss' + args.model + '.txt'
    if (os.path.isfile(trainloss_file)):
        os.remove(trainloss_file)  # remove an existing file with the same name
    f_trainloss = open(trainloss_file, 'a')

    iterations_num_epoch = 0
    sequence_iter = range(0, train_data_list[workers_num - 1].size(0) - 1,
                          args.bptt)

    hidden_list = []
    for i in workers:
        hidden = models[i].init_hidden(train_batch_size)
        hidden_list.append(hidden)

    gamma = args.gamma
    first_label = True
    epoch_train_loss = 0.0
    test_loss = 10.0

    g_list = []
    for i in workers:
        g_temp = [torch.zeros_like(p.data) for p in models[0].parameters()]
        g_list.append(g_temp)
    it_count = 0
    s_time = time.time()

    epoch_loss = []

    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        for i in workers:
            models[i].train()
        iterations_epoch = 0
        user_loss = []
        for j in workers:
            sequence_iter = range(0, train_data_list[j - 1].size(0) - 1,
                                  args.bptt)
            batch_loss = []
            for batch, i in enumerate(sequence_iter):
                it_count += 1
                iterations_epoch += 1
                iteration_loss = 0.0
                print('Start Batch {} / {}'.format(batch, len(sequence_iter)))

                data, targets = get_batch(train_data_list[j - 1], i)
                # Starting each batch, we detach the hidden state from how it was previously produced.
                # If we didn't, the model would try backpropagating all the way to start of the dataset.
                hidden_list[j - 1] = repackage_hidden(hidden_list[j - 1])
                optimizers_list[j - 1].zero_grad()

                output, hidden_list[j - 1] = models[j](data,
                                                       hidden_list[j - 1])

                loss = criterion(output.view(-1, ntokens), targets)
                loss.backward()

                # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
                clip_grad_norm_(models[j].parameters(), 0.25)
                delta_ws = optimizers_list[j - 1].get_delta_w()
                # update local model and cache gradient into list
                for p_layer_idx, p_layer_temp in enumerate(
                        models[j].parameters()):
                    p_layer_temp.data -= delta_ws[p_layer_idx]
                    g_list[j - 1][p_layer_idx].data += delta_ws[p_layer_idx]

                iteration_loss += loss.data.item()  # average loss of worker i in the current round
                batch_loss.append(loss.data.item())

            user_loss.append(sum(batch_loss) / len(batch_loss))

            epoch_train_loss += iteration_loss / workers_num

        epoch_loss.append(sum(user_loss) / len(user_loss))

        if epoch % args.K == 0:
            # Synchronization
            for p_idx, param in enumerate(models[0].parameters()):
                # in each worker: update local model with the pulled global model and local update
                for w in workers:
                    if args.type == 'LOSP':
                        list(models[w].parameters(
                        ))[p_idx].data = param.data - gamma * g_list[w -
                                                                     1][p_idx]
                    elif args.type == 'OSP':
                        list(models[w].parameters()
                             )[p_idx].data = param.data + torch.zeros_like(
                                 param.data)
                    else:
                        pass

                # # in cloud : update global parameter with the average of all updates
                # global_update_layer = torch.zeros_like(param.data)
                # for w in workers:
                #     global_update_layer += g_list[w-1][p_idx]
                # tensor = global_update_layer / workers_num
                # param.data -= tensor

                # in cloud : update global parameter with the average of all updates
                for w in workers:
                    param.data -= g_list[w - 1][p_idx] / workers_num

                # in each worker: update local model with the pulled global model and local update
                for w in workers:
                    if args.type == 'KAVG':
                        list(models[w].parameters()
                             )[p_idx].data = param.data + torch.zeros_like(
                                 param.data)
                    else:
                        pass
            g_list = []
            for w in workers:
                g_temp = [
                    torch.zeros_like(p.data) for p in models[0].parameters()
                ]
                g_list.append(g_temp)

        e_time = time.time()
        # train loss every epoch
        print('Epoch {}, Loss:{}'.format(epoch, loss.data.item()))

        # test after training
        if epoch % 5 == 0:
            # Run on test data.
            train_loss = sum(epoch_loss) / len(epoch_loss)
            epoch_loss = []
            test_loss = evaluate(models[0],
                                 ntokens,
                                 10,
                                 test_data,
                                 criterion=criterion)
            data_save = data_save.append([{
                'Training Round': epoch,
                'Training Loss': train_loss,
                'Training Perplexity': math.exp(train_loss),
                'Test Loss': test_loss,
                'Test Perplexity': math.exp(test_loss)
            }])
            data_save.to_csv('PTB_data.csv')
            print("test_loss:", test_loss)
        f_trainloss.write(
            str(args.this_rank) + "\t" +
            str(epoch_train_loss / float(iterations_epoch)) + "\t" +
            str(args.K) +  # args.K
            "\t" +
            str(e_time - epoch_start_time) +  # leave place for one epoch time
            "\t" + str(iterations_epoch) +  # leave place for overall time
            "\t" + str(math.exp(test_loss)) +  # leave place for perplexity
            "\t" + str(test_loss) +  # leave place for test accuracy
            "\t" + str(e_time - s_time) +  # leave place for total time
            "\t" + str(0) +  # leave place for one iteration time of comp
            "\t" +
            str(it_count) +  # leave place for one iteration time of comm
            "\t" + str(it_count / args.K) +  #global iterations
            "\t" + str(epoch) + '\n')
        f_trainloss.flush()
        epoch_train_loss = 0.0

        # Halve gamma at the specified epoch
        # (the decay policy can be customized)
        if (epoch + 1) > args.gamma_decay_epoch:
            if first_label:
                gamma = 0.01
                first_label = False

        if (epoch + 1) % decay_period == 0:
            for i in workers:
                for param_group in optimizers_list[i - 1].param_groups:
                    param_group['lr'] *= 0.1
                    print('LR Decreased! Now: {}'.format(param_group['lr']))

    f_trainloss.close()
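
# --- Hedged sketch (not part of the example above) ---------------------------
# `repackage_hidden` and `get_batch` are not defined in this section. They look
# like the helpers from the standard PyTorch word-language-model example, so
# plausible versions are sketched below (with `bptt` passed explicitly instead
# of read from `args`); the originals may differ slightly.
import torch

def repackage_hidden_sketch(h):
    # detach hidden states from their history so backprop stops at the batch boundary
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden_sketch(v) for v in h)

def get_batch_sketch(source, i, bptt):
    # slice a (seq_len, batch) chunk and the next-token targets from the corpus tensor
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].reshape(-1)
    return data, target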
Example #8
0
def run(workers, models, save_path, train_data_list, test_data, iterations_epoch):
    dev = torch.device('cuda')
    cpu = torch.device('cpu')

    param_num = 0
    for p in models[0].parameters():
        tmp_p = torch.zeros_like(p)
        param_num += torch.numel(tmp_p)

    models[0] = models[0].cuda(dev)
    for i in workers:
        models[i] = models[i].cuda(dev)

    workers_num = len(workers)
    print('Model recved successfully!')

    compression_num = int(param_num * args.ratio)
    compression_num = compression_num if compression_num > 0 else 1

    optimizers_list = []
    for i in workers:
        optimizer = MySGD(models[i].parameters(), lr=args.lr)
        # if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
        #     optimizer = MySGD(models[i].parameters(), lr=0.1)
        # elif args.model in ['VGG11']:
        #     optimizer = MySGD(models[i].parameters(), lr=0.1)
        # else:
        #     optimizer = MySGD(models[i].parameters(), lr=0.1)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    elif args.model in ['Abalone', 'Bodyfat', 'Housing']:
        criterion = torch.nn.MSELoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 10000
    else:
        decay_period = 1000000

    print('Begin!')

    # store (train loss, energy, iterations)
    # naming rules: title + model_name + number_of_workers
    trainloss_file = './../result/' + args.title + '_' + args.model + '_' + str(args.workers) + '.txt'
    if(os.path.isfile(trainloss_file)):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i-1]))

    global_clock = 0
    ratio = args.ratio
    g_remain_list = []
    for i in workers:
        g_remain = [torch.zeros_like(param.data) for param in models[i].parameters()]
        g_remain_list.append(g_remain)
    # time_logs = open("./record" + str(rank), 'w')
    for epoch in range(args.epochs):
        iteration_loss = 0.0

        # epoch_train_loss = 0
        g_change_average = [torch.zeros_like(param.data).cuda(dev) for param in models[0].parameters()]
        global_clock += 1
        for i in workers:
            try:
                data, target = next(train_data_iter_list[i-1])
            except StopIteration:
                train_data_iter_list[i-1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i-1])
            data, target = Variable(data).cuda(dev), Variable(target).cuda(dev)
            optimizers_list[i-1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizers_list[i-1].get_delta_w()
            iteration_loss += loss.data.item()/workers_num

            g_remain_list[i-1], g_large_change = get_upload(g_remain_list[i-1], delta_ws, args.ratio, args.isCompensate, dev)
            # synchronous update
            for g_change_layer_idx, g_change_layer in enumerate(g_change_average):
                g_change_layer.data += g_large_change[g_change_layer_idx].data/workers_num

        # Synchronization
        for p_idx, param in enumerate(models[0].parameters()):
            param.data -= g_change_average[p_idx].data
            for w in workers:
                list(models[w].parameters())[p_idx].data = param.data

        # epoch_train_loss += iteration_loss
        # epoch = int(iteration / iterations_epoch)
        print('Epoch {}, Loss:{}'.format(epoch, loss.data.item()))
        # if (iteration+1) % iterations_epoch == 0:
        # test after training
        # test_loss, test_acc = test_model(0, model, test_data, criterion=criterion)
        # f_trainloss.write(str(args.this_rank) +
        #                     "\t" + str(iteration_loss) +
        #                     "\t" + str(0) +
        #                     "\t" + str(epoch) +
        #                     "\t" + str(0) +
        #                     "\t" + str(sparsification_ratio) +        # time
        #                     "\t" + str(global_clock) +        # time
        #                     '\n')
        f_trainloss.write(str(epoch) + 
                            '\t' + str(global_clock) +
                            '\t' + str(iteration_loss) + 
                            '\t' + str(args.ratio) + 
                            '\n')
        f_trainloss.flush()
        # epoch_train_loss = 0.0
        # Reduce the scaling factor (ratio) at the specified epochs (iterations)
        if (epoch + 1) in [0, 250000]:
            ratio = ratio * 0.1
            print('--------------------------------')
            print(ratio)

        # for i in workers:
        #     models[i].train()
        #     if (epoch + 1) % decay_period == 0:
        #         for param_group in optimizers_list[i - 1].param_groups:
        #             param_group['lr'] *= 0.1
        #             print('LR Decreased! Now: {}'.format(param_group['lr']))

    f_trainloss.close()
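
# --- Hedged sketch (not part of the example above) ---------------------------
# The run() variants in this file read their configuration from a global
# argparse `args` object that is created elsewhere. The flag names below are
# taken from the code above; the defaults are illustrative guesses only.
import argparse

parser_sketch = argparse.ArgumentParser()
parser_sketch.add_argument('--model', type=str, default='ResNet18OnCifar10')
parser_sketch.add_argument('--lr', type=float, default=0.1)
parser_sketch.add_argument('--epochs', type=int, default=100)
parser_sketch.add_argument('--workers', type=int, default=10)
parser_sketch.add_argument('--ratio', type=float, default=0.01)    # sparsification ratio
parser_sketch.add_argument('--isCompensate', action='store_true')  # keep a local residual
parser_sketch.add_argument('--title', type=str, default='topk')
args_sketch = parser_sketch.parse_args([])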
Example #9
0
def run(workers, models, save_path, train_data_list, test_data, iterations_epoch):
    dev = torch.device('cuda')
    cpu = torch.device('cpu')

    start_time = time.time()
    models[0] = models[0].cuda(dev)
    for i in workers:
        models[i] = models[i].cuda(dev)

    workers_num = len(workers)

    print('Model recved successfully!')
    optimizers_list = []
    for i in workers:
        optimizer = MySGD(models[i].parameters(), lr=args.lr)
        # if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
        #     optimizer = MySGD(models[i].parameters(), lr=0.1)
        # elif args.model in ['VGG11']:
        #     optimizer = MySGD(models[i].parameters(), lr=0.1)
        # else:
        #     optimizer = MySGD(models[i].parameters(), lr=0.1)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 10000
    else:
        decay_period = 1000000

    print('Begin!')

    # the several workers in the front of the rank list
    byzantine_workers_list = [w + 1 for w in range(args.byzantine)]

    # cache g_old_num old gradients
    g_old_num = args.loops
    g_old_list = []
    for i in workers:
        worker_g_old_list = [[torch.zeros_like(param.data).cuda(dev) for param in models[0].parameters()] for _ in range(g_old_num)]
        g_old_list.append(worker_g_old_list)
    g_old_count = 0

    global_g = [torch.zeros_like(param.data).cuda(dev) for param in models[0].parameters()]

    # store (train loss, energy, iterations)
    # naming rules: title + model_name + number_of_workers
    trainloss_file = './mytopk' \
        + args.title \
        + '_' + args.method \
        + '_' + args.model \
        + '_B' + str(args.byzantine) \
        + '_V' + str(int(args.V)) \
        + '_E' + str(args.loops) \
        + '_R' + str(int(args.ratio * 1000)) \
        + '_al' + str(int(args.alpha * 1000)) \
        + '_be' + str(int(args.beta * 1000)) \
        + '_W' + str(args.workers) + '.txt'
    
    if(os.path.isfile(trainloss_file)):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i-1]))

    epoch_train_loss = 0.0
    global_clock = 0
    g_remain_list = []
    h_remain_list = [] 
    h_last_list = [] 
    ratio = args.ratio
    threshold = 0.

    g_change_list = []
    for i in workers:
        g_change_list.append([torch.zeros_like(param.data).cuda(dev) for param in models[0].parameters()])
        h_remain_list.append([torch.zeros_like(param.data).cuda(dev) for param in models[0].parameters()])
        h_last_list.append([torch.zeros_like(param.data).cuda(dev) for param in models[0].parameters()])
    
    for epoch in range(args.epochs):
        iteration_loss = 0.0

        # g_change_average = [torch.zeros_like(param.data).cuda(dev) for param in models[0].parameters()]
        global_clock += 1
        g_change_average_list = [[] for _ in range(workers_num)]
        for i in workers:
            try:
                data, target = next(train_data_iter_list[i-1])
            except StopIteration:
                train_data_iter_list[i-1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i-1])
            data, target = Variable(data).cuda(dev), Variable(target).cuda(dev)
            optimizers_list[i-1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizers_list[i-1].get_delta_w()
            iteration_loss += loss.data.item()/workers_num

            # update old gradient list
            g_new = []
            for layer_g in delta_ws:
                layer_g_tmp = torch.zeros_like(layer_g).cuda(dev)
                layer_g_tmp += layer_g
                g_new.append(layer_g_tmp)
            g_old_list[i-1].append(g_new)   # cache new gradient
            g_old_list[i-1].pop(0)
            # count the number of gradient
            g_old_count = min(g_old_count+1, g_old_num)
            # g_old_count += 1
            # if g_old_count > g_old_num:
            #     g_old_count = g_old_num

            if global_clock == 1:
                g_remain = [torch.zeros_like(g_layer).cuda(dev)+g_layer for g_layer in delta_ws]
                g_remain_list.append(g_remain)
                # synchronous update
                # the gradient change in the first iteration is gradient itself
                for g_change_layer_idx, g_change_layer in enumerate(g_change_list[i - 1]):
                    g_change_layer.data += delta_ws[g_change_layer_idx].data

                    g_change_average_list[i - 1].append(g_change_layer.data)

                sparsification_ratio = 1.0
            else:
                update_new = []
                for layer_idx, layer_g in enumerate(delta_ws):
                    layer_update_new_tmp = torch.zeros_like(layer_g).cuda(dev)
                    for g_old in g_old_list[i-1]:
                        layer_update_new_tmp += g_old[layer_idx]
                    layer_update_new_tmp /= g_old_count
                    update_new.append(layer_update_new_tmp)
                # print(g_old_count)
                # g_remain, g_large_change, sparsification_ratio= get_upload(g_remain_list[i-1],update_new,ratio,args.isCompensate, threshold, dev)
                new_g_avg = [torch.zeros_like(g_layer) + g_layer for g_layer in update_new]
                for idx, g_layer in enumerate(update_new):
                    new_g_avg[idx] += args.alpha * (h_last_list[i - 1][idx] - h_remain_list[i - 1][idx])

                g_remain, g_large_change = get_upload_topk(g_remain_list[i - 1], new_g_avg, args.ratio, args.isCompensate, dev)
                g_remain_list[i - 1] = g_remain

                # if i in byzantine_workers_list:
                #     g_large_change = byzantine_func(g_large_change, dev)
                
                h_remain_list[i - 1] = h_last_list[i - 1]
                h_last_list[i - 1] = [torch.zeros_like(g_layer) for g_layer in update_new]
                for idx, g_layer in enumerate(update_new):
                    h_last_list[i - 1][idx] = h_remain_list[i - 1][idx] * args.beta
                    h_last_list[i - 1][idx] -= (update_new[idx] - g_remain[idx])

                for g_change_layer_idx, g_change_layer in enumerate(g_change_list[i - 1]):
                    g_change_layer.data += g_large_change[g_change_layer_idx].data
                    g_change_average_list[i - 1].append(g_change_layer)
                
                # if i in byzantine_workers_list:
                #     g_remain_list[i - 1] = g_change_list[i - 1]
                
                # for g_change_layer_idx, g_change_layer in enumerate(g_change_list[i - 1]):
                #     g_remain_list[i - 1][g_change_layer_idx] = g_change_layer + torch.zeros_like(g_change_layer).cuda(dev)

                # if i in byzantine_workers_list:
                #     g_change_layer.data += g_large_change[g_change_layer_idx].data
                #     g_change_layer.data += args.V * torch.randn_like(g_change_layer.data).data
                # else:
                #     g_change_layer.data += g_large_change[g_change_layer_idx].data
                
                # g_change_average_list[i - 1].append(g_change_layer.data)
                
                # if i in byzantine_workers_list:
                #     by_list = byzantine_func(g_change_list[i - 1], dev)
                #     for g_change_layer in by_list:
                #         g_change_average_list[i - 1].append(g_change_layer)
                # else:
                #     for g_change_layer in g_change_list[i - 1]:
                #         g_change_average_list[i - 1].append(g_change_layer)
        
        # non_byz_g = []
        # for p_idx, param in enumerate(models[0].parameters()):
        #     global_update_layer = torch.zeros_like(param.data).cuda(dev)
        #     for w in workers:
        #         if w not in byzantine_workers_list:
        #             global_update_layer += g_change_average_list[w - 1][p_idx]
        #     tensor = global_update_layer / (workers_num - args.byzantine)
        #     non_byz_g.append(tensor)
        
        # non_byz_g = byzantine_func(non_byz_g, dev)

        # for i in workers:
        #     if i in byzantine_workers_list:
        #         g_change_average_list[i - 1] = []
        #         for g_change_layer in non_byz_g:
        #             g_change_average_list[i - 1].append(g_change_layer + torch.zeros_like(g_change_layer).cuda(dev))
        
        # Synchronization
        if args.method == "Mean":
            g_median = mean(g_change_average_list, workers, dev)
        elif args.method == "TrimmedMean":
            # if args.T > 0 and args.T < workers_num/2:
            #     beta = args.T
            # else:
            #     beta = int((workers_num-1)/2)
            g_median = trimmed_mean(g_change_average_list, workers, args.byzantine, dev)
        elif args.method == "Median":
            g_median = median_defense(g_change_average_list, workers, dev)
        elif args.method == "FABA":
            g_median = FABA(g_change_average_list, workers, args.byzantine, dev)
        elif args.method == "Krum":
            g_median = Krum(g_change_average_list, workers, args.byzantine, dev)
        
        g_quare_sum = 0.0   # for threshold
        for p_idx, param in enumerate(models[0].parameters()):
            param.data -= g_median[p_idx].data
            # print(g_median[p_idx].data)
            for w in workers:
                list(models[w].parameters())[p_idx].data = param.data + torch.zeros_like(param.data).cuda(dev)

            g_quare_sum += torch.sum(g_median[p_idx].data * g_median[p_idx].data)

        g_quare_sum = torch.sqrt(g_quare_sum).cuda(dev)
        threshold = g_quare_sum.data.item()

        # epoch_train_loss += iteration_loss
        # epoch = int(iteration / iterations_epoch)
        current_time = time.time() - start_time
        test_acc = 0
        if epoch % 50 >= 45:
            test_acc = test_model(0, models[1], test_data, dev)
        print('Epoch {}, Time:{}, Loss:{}'.format(epoch, current_time, iteration_loss))
        f_trainloss.write(str(epoch) +
                            '\t' + str(current_time) +
                            '\t' + str(iteration_loss) + 
                            '\t' + str(sparsification_ratio) + 
                            # '\t' + str(test_loss) + 
                            '\t' + str(test_acc) +
                            '\n')
        f_trainloss.flush()
        # if (iteration+1) % iterations_epoch == 0:
            # test after training
            # test_loss, test_acc = test_model(0, model, test_data, criterion=criterion)
            # f_trainloss.write(str(args.this_rank) +
            #                   "\t" + str(epoch_train_loss / float(iterations_epoch)) +
            #                   "\t" + str(iteration_loss) +
            #                   "\t" + str(0) +
            #                   "\t" + str(epoch) +
            #                   "\t" + str(0) +
            #                   "\t" + str(iteration) +
            #                   "\t" + str(sparsification_ratio) +        # time
            #                   "\t" + str(global_clock) +        # time
            #                   '\n')
        
            # epoch_train_loss = 0.0
        # Reduce the scaling factor (ratio) at the specified epochs (iterations)
        # if (epoch + 1) in [0, 250000]:
        #     ratio = ratio * 0.1
        #     print('--------------------------------')
        #     print(ratio)

            # for i in workers:
            #     models[i].train()
            #     if (epoch + 1) % decay_period == 0:
            #         for param_group in optimizers_list[i - 1].param_groups:
            #             param_group['lr'] *= 0.1
            #             print('LR Decreased! Now: {}'.format(param_group['lr']))

    f_trainloss.close()
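
# --- Hedged sketch (not part of the example above) ---------------------------
# The robust aggregators selected by args.method (mean, trimmed_mean,
# median_defense, FABA, Krum) are not defined in this section. As one concrete
# illustration, a coordinate-wise trimmed mean with the call shape used above
# could look like this; the real helpers may differ.
import torch

def trimmed_mean_sketch(per_worker_updates, workers, byzantine, dev):
    aggregated = []
    num_layers = len(per_worker_updates[0])
    for layer_idx in range(num_layers):
        # shape: (num_workers, *layer_shape)
        stacked = torch.stack([per_worker_updates[w - 1][layer_idx] for w in workers]).to(dev)
        sorted_vals, _ = torch.sort(stacked, dim=0)
        if byzantine > 0 and 2 * byzantine < stacked.size(0):
            # drop the `byzantine` smallest and largest values per coordinate
            sorted_vals = sorted_vals[byzantine:stacked.size(0) - byzantine]
        aggregated.append(sorted_vals.mean(dim=0))
    return aggregated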
Example #10
0
def run(workers, models, save_path, train_data_list, test_data,
        iterations_epoch):
    workers_num = len(workers)
    print('Model recved successfully!')
    optimizers_list = []
    for i in workers:
        if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
            optimizer = MySGD(models[i].parameters(), lr=0.1)
        else:
            optimizer = MySGD(models[i].parameters(), lr=0.01)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 500
    else:
        decay_period = 1000000

    print('Begin!')

    global_g = [torch.zeros_like(param.data) for param in models[0].parameters()]

    # store (train loss, energy, iterations)
    trainloss_file = './trainloss_oldsimu' + args.model + '_w15r1lr0.1.txt'
    if (os.path.isfile(trainloss_file)):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i - 1]))

    epoch_train_loss = 0.0
    global_clock = 0
    g_remain_list = []
    ratio = args.ratio
    threshold = 0.
    print("Begin for")
    for iteration in range(args.epochs * iterations_epoch):
        iteration_loss = 0.0

        g_list = []
        g_change_average = [
            torch.zeros_like(param.data) for param in models[0].parameters()
        ]
        global_clock += 1
        for i in workers:
            try:
                data, target = next(train_data_iter_list[i - 1])
            except StopIteration:
                train_data_iter_list[i - 1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i - 1])
            data, target = Variable(data), Variable(target)
            optimizers_list[i - 1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizers_list[i - 1].get_delta_w()  # local gradient (g)
            g_list.append(delta_ws)
            iteration_loss += loss.data.item() / workers_num

            if global_clock == 1:  # initialization in the first round
                g_remain = [
                    torch.zeros_like(g_layer) + g_layer for g_layer in delta_ws
                ]
                g_remain_list.append(g_remain)

                test_loss = str(0)  # placeholder until the first evaluation
                test_acc = str(0)

                # synchronous update
                # the gradient change in the first iteration is gradient itself
                for g_change_layer_idx, g_change_layer in enumerate(
                        g_change_average):
                    g_change_layer.data += delta_ws[
                        g_change_layer_idx].data / workers_num
                sparsification_ratio = 1.0
            else:
                # print(delta_ws)  # debug: the raw deltas can be large
                g_remain, g_large_change, sparsification_ratio = get_upload(
                    g_remain_list[i - 1], delta_ws, ratio, args.isCompensate,
                    threshold)  # sparsify: upload the large changes, retain the rest locally
                g_remain_list[i - 1] = g_remain  # update the retained (residual) gradient
                # synchronous update
                for g_change_layer_idx, g_change_layer in enumerate(
                        g_change_average):  # algorithm line 5: average the uploaded changes across workers
                    g_change_layer.data += g_large_change[
                        g_change_layer_idx].data / workers_num

        # synchronization step (server side)
        g_square_sum = 0.0  # accumulates the squared norm used as the next threshold
        for p_idx, param in enumerate(models[0].parameters()):
            global_g[p_idx].data += g_change_average[
                p_idx].data  # accumulate the averaged delta
            param.data -= global_g[p_idx].data
            for w in workers:  # send the updated weights back to every worker
                list(models[w].parameters())[p_idx].data = (
                    param.data + torch.zeros_like(param.data))

            g_square_sum += torch.sum(global_g[p_idx].data *
                                      global_g[p_idx].data)  # server side, not sparsified

        g_square_sum = torch.sqrt(g_square_sum)
        threshold = g_square_sum.data.item()

        epoch_train_loss += iteration_loss
        epoch = int(iteration / iterations_epoch)
        print('Epoch {}, Loss:{}'.format(epoch, loss.data.item()))
        if True:
            if (iteration + 1) % iterations_epoch == 0:
                # evaluate on the test set at the end of each epoch
                test_loss, test_acc = test_model(0,
                                                 models[0],
                                                 test_data,
                                                 criterion=criterion)
                epoch_train_loss = 0.0
            f_trainloss.write(
                str(args.this_rank) + "\t" +
                str(epoch_train_loss / float(iterations_epoch)) + "\t" +
                str(iteration_loss) + "\t" + str(0) + "\t" + str(epoch) +
                "\t" + str(0) + "\t" + str(iteration) + "\t" +
                str(sparsification_ratio) +  # sparsification ratio
                "\t" + str(global_clock) +  # global clock
                "\t" + str(test_loss) +  # test loss
                "\t" + str(test_acc) +  # test accuracy
                '\n')
            f_trainloss.flush()
            #epoch_train_loss = 0.0
            # decrease the scaling factor at the specified epochs (iterations)
            if (epoch + 1) in [0, 1000]:
                ratio = ratio * 0.1
                print('--------------------------------')
                print(ratio)

            for i in workers:
                models[i].train()
                if (epoch + 1) % decay_period == 0:
                    for param_group in optimizers_list[i - 1].param_groups:
                        param_group['lr'] *= 0.1
                        print('LR Decreased! Now: {}'.format(
                            param_group['lr']))

    f_trainloss.close()
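
The example above calls a get_upload() helper that is not defined in this listing. The following is only a minimal sketch, assuming a per-layer top-k magnitude sparsifier with residual accumulation that matches the call site (previous residual, new deltas, ratio, isCompensate, threshold) and its three return values; the handling of is_compensate and threshold is omitted here.

import torch

def get_upload(g_remain, g_new, ratio, is_compensate, threshold):
    # Sketch (assumption): accumulate the retained gradient, upload only the
    # largest-magnitude entries of each layer, keep the rest as the new residual.
    # is_compensate and threshold are accepted but unused in this sketch.
    g_large_change = []
    selected, total = 0, 0
    for idx, g_layer in enumerate(g_new):
        acc = g_remain[idx] + g_layer              # residual accumulation
        k = max(1, int(ratio * acc.numel()))       # entries to upload this round
        flat_abs = acc.view(-1).abs()
        _, topk_idx = torch.topk(flat_abs, k)
        mask = torch.zeros_like(flat_abs)
        mask[topk_idx] = 1.0
        upload = acc * mask.view_as(acc)           # sparse update sent to the server
        g_large_change.append(upload)
        g_remain[idx] = acc - upload               # unsent entries stay on the worker
        selected += k
        total += acc.numel()
    return g_remain, g_large_change, float(selected) / total

Under this assumption, sparsification_ratio is simply the fraction of gradient entries uploaded in the round, which is the value the training-loss log above records.
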
Example #11
0
def run(rank, model, train_data, test_data, queue, param_q, stop_flag):
    # fetch the initial model parameters sent by the parameter server (PS)
    while True:
        if not param_q.empty():
            param_dict = param_q.get()
            tmp = OrderedDict(
                map(lambda item: (item[0], torch.from_numpy(item[1])),
                    param_dict.items()))
            model.load_state_dict(tmp)
            break
    print('Model recved successfully!')

    if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
        optimizer = MySGD(model.parameters(), lr=0.1)
    else:
        optimizer = MySGD(model.parameters(), lr=0.01)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 50
    else:
        decay_period = 100
    print('Begin!')

    time_logs = open("./record" + str(rank), 'w')
    for epoch in range(int(args.epochs)):
        batch_interval = 0.0
        batch_comp_interval = 0.0
        batch_comm_interval = 0.0
        batch_push_interval = 0.0
        batch_pull_interval = 0.0
        model.train()
        # decay the learning rate (LR) at the specified epochs (originally AlexNet-specific)
        #if args.model == 'AlexNet':
        if (epoch + 1) % decay_period == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.1
                print('LR Decreased! Now: {}'.format(param_group['lr']))
        epoch_train_loss = 0
        for batch_idx, (data, target) in enumerate(train_data):
            batch_start_time = time.time()
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizer.get_delta_w()

            batch_comp_time = time.time()
            # noinspection PyBroadException
            try:  # catch the exception raised when the PS process stops
                if delta_ws:
                    queue.put({
                        rank:
                        [loss.data.numpy(),
                         np.array(args.train_bsz), False]
                    })
                for delta in delta_ws:
                    dist.send(tensor=delta, dst=0)

                batch_push_time = time.time()

                for idx, param in enumerate(model.parameters()):
                    tmp_tensor = torch.zeros_like(param.data)
                    dist.recv(tensor=tmp_tensor, src=0)
                    param.data = tmp_tensor

                batch_tmp_time = time.time()
                batch_pull_time = time.time()
                #print('Rank {}, Epoch {}, Batch {}/{}, Loss:{}'
                #     .format(rank, epoch, batch_idx, len(train_data), loss.data[0]))
            except Exception as e:
                print(str(e))
                print('Should Stop: {}!'.format(stop_flag.value))
                break

            batch_interval += batch_pull_time - batch_start_time
            batch_comp_interval += batch_comp_time - batch_start_time
            batch_comm_interval += batch_pull_time - batch_comp_time
            batch_push_interval += batch_push_time - batch_comp_time
            batch_pull_interval += batch_pull_time - batch_push_time
            b_interval = batch_interval / (batch_idx + 1)
            b_comp_interval = batch_comp_interval / (batch_idx + 1)
            b_comm_interval = batch_comm_interval / (batch_idx + 1)
            b_push_interval = batch_push_interval / (batch_idx + 1)
            b_pull_interval = batch_pull_interval / (batch_idx + 1)
            logs = torch.tensor([
                0.0, b_interval, b_comp_interval, b_comm_interval,
                b_push_interval, b_pull_interval,
                batch_pull_time - batch_tmp_time
            ])
            time_logs.write(str(logs) + '\n')
            time_logs.flush()

        # average over the number of batches (batch_idx is the last zero-based index)
        batch_interval /= (batch_idx + 1)
        batch_comp_interval /= (batch_idx + 1)
        batch_comm_interval /= (batch_idx + 1)
        batch_push_interval /= (batch_idx + 1)
        batch_pull_interval /= (batch_idx + 1)
        logs = torch.tensor([
            0.0, batch_interval, batch_comp_interval, batch_comm_interval,
            batch_push_interval, batch_pull_interval
        ])
        time_logs.write(str(epoch) + '\t' + str(logs) + '\n')
        time_logs.flush()
        # evaluate on the test set after the epoch finishes
        print("test Model:", epoch)
        # test_model(rank, model, test_data, criterion=criterion)
        if stop_flag.value:
            break
    queue.put({rank: [[], [], True]})
    time_logs.close()
    print("Worker {} has completed epoch {}!".format(args.this_rank, epoch))