def run(workers, models, save_path, train_data_list, test_data, iterations_epoch):
    workers_num = len(workers)
    print('Model received successfully!')

    optimizers_list = []
    for i in workers:
        if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
            optimizer = MySGD(models[i].parameters(), lr=0.1)
        else:
            optimizer = MySGD(models[i].parameters(), lr=0.01)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 500
    else:
        decay_period = 1000000

    print('Begin!')
    global_g = [torch.zeros_like(param.data) for param in models[0].parameters()]

    # store (train loss, energy, iterations)
    trainloss_file = './trainloss' + args.model + '_' + args.file_name + '_ec.txt'
    if os.path.isfile(trainloss_file):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i - 1]))

    epoch_train_loss = 0.0
    global_clock = 0
    g_remain_list = []
    ratio = args.ratio
    threshold = 0.
    # compensation buffers: h_last holds h_t, h_remain holds h_{t-1}
    h_last_list = []
    h_remain_list = []
    alpha = args.alpha
    beta = args.beta
    print(alpha, " and ", beta)

    for iteration in range(args.epochs * iterations_epoch):
        iteration_loss = 0.0
        g_list = []
        g_change_average = [torch.zeros_like(param.data) for param in models[0].parameters()]
        global_clock += 1
        for i in workers:
            try:
                data, target = next(train_data_iter_list[i - 1])
            except StopIteration:
                train_data_iter_list[i - 1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i - 1])
            data, target = Variable(data), Variable(target)
            optimizers_list[i - 1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizers_list[i - 1].get_delta_w()
            g_list.append(delta_ws)
            iteration_loss += loss.data.item() / workers_num

            if global_clock == 1:
                g_remain = [torch.zeros_like(g_layer) + g_layer for g_layer in delta_ws]
                g_remain_list.append(g_remain)
                h_remain = [torch.zeros_like(g_layer) for g_layer in delta_ws]
                h_remain_list.append(h_remain)
                h_last = [torch.zeros_like(g_layer) for g_layer in delta_ws]
                h_last_list.append(h_last)
                # Synchronous update: in the first iteration the gradient
                # change is the gradient itself.
                for g_change_layer_idx, g_change_layer in enumerate(g_change_average):
                    g_change_layer.data += delta_ws[g_change_layer_idx].data / workers_num
                sparsification_ratio = 1.0
            else:
                new_delta_ws = [torch.zeros_like(g_layer) + g_layer for g_layer in delta_ws]
                for idx, g_layer in enumerate(delta_ws):
                    new_delta_ws[idx] += alpha * (h_last_list[i - 1][idx] - h_remain_list[i - 1][idx])
                print(ratio)
                g_remain, g_large_change, sparsification_ratio = get_upload(
                    g_remain_list[i - 1], new_delta_ws, ratio, args.isCompensate, threshold)
                g_remain_list[i - 1] = g_remain
                # synchronous update
                for g_change_layer_idx, g_change_layer in enumerate(g_change_average):
                    g_change_layer.data += g_large_change[g_change_layer_idx].data / workers_num
                # update the compensation buffers h
                h_last = [torch.zeros_like(g_layer) for g_layer in delta_ws]
                h_remain = h_last_list[i - 1]
                for idx, g_layer in enumerate(delta_ws):
                    h_last[idx] = h_remain[idx] * beta
                    if args.add == 1:
                        h_last[idx] += (delta_ws[idx] - g_remain[idx])
                    else:
                        h_last[idx] -= (delta_ws[idx] - g_remain[idx])
                h_remain_list[i - 1] = h_remain
                h_last_list[i - 1] = h_last

        # Synchronization
        g_quare_sum = 0.0  # for the threshold
        for p_idx, param in enumerate(models[0].parameters()):
            global_g[p_idx].data += g_change_average[p_idx].data
            param.data -= global_g[p_idx].data
            for w in workers:
                list(models[w].parameters())[p_idx].data = param.data + torch.zeros_like(param.data)
            g_quare_sum += torch.sum(global_g[p_idx].data * global_g[p_idx].data)
        g_quare_sum = torch.sqrt(g_quare_sum)
        threshold = g_quare_sum.data.item()

        epoch_train_loss += iteration_loss
        epoch = int(iteration / iterations_epoch)
        # print('Epoch {}, Loss:{}'.format(epoch, loss.data.item()))
        if (iteration + 1) % iterations_epoch == 0:
            # run the test after the epoch finishes
            # test_loss, test_acc = test_model(0, models[0], test_data, criterion=criterion)
            f_trainloss.write(str(args.this_rank) +
                              "\t" + str(epoch_train_loss / float(iterations_epoch)) +
                              "\t" + str(iteration_loss) +
                              "\t" + str(0) +
                              "\t" + str(epoch) +
                              "\t" + str(0) +
                              "\t" + str(iteration) +
                              "\t" + str(sparsification_ratio) +  # sparsification ratio
                              "\t" + str(global_clock) +  # global clock
                              '\n')
            f_trainloss.flush()
            epoch_train_loss = 0.0
            # decay the sparsification ratio at the specified epochs (iterations)
            if (epoch + 1) in [0, 1000]:
                ratio = ratio * 0.1
                print('--------------------------------')
                print(ratio)
            for i in workers:
                models[i].train()
                if (epoch + 1) % decay_period == 0:
                    for param_group in optimizers_list[i - 1].param_groups:
                        param_group['lr'] *= 0.1
                        print('LR Decreased! Now: {}'.format(param_group['lr']))
    f_trainloss.close()
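# --- Reference sketch (assumption, not this repo's implementation) ----------
# `get_upload` is imported from elsewhere in the repo, and its signature varies
# across variants (some take a `threshold`, some a `dev`, some return a third
# sparsification-ratio value). Based on the call above, a minimal sketch of a
# threshold/top-k upload with an error-feedback residual could look like this;
# the name `get_upload_sketch` and the exact selection rule are assumptions.
import torch

def get_upload_sketch(g_remain, g_new, ratio, is_compensate, threshold=0.0):
    """Fold the new gradient into the residual, upload the largest coordinates,
    and carry the remainder over as the next residual."""
    g_remain = [r + g for r, g in zip(g_remain, g_new)]  # error feedback
    flat = torch.cat([r.reshape(-1).abs() for r in g_remain])
    k = max(1, int(flat.numel() * ratio))                # coordinates to keep
    cutoff = torch.topk(flat, k, largest=True).values.min()
    g_upload = []
    for idx, r in enumerate(g_remain):
        mask = (r.abs() >= cutoff).to(r.dtype)
        g_upload.append(r * mask)                        # sparse upload
        # keep the unsent part only when compensation is enabled
        g_remain[idx] = r * (1.0 - mask) if is_compensate else torch.zeros_like(r)
    return g_remain, g_upload, k / flat.numel()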
def run(rank, model, train_data, test_data, queue, param_q, stop_flag):
    # Get the initial model from the server
    while True:
        if not param_q.empty():
            param_dict = param_q.get()
            tmp = OrderedDict(map(lambda item: (item[0], torch.from_numpy(item[1])),
                                  param_dict.items()))
            model.load_state_dict(tmp)
            break
    print('Model received successfully!')

    if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
        optimizer = MySGD(model.parameters(), lr=0.1)
    else:
        optimizer = MySGD(model.parameters(), lr=0.01)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 50
    else:
        decay_period = 100

    print('Begin!')
    time_logs = open("./record" + str(rank), 'w')
    for epoch in range(int(args.epochs)):
        model.train()

        # Decay the learning rate at the specified epochs
        if (epoch + 1) % decay_period == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.1
                print('LR Decreased! Now: {}'.format(param_group['lr']))

        epoch_train_loss = 0
        for batch_idx, (data, target) in enumerate(train_data):
            it_start = time.time()
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizer.get_delta_w()
            it_comp_end = time.time()

            # noinspection PyBroadException
            try:
                if delta_ws:
                    queue.put({rank: [loss.data.numpy(), np.array(args.train_bsz), False]})
                    for delta in delta_ws:
                        dist.send(tensor=delta, dst=0)
                for idx, param in enumerate(model.parameters()):
                    tmp_tensor = torch.zeros_like(param.data)
                    dist.recv(tensor=tmp_tensor, src=0)
                    param.data = tmp_tensor
                # print('Rank {}, Epoch {}, Batch {}/{}, Loss:{}'
                #       .format(rank, epoch, batch_idx, len(train_data), loss.data[0]))
            except Exception as e:
                print(str(e))
                print('Should Stop: {}!'.format(stop_flag.value))
                break

            it_comm_end = time.time()
            it_duration = it_comm_end - it_start
            it_comp_duration = it_comp_end - it_start
            it_comm_duration = it_comm_end - it_comp_end
            time_logs.write(str(it_duration) + "\t" + str(it_comp_duration) +
                            "\t" + str(it_comm_duration) + "\n")
            time_logs.flush()

        # test the model
        print("test Model:", epoch)
        # test_model(rank, model, test_data, criterion=criterion)
        if stop_flag.value:
            break
    queue.put({rank: [[], [], True]})
    time_logs.close()
    print("Worker {} has completed epoch {}!".format(args.this_rank, epoch))
def run(rank, workers, model, save_path, train_data, test_data):
    # Get the initial model parameters from the parameter server.
    # Note: `[...].append(0)` returns None, so build the group list explicitly.
    _group = [w for w in workers] + [0]
    group = dist.new_group(_group)

    param_num = 0
    for p in model.parameters():
        tmp_p = torch.zeros_like(p)
        param_num += torch.numel(tmp_p)
        dist.scatter(tensor=tmp_p, src=0, group=group)
        p.data = tmp_p
    print('Model received successfully!')

    compression_num = int(param_num * args.ratio)
    compression_num = compression_num if compression_num > 0 else 1
    dist.gather(torch.tensor([compression_num / param_num]), dst=0, group=group)

    if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
        learning_rate = 0.1
    else:
        learning_rate = args.lr
    optimizer = MySGD(model.parameters(), lr=learning_rate)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    elif args.model in ['Abalone', 'Bodyfat', 'Housing']:
        criterion = torch.nn.MSELoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 50
    elif args.model in ['LROnMnist', 'LROnCifar10', 'LROnCifar100',
                        'Abalone', 'Bodyfat', 'Housing']:
        decay_period = 1000000  # the learning rate is constant for the convex (LR) models
    else:
        decay_period = 100

    print('Begin!')
    global_clock = 0
    g_remain = [torch.zeros_like(param.data) for param in model.parameters()]
    time_logs = open("./record" + str(rank), 'w')
    for epoch in range(args.epochs):
        batch_interval = 0.0
        batch_comp_interval = 0.0
        batch_comm_interval = 0.0
        s_time = time.time()
        model.train()

        # Decay the learning rate at the specified epochs
        if (epoch + 1) % decay_period == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.1
                print('LR Decreased! Now: {}'.format(param_group['lr']))

        epoch_train_loss = 0
        for batch_idx, (data, target) in enumerate(train_data):
            batch_start_time = time.time()
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizer.get_delta_w()
            g_remain, g_large_change = get_upload(g_remain, delta_ws, args.ratio, args.isCompensate)
            batch_comp_time = time.time()

            # Synchronization: send the epoch train loss first
            dist.gather(loss.data, dst=0, group=group)
            for idx, param in enumerate(model.parameters()):
                dist.gather(tensor=g_large_change[idx], dst=0, group=group)
                recv = torch.zeros_like(delta_ws[idx])
                dist.scatter(tensor=recv, src=0, group=group)
                param.data = recv

            epoch_train_loss += loss.data.item()
            batch_end_time = time.time()
            batch_interval += batch_end_time - batch_start_time
            batch_comp_interval += batch_comp_time - batch_start_time
            batch_comm_interval += batch_end_time - batch_comp_time
            logs = torch.tensor([0.0,
                                 batch_interval / (batch_idx + 1),
                                 batch_comp_interval / (batch_idx + 1),
                                 batch_comm_interval / (batch_idx + 1)])
            time_logs.write(str(logs) + '\n')
            time_logs.flush()

        print('Rank {}, Epoch {}, Loss:{}'.format(rank, epoch, loss.data.item()))
        e_time = time.time()
        # epoch_train_loss /= len(train_data)
        # run the test after the epoch finishes
        # test_loss, acc = test_model(rank, model, test_data, criterion=criterion)
        acc = 0.0
        batch_interval /= batch_idx + 1
        batch_comp_interval /= batch_idx + 1
        batch_comm_interval /= batch_idx + 1
        logs = torch.tensor([acc, batch_interval, batch_comp_interval, batch_comm_interval])
        time_logs.write(str(logs) + '\n')
        time_logs.flush()
        # dist.gather(tensor=logs, dst=0, group=group)
    time_logs.close()
def run(rank, workers, model, save_path, train_data, test_data):
    # Get the initial model from the server.
    # Note: `[...].append(0)` returns None, so build the group list explicitly.
    _group = [w for w in workers] + [0]
    group = dist.new_group(_group)

    for p in model.parameters():
        tmp_p = torch.zeros_like(p)
        dist.scatter(tensor=tmp_p, src=0, group=group)
        p.data = tmp_p
    print('Model received successfully!')

    if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
        optimizer = MySGD(model.parameters(), lr=0.1)
    else:
        optimizer = MySGD(model.parameters(), lr=0.01)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 50
    else:
        decay_period = 100

    print('Begin!')
    time_logs = open("./record" + str(rank), 'w')
    for epoch in range(args.epochs):
        batch_interval = 0.0
        batch_comp_interval = 0.0
        batch_comm_interval = 0.0
        s_time = time.time()
        model.train()

        # Reduce the learning rate at the specified epochs
        if (epoch + 1) % decay_period == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.1
                print('LR Decreased! Now: {}'.format(param_group['lr']))

        epoch_train_loss = 0
        for batch_idx, (data, target) in enumerate(train_data):
            batch_start_time = time.time()
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizer.get_delta_w()
            batch_comp_time = time.time()

            # Synchronization: send the epoch train loss first
            dist.gather(loss.data, dst=0, group=group)
            for idx, param in enumerate(model.parameters()):
                dist.gather(tensor=delta_ws[idx], dst=0, group=group)
                recv = torch.zeros_like(delta_ws[idx])
                dist.scatter(tensor=recv, src=0, group=group)
                param.data = recv

            epoch_train_loss += loss.data.item()
            batch_end_time = time.time()
            batch_interval += batch_end_time - batch_start_time
            batch_comp_interval += batch_comp_time - batch_start_time
            batch_comm_interval += batch_end_time - batch_comp_time
            logs = torch.tensor([0.0,
                                 batch_interval / (batch_idx + 1),
                                 batch_comp_interval / (batch_idx + 1),
                                 batch_comm_interval / (batch_idx + 1)])
            time_logs.write(str(logs) + '\n')
            time_logs.flush()

        print('Rank {}, Epoch {}, Loss:{}'.format(rank, epoch, loss.data.item()))
        e_time = time.time()
        # epoch_train_loss /= len(train_data)
        # test the model
        # test_loss, acc = test_model(rank, model, test_data, criterion=criterion)
        acc = 0.0
        # divide by the batch count (batch_idx + 1), matching the running averages above
        batch_interval /= batch_idx + 1
        batch_comp_interval /= batch_idx + 1
        batch_comm_interval /= batch_idx + 1
        logs = torch.tensor([acc, batch_interval, batch_comp_interval, batch_comm_interval])
        time_logs.write(str(logs) + '\n')
        time_logs.flush()
        # dist.gather(tensor=logs, dst=0, group=group)
    time_logs.close()
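# --- Reference sketch (assumption, not this repo's implementation) ----------
# Every worker variant above relies on MySGD.get_delta_w(), which returns the
# per-layer step lr * grad so the worker can ship the update to the server
# instead of (or before) applying it locally. A minimal sketch of that
# behaviour on top of torch.optim.SGD; the class name is hypothetical:
import torch

class MySGDSketch(torch.optim.SGD):
    def get_delta_w(self):
        """Return lr * grad for each parameter, in parameter order."""
        delta_ws = []
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                delta_ws.append(group['lr'] * p.grad.data.clone())
        return delta_ws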
def run(workers, models, save_path, train_data_list, test_data, iterations_epoch):
    workers_num = len(workers)
    print('Model received successfully!')

    optimizers_list = []
    if args.lr == 0.0:
        if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
            learning_rate = 0.1
        else:
            learning_rate = 0.01
    else:
        learning_rate = args.lr
    for i in workers:
        optimizer = MySGD(models[i].parameters(), lr=learning_rate)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 50
    else:
        decay_period = 1000

    print('Begin!')
    # store (train loss, energy, iterations)
    trainloss_file = './trainloss' + args.model + '.txt'
    if os.path.isfile(trainloss_file):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    log_file = args.model + 'log.txt'
    if os.path.isfile(log_file):
        os.remove(log_file)
    f_log = open(log_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i - 1]))

    epoch_train_loss = 0.0
    total_time = 0.0
    total_pulling_ratio = 0.0
    epoch_avg_pull_ratio = 0.0
    clock_epoch = 0
    test_loss = 0
    test_acc = 0
    for iteration in range(args.epochs * iterations_epoch):
        clock_epoch += 1
        iteration_loss = 0.0
        epoch = int((iteration + 1) / iterations_epoch)
        for i in workers:
            models[i].train()

        g_list = []
        for i in workers:
            try:
                data, target = next(train_data_iter_list[i - 1])
            except StopIteration:
                train_data_iter_list[i - 1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i - 1])
            data, target = Variable(data), Variable(target)
            optimizers_list[i - 1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizers_list[i - 1].get_delta_w()
            g_list.append(delta_ws)
            iteration_loss += loss.data.item() / workers_num
        epoch_train_loss += iteration_loss

        g_q_list = []
        for g in g_list:
            g_quantization, compression_ratio = quantization(g, args.bit)
            g_q_list.append(g_quantization)

        # Synchronization
        g_avg = []
        for p_idx, param in enumerate(models[0].parameters()):
            global_update_layer = torch.zeros_like(param.data)
            for w in workers:
                global_update_layer += g_q_list[w - 1][p_idx]
            tensor = global_update_layer / workers_num
            g_avg.append(tensor)
            param.data -= tensor

        pull_workers = 0
        pull_workers_list = pull_judge(workers_num, args.ratio)
        for w in workers:
            isPulling = w in pull_workers_list
            if isPulling:
                pull_workers += 1
            for p_idx, param in enumerate(models[0].parameters()):
                if isPulling:
                    list(models[w].parameters())[p_idx].data = param.data
                else:
                    list(models[w].parameters())[p_idx].data -= g_q_list[w - 1][p_idx]

        print('Epoch {}, Loss:{}'.format(epoch, loss.data.item()))
        total_pulling_ratio += pull_workers / workers_num
        epoch_avg_pull_ratio += pull_workers / workers_num
        f_log.write(str(args.this_rank) +
                    "\t" + str(iteration_loss) +
                    "\t" + str(epoch) +
                    "\t" + str(pull_workers / workers_num) +  # the ratio of pulling workers
                    "\t" + str(iteration) +
                    "\t" + str(pull_workers_list) +
                    '\n')
        f_log.flush()

        # log the train loss once per epoch
        if iteration % iterations_epoch == 0:
            # run the test after training
            if iteration % (2 * iterations_epoch) == 0:
                test_loss, test_acc = test_model(0, models[0], test_data, criterion=criterion)
            f_trainloss.write(str(args.this_rank) +
                              "\t" + str(epoch_train_loss / float(clock_epoch)) +
                              "\t" + str(test_loss) +
                              "\t" + str(test_acc) +
                              "\t" + str(total_pulling_ratio) +  # accumulated pulling ratio of workers
                              "\t" + str(epoch) +
                              "\t" + str(epoch_avg_pull_ratio / clock_epoch) +  # avg ratio of pulling workers in an epoch
                              "\t" + str(iteration) +
                              "\t" + str(total_time) +  # time
                              '\n')
            f_trainloss.flush()
            epoch_train_loss = 0.0
            epoch_avg_pull_ratio = 0.0
            clock_epoch = 0

        for i in workers:
            if (epoch + 1) % decay_period == 0:
                for param_group in optimizers_list[i - 1].param_groups:
                    param_group['lr'] *= 0.1
                    print('LR Decreased! Now: {}'.format(param_group['lr']))
    f_log.close()
    f_trainloss.close()
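# --- Reference sketch (assumption, not this repo's implementation) ----------
# `quantization(g, bit)` and `pull_judge(workers_num, ratio)` are imported from
# elsewhere. From the call sites above, quantization plausibly maps each
# gradient layer onto a (2**bit)-level uniform grid and reports a compression
# ratio, while pull_judge samples which workers pull the global model this
# round. Both sketches below are hypothetical:
import random
import torch

def quantization_sketch(g_list, bit):
    levels = 2 ** bit - 1
    g_q = []
    for g in g_list:
        scale = g.abs().max()
        if scale == 0:
            g_q.append(torch.zeros_like(g))
            continue
        g_q.append(torch.round(g / scale * levels) / levels * scale)  # uniform grid
    compression_ratio = bit / 32.0  # relative to fp32
    return g_q, compression_ratio

def pull_judge_sketch(workers_num, ratio):
    k = max(1, int(workers_num * ratio))
    return random.sample(range(1, workers_num + 1), k)  # worker ranks start at 1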
def run(rank, workers, model, save_path, train_data, test_data, global_lr):
    # Get the initial model from the server.
    print(workers)
    # Note: `[...].append(0)` returns None, so build the group list explicitly.
    _group = [w for w in workers] + [0]
    group = dist.new_group(_group)

    for p in model.parameters():
        tmp_p = torch.zeros_like(p)
        dist.scatter(tensor=tmp_p, src=0, group=group)
        p.data = tmp_p
    print('Model received successfully!')

    temp_lr = global_lr.get()
    # both branches of the original model check used the same lr, so one optimizer suffices
    optimizer = MySGD(model.parameters(), lr=temp_lr)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    print('Begin!')
    # the state that will be shared with the sender thread
    model_cache = [p.data + 0.0 for p in model.parameters()]
    global_update = [torch.zeros_like(p) for p in model.parameters()]
    local_update = [torch.zeros_like(p) for p in model.parameters()]
    it_count = Value(c_float, 0.)  # counts local updates within an iteration
    data_lock = Lock()
    update_lock = Queue()
    update_lock.put(1)
    loss_t = torch.tensor(0.0)
    receive_end = Value(c_bool, False)
    batch_communication_interval = Value(c_float, 0.0)
    stale_in_iteration = Value(c_float, 0.)
    sender_td = Thread(target=sender,
                       args=(model_cache, global_update, local_update, it_count,
                             loss_t, update_lock, data_lock, group, receive_end,
                             batch_communication_interval, stale_in_iteration,),
                       daemon=True)
    sender_td.start()

    time_logs = open("./record" + str(rank), 'w')
    osp_logs = open("./log" + str(rank), 'w')
    Stale_Threshold = args.stale_threshold
    for epoch in range(args.epochs):
        batch_interval = 0.0
        batch_comp_interval = 0.0
        s_time = time.time()
        model.train()

        # The learning rate is decayed on the server because the local worker
        # and the server update at different speeds.
        if not global_lr.empty():
            g_lr = global_lr.get()
            if args.model == 'AlexNet':
                for param_group in optimizer.param_groups:
                    param_group['lr'] = g_lr
                    print('LR Decreased! Now: {}'.format(param_group['lr']))

        for batch_idx, (data, target) in enumerate(train_data):
            batch_start_time = time.time()
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizer.get_delta_w()
            optimizer.step()

            # Aggregate the local update
            data_lock.acquire()
            loss_t.data += loss.data  # aggregate the loss
            it_count.value += 1
            for g_idx, update in enumerate(local_update):
                update.data += delta_ws[g_idx].data
            data_lock.release()
            batch_computation_time = time.time()

            # Open the lock once the local update holds at least one gradient
            if it_count.value == 1:
                update_lock.put(1)
            while it_count.value >= Stale_Threshold:
                pass
            if receive_end.value:
                receive_end.value = False
                for idx, param in enumerate(model.parameters()):
                    param.data = model_cache[idx]  # without the local update
                    # param.data = model_cache[idx] - global_update[idx]  # with the local update

            batch_end_time = time.time()
            batch_interval += batch_end_time - batch_start_time
            batch_comp_interval += batch_computation_time - batch_start_time
            osp_logs.write(str(batch_end_time - batch_start_time) + "\t" +
                           str(batch_computation_time - batch_start_time) + "\n")
            osp_logs.flush()

        print('Rank {}, Epoch {}, Loss:{}'.format(rank, epoch, loss.data.item()))
        e_time = time.time()
        # run the test after the epoch finishes
        # test_loss, acc = test_model(rank, model, test_data, criterion=criterion)
        acc = 0.0
        # divide by the batch count (batch_idx + 1)
        batch_interval /= batch_idx + 1
        batch_comp_interval /= batch_idx + 1
        logs = torch.tensor([acc, batch_interval, batch_comp_interval,
                             batch_communication_interval.value, stale_in_iteration.value])
        time_logs.write(str(logs) + '\n')
        time_logs.flush()
        # dist.gather(tensor=logs, dst=0, group=group)
    time_logs.close()
    sender_td.join()
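# --- Reference sketch (assumption, not this repo's implementation) ----------
# The `sender` thread started above is defined elsewhere. Given the shared
# state it receives, a plausible skeleton is: wait on update_lock, push the
# aggregated local_update and loss to the server, pull the new global model
# into model_cache, and flag receive_end. Every detail below (ordering, the
# use of gather/scatter, the staleness bookkeeping) is an assumption:
import torch.distributed as dist

def sender_sketch(model_cache, global_update, local_update, it_count, loss_t,
                  update_lock, data_lock, group, receive_end,
                  batch_communication_interval, stale_in_iteration):
    while True:
        update_lock.get()  # wait until at least one local step exists
        data_lock.acquire()
        dist.gather(loss_t.data, dst=0, group=group)  # push the aggregated loss
        for update in local_update:
            dist.gather(tensor=update.data, dst=0, group=group)  # push the update
            update.data.zero_()
        stale_in_iteration.value = it_count.value  # record staleness
        it_count.value = 0
        loss_t.data.zero_()
        data_lock.release()
        for cache in model_cache:
            dist.scatter(tensor=cache.data, src=0, group=group)  # pull the model
        receive_end.value = True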
def run(workers, models, save_path, train_data_list, test_data, ntokens, train_batch_size):
    workers_num = len(workers)
    print('Model received successfully!')

    optimizers_list = []
    if args.lr == 0.0:
        if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
            learning_rate = 0.1
        else:
            learning_rate = 0.01
    else:
        learning_rate = args.lr
    for i in workers:
        optimizer = MySGD(models[i].parameters(), lr=learning_rate)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 500
    else:
        decay_period = 200

    data_save = pd.DataFrame(columns=['Training Round', 'Training Loss',
                                      'Training Perplexity', 'Test Loss',
                                      'Test Perplexity'])
    print('Begin!')
    # store (train loss, energy, iterations)
    trainloss_file = args.save_path + '/trainloss' + args.model + '.txt'
    if os.path.isfile(trainloss_file):
        os.remove(trainloss_file)  # remove an existing file with the same name
    f_trainloss = open(trainloss_file, 'a')

    iterations_num_epoch = 0
    sequence_iter = range(0, train_data_list[workers_num - 1].size(0) - 1, args.bptt)
    hidden_list = []
    for i in workers:
        hidden = models[i].init_hidden(train_batch_size)
        hidden_list.append(hidden)

    gamma = args.gamma
    first_label = True
    epoch_train_loss = 0.0
    test_loss = 10.0
    g_list = []
    for i in workers:
        g_temp = [torch.zeros_like(p.data) for p in models[0].parameters()]
        g_list.append(g_temp)

    it_count = 0
    s_time = time.time()
    epoch_loss = []
    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        for i in workers:
            models[i].train()
        iterations_epoch = 0
        user_loss = []
        for j in workers:
            sequence_iter = range(0, train_data_list[j - 1].size(0) - 1, args.bptt)
            batch_loss = []
            for batch, i in enumerate(sequence_iter):
                it_count += 1
                iterations_epoch += 1
                iteration_loss = 0.0
                print('Start Batch {} / {}'.format(batch, len(sequence_iter)))
                data, targets = get_batch(train_data_list[j - 1], i)
                # Starting each batch, we detach the hidden state from how it
                # was previously produced. If we didn't, the model would try
                # backpropagating all the way to the start of the dataset.
                hidden_list[j - 1] = repackage_hidden(hidden_list[j - 1])
                optimizers_list[j - 1].zero_grad()
                output, hidden_list[j - 1] = models[j](data, hidden_list[j - 1])
                loss = criterion(output.view(-1, ntokens), targets)
                loss.backward()
                # `clip_grad_norm_` helps prevent the exploding gradient
                # problem in RNNs / LSTMs.
                clip_grad_norm_(models[j].parameters(), 0.25)
                delta_ws = optimizers_list[j - 1].get_delta_w()
                # update the local model and cache the gradient into the list
                for p_layer_idx, p_layer_temp in enumerate(models[j].parameters()):
                    p_layer_temp.data -= delta_ws[p_layer_idx]
                    g_list[j - 1][p_layer_idx].data += delta_ws[p_layer_idx]
                iteration_loss += loss.data.item()
                batch_loss.append(loss.data.item())
            # average loss of worker j in the current round
            user_loss.append(sum(batch_loss) / len(batch_loss))
            epoch_train_loss += iteration_loss / workers_num
        epoch_loss.append(sum(user_loss) / len(user_loss))

        if epoch % args.K == 0:
            # Synchronization
            for p_idx, param in enumerate(models[0].parameters()):
                # on each worker: update the local model with the pulled
                # global model and the local update
                for w in workers:
                    if args.type == 'LOSP':
                        list(models[w].parameters())[p_idx].data = \
                            param.data - gamma * g_list[w - 1][p_idx]
                    elif args.type == 'OSP':
                        list(models[w].parameters())[p_idx].data = \
                            param.data + torch.zeros_like(param.data)
                # on the server: update the global parameter with the
                # average of all updates
                for w in workers:
                    param.data -= g_list[w - 1][p_idx] / workers_num
                # on each worker: pull the averaged global model (K-AVG)
                for w in workers:
                    if args.type == 'KAVG':
                        list(models[w].parameters())[p_idx].data = \
                            param.data + torch.zeros_like(param.data)
            g_list = []
            for w in workers:
                g_temp = [torch.zeros_like(p.data) for p in models[0].parameters()]
                g_list.append(g_temp)

        e_time = time.time()
        # train loss every epoch
        print('Epoch {}, Loss:{}'.format(epoch, loss.data.item()))
        # run the test after training
        if epoch % 5 == 0:
            # Run on the test data.
            train_loss = sum(epoch_loss) / len(epoch_loss)
            epoch_loss = []
            test_loss = evaluate(models[0], ntokens, 10, test_data, criterion=criterion)
            # DataFrame.append returns a new frame, so keep the result
            # (pd.concat is the replacement in newer pandas)
            data_save = data_save.append([{'Training Round': epoch,
                                           'Training Loss': train_loss,
                                           'Training Perplexity': math.exp(train_loss),
                                           'Test Loss': test_loss,
                                           'Test Perplexity': math.exp(test_loss)}])
            data_save.to_csv('PTB_data.csv')
            print("test_loss:", test_loss)
        f_trainloss.write(str(args.this_rank) +
                          "\t" + str(epoch_train_loss / float(iterations_epoch)) +
                          "\t" + str(args.K) +
                          "\t" + str(e_time - epoch_start_time) +  # one epoch time
                          "\t" + str(iterations_epoch) +
                          "\t" + str(math.exp(test_loss)) +  # perplexity
                          "\t" + str(test_loss) +
                          "\t" + str(e_time - s_time) +  # total time
                          "\t" + str(0) +  # placeholder: per-iteration computation time
                          "\t" + str(it_count) +  # local iterations
                          "\t" + str(it_count / args.K) +  # global iterations
                          "\t" + str(epoch) +
                          '\n')
        f_trainloss.flush()
        epoch_train_loss = 0.0

        # Decay gamma at the specified epoch; the schedule can be customized.
        if (epoch + 1) > args.gamma_decay_epoch:
            if first_label:
                gamma = 0.01
                first_label = False
        if (epoch + 1) % decay_period == 0:
            for i in workers:
                for param_group in optimizers_list[i - 1].param_groups:
                    param_group['lr'] *= 0.1
                    print('LR Decreased! Now: {}'.format(param_group['lr']))
    f_trainloss.close()
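# --- Reference sketch (assumption, not this repo's implementation) ----------
# `get_batch` and `repackage_hidden` are imported from elsewhere; they follow
# the standard PyTorch word-language-model example: slice a (sequence, batch)
# tensor into bptt-length chunks, and detach the hidden state so backprop
# stops at the batch boundary. The names with the `_sketch` suffix are ours:
import torch

def repackage_hidden_sketch(h):
    """Wrap hidden states in new tensors detached from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden_sketch(v) for v in h)

def get_batch_sketch(source, i, bptt=35):
    """Return the bptt-length input slice starting at i and its shifted target."""
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target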
def run(workers, models, save_path, train_data_list, test_data, iterations_epoch):
    dev = torch.device('cuda')
    cpu = torch.device('cpu')

    param_num = 0
    for p in models[0].parameters():
        tmp_p = torch.zeros_like(p)
        param_num += torch.numel(tmp_p)
    models[0] = models[0].cuda(dev)
    for i in workers:
        models[i] = models[i].cuda(dev)
    workers_num = len(workers)
    print('Model received successfully!')

    compression_num = int(param_num * args.ratio)
    compression_num = compression_num if compression_num > 0 else 1

    optimizers_list = []
    for i in workers:
        optimizer = MySGD(models[i].parameters(), lr=args.lr)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    elif args.model in ['Abalone', 'Bodyfat', 'Housing']:
        criterion = torch.nn.MSELoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 10000
    else:
        decay_period = 1000000

    print('Begin!')
    # store (train loss, energy, iterations)
    # naming rule: title + model_name + number_of_workers
    trainloss_file = './../result/' + args.title + '_' + args.model + '_' + str(args.workers) + '.txt'
    if os.path.isfile(trainloss_file):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i - 1]))

    global_clock = 0
    ratio = args.ratio  # a local copy, so the decay at the bottom takes effect
    g_remain_list = []
    for i in workers:
        g_remain = [torch.zeros_like(param.data) for param in models[i].parameters()]
        g_remain_list.append(g_remain)

    for epoch in range(args.epochs):
        iteration_loss = 0.0
        g_change_average = [torch.zeros_like(param.data).cuda(dev)
                            for param in models[0].parameters()]
        global_clock += 1
        for i in workers:
            try:
                data, target = next(train_data_iter_list[i - 1])
            except StopIteration:
                train_data_iter_list[i - 1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i - 1])
            data, target = Variable(data).cuda(dev), Variable(target).cuda(dev)
            optimizers_list[i - 1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizers_list[i - 1].get_delta_w()
            iteration_loss += loss.data.item() / workers_num

            g_remain_list[i - 1], g_large_change = get_upload(
                g_remain_list[i - 1], delta_ws, ratio, args.isCompensate, dev)
            # synchronous update
            for g_change_layer_idx, g_change_layer in enumerate(g_change_average):
                g_change_layer.data += g_large_change[g_change_layer_idx].data / workers_num

        # Synchronization
        for p_idx, param in enumerate(models[0].parameters()):
            param.data -= g_change_average[p_idx].data
            for w in workers:
                list(models[w].parameters())[p_idx].data = param.data

        print('Epoch {}, Loss:{}'.format(epoch, loss.data.item()))
        f_trainloss.write(str(epoch) + '\t' + str(global_clock) + '\t' +
                          str(iteration_loss) + '\t' + str(ratio) + '\n')
        f_trainloss.flush()

        # decay the sparsification ratio at the specified epochs (iterations)
        if (epoch + 1) in [250000]:
            ratio = ratio * 0.1
            print('--------------------------------')
            print(ratio)
    f_trainloss.close()
def run(workers, models, save_path, train_data_list, test_data, iterations_epoch):
    dev = torch.device('cuda')
    cpu = torch.device('cpu')
    start_time = time.time()

    models[0] = models[0].cuda(dev)
    for i in workers:
        models[i] = models[i].cuda(dev)
    workers_num = len(workers)
    print('Model received successfully!')

    optimizers_list = []
    for i in workers:
        optimizer = MySGD(models[i].parameters(), lr=args.lr)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 10000
    else:
        decay_period = 1000000

    print('Begin!')
    # the Byzantine workers are the first several ranks in the worker list
    byzantine_workers_list = [w + 1 for w in range(args.byzantine)]

    # cache the g_old_num most recent gradients per worker
    g_old_num = args.loops
    g_old_list = []
    for i in workers:
        worker_g_old_list = [[torch.zeros_like(param.data).cuda(dev)
                              for param in models[0].parameters()]
                             for _ in range(g_old_num)]
        g_old_list.append(worker_g_old_list)
    g_old_count = 0

    global_g = [torch.zeros_like(param.data).cuda(dev) for param in models[0].parameters()]

    # store (train loss, energy, iterations)
    # naming rule: title + method + model_name + hyper-parameters + number_of_workers
    trainloss_file = './mytopk' \
        + args.title \
        + '_' + args.method \
        + '_' + args.model \
        + '_B' + str(args.byzantine) \
        + '_V' + str(int(args.V)) \
        + '_E' + str(args.loops) \
        + '_R' + str(int(args.ratio * 1000)) \
        + '_al' + str(int(args.alpha * 1000)) \
        + '_be' + str(int(args.beta * 1000)) \
        + '_W' + str(args.workers) + '.txt'
    if os.path.isfile(trainloss_file):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i - 1]))

    epoch_train_loss = 0.0
    global_clock = 0
    g_remain_list = []
    h_remain_list = []
    h_last_list = []
    ratio = args.ratio
    threshold = 0.
    g_change_list = []
    for i in workers:
        g_change_list.append([torch.zeros_like(param.data).cuda(dev)
                              for param in models[0].parameters()])
        h_remain_list.append([torch.zeros_like(param.data).cuda(dev)
                              for param in models[0].parameters()])
        h_last_list.append([torch.zeros_like(param.data).cuda(dev)
                            for param in models[0].parameters()])

    for epoch in range(args.epochs):
        iteration_loss = 0.0
        global_clock += 1
        g_change_average_list = [[] for _ in range(workers_num)]
        for i in workers:
            try:
                data, target = next(train_data_iter_list[i - 1])
            except StopIteration:
                train_data_iter_list[i - 1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i - 1])
            data, target = Variable(data).cuda(dev), Variable(target).cuda(dev)
            optimizers_list[i - 1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizers_list[i - 1].get_delta_w()
            iteration_loss += loss.data.item() / workers_num

            # update the list of old gradients
            g_new = []
            for layer_g in delta_ws:
                layer_g_tmp = torch.zeros_like(layer_g).cuda(dev)
                layer_g_tmp += layer_g
                g_new.append(layer_g_tmp)
            g_old_list[i - 1].append(g_new)  # cache the new gradient
            g_old_list[i - 1].pop(0)
            g_old_count = min(g_old_count + 1, g_old_num)  # number of cached gradients

            if global_clock == 1:
                g_remain = [torch.zeros_like(g_layer).cuda(dev) + g_layer for g_layer in delta_ws]
                g_remain_list.append(g_remain)
                # Synchronous update: in the first iteration the gradient
                # change is the gradient itself.
                for g_change_layer_idx, g_change_layer in enumerate(g_change_list[i - 1]):
                    g_change_layer.data += delta_ws[g_change_layer_idx].data
                    g_change_average_list[i - 1].append(g_change_layer.data)
                sparsification_ratio = 1.0
            else:
                # average the cached old gradients
                update_new = []
                for layer_idx, layer_g in enumerate(delta_ws):
                    layer_update_new_tmp = torch.zeros_like(layer_g).cuda(dev)
                    for g_old in g_old_list[i - 1]:
                        layer_update_new_tmp += g_old[layer_idx]
                    layer_update_new_tmp /= g_old_count
                    update_new.append(layer_update_new_tmp)
                new_g_avg = [torch.zeros_like(g_layer) + g_layer for g_layer in update_new]
                for idx, g_layer in enumerate(update_new):
                    new_g_avg[idx] += args.alpha * (h_last_list[i - 1][idx] - h_remain_list[i - 1][idx])
                g_remain, g_large_change = get_upload_topk(
                    g_remain_list[i - 1], new_g_avg, args.ratio, args.isCompensate, dev)
                g_remain_list[i - 1] = g_remain
                # top-k keeps a fixed fraction of the coordinates; set this
                # explicitly so the log line below stays defined after round 1
                sparsification_ratio = args.ratio
                # if i in byzantine_workers_list:
                #     g_large_change = byzantine_func(g_large_change, dev)
                h_remain_list[i - 1] = h_last_list[i - 1]
                h_last_list[i - 1] = [torch.zeros_like(g_layer) for g_layer in update_new]
                for idx, g_layer in enumerate(update_new):
                    h_last_list[i - 1][idx] = h_remain_list[i - 1][idx] * args.beta
                    h_last_list[i - 1][idx] -= (update_new[idx] - g_remain[idx])
                for g_change_layer_idx, g_change_layer in enumerate(g_change_list[i - 1]):
                    g_change_layer.data += g_large_change[g_change_layer_idx].data
                    # if i in byzantine_workers_list:
                    #     # inject Gaussian noise of scale args.V into the upload
                    #     g_change_layer.data += args.V * torch.randn_like(g_change_layer.data).data
                    g_change_average_list[i - 1].append(g_change_layer)

        # Synchronization: robust aggregation of the per-worker updates
        if args.method == "Mean":
            g_median = mean(g_change_average_list, workers, dev)
        elif args.method == "TrimmedMean":
            g_median = trimmed_mean(g_change_average_list, workers, args.byzantine, dev)
        elif args.method == "Median":
            g_median = median_defense(g_change_average_list, workers, dev)
        elif args.method == "FABA":
            g_median = FABA(g_change_average_list, workers, args.byzantine, dev)
        elif args.method == "Krum":
            g_median = Krum(g_change_average_list, workers, args.byzantine, dev)
        else:
            raise ValueError('Unknown aggregation method: {}'.format(args.method))

        g_quare_sum = 0.0  # for the threshold
        for p_idx, param in enumerate(models[0].parameters()):
            param.data -= g_median[p_idx].data
            for w in workers:
                list(models[w].parameters())[p_idx].data = \
                    param.data + torch.zeros_like(param.data).cuda(dev)
            g_quare_sum += torch.sum(g_median[p_idx].data * g_median[p_idx].data)
        g_quare_sum = torch.sqrt(g_quare_sum).cuda(dev)
        threshold = g_quare_sum.data.item()

        current_time = time.time() - start_time
        test_acc = 0
        if epoch % 50 >= 45:
            test_acc = test_model(0, models[1], test_data, dev)
        print('Epoch {}, Time:{}, Loss:{}'.format(epoch, current_time, iteration_loss))
        f_trainloss.write(str(epoch) + '\t' +
                          str(current_time) + '\t' +
                          str(iteration_loss) + '\t' +
                          str(sparsification_ratio) + '\t' +
                          str(test_acc) + '\n')
        f_trainloss.flush()
    f_trainloss.close()
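# --- Reference sketch (assumption, not this repo's implementation) ----------
# The robust aggregators (mean, trimmed_mean, median_defense, FABA, Krum) are
# imported from elsewhere. As one concrete example, a coordinate-wise median
# over the per-worker updates, matching the call signature used above, could
# look like this; the `_sketch` name is ours:
import torch

def median_defense_sketch(g_change_average_list, workers, dev):
    """Coordinate-wise median across workers, computed layer by layer."""
    aggregated = []
    layers_num = len(g_change_average_list[0])
    for p_idx in range(layers_num):
        stacked = torch.stack([g_change_average_list[w - 1][p_idx] for w in workers])
        aggregated.append(stacked.median(dim=0).values.to(dev))
    return aggregated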
def run(workers, models, save_path, train_data_list, test_data, iterations_epoch):
    workers_num = len(workers)
    print('Model received successfully!')

    optimizers_list = []
    for i in workers:
        if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
            optimizer = MySGD(models[i].parameters(), lr=0.1)
        else:
            optimizer = MySGD(models[i].parameters(), lr=0.01)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 500
    else:
        decay_period = 1000000

    print('Begin!')
    global_g = [torch.zeros_like(param.data) for param in models[0].parameters()]

    # store (train loss, energy, iterations)
    trainloss_file = './trainloss_oldsimu' + args.model + '_w15r1lr0.1.txt'
    if os.path.isfile(trainloss_file):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i - 1]))

    epoch_train_loss = 0.0
    global_clock = 0
    g_remain_list = []
    ratio = args.ratio
    threshold = 0.
    print("Begin for")
    for iteration in range(args.epochs * iterations_epoch):
        iteration_loss = 0.0
        g_list = []
        g_change_average = [torch.zeros_like(param.data) for param in models[0].parameters()]
        global_clock += 1
        for i in workers:
            try:
                data, target = next(train_data_iter_list[i - 1])
            except StopIteration:
                train_data_iter_list[i - 1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i - 1])
            data, target = Variable(data), Variable(target)
            optimizers_list[i - 1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizers_list[i - 1].get_delta_w()  # the local gradient
            g_list.append(delta_ws)
            iteration_loss += loss.data.item() / workers_num

            if global_clock == 1:
                # initialize the residuals in the first round
                g_remain = [torch.zeros_like(g_layer) + g_layer for g_layer in delta_ws]
                g_remain_list.append(g_remain)
                test_loss = str(0)  # placeholder until the first test runs
                test_acc = str(0)
                # Synchronous update: in the first iteration the gradient
                # change is the gradient itself.
                for g_change_layer_idx, g_change_layer in enumerate(g_change_average):
                    g_change_layer.data += delta_ws[g_change_layer_idx].data / workers_num
                sparsification_ratio = 1.0
            else:
                g_remain, g_large_change, sparsification_ratio = get_upload(
                    g_remain_list[i - 1], delta_ws, ratio, args.isCompensate, threshold)
                g_remain_list[i - 1] = g_remain  # update the residual kept for this worker
                # synchronous update
                for g_change_layer_idx, g_change_layer in enumerate(g_change_average):
                    g_change_layer.data += g_large_change[g_change_layer_idx].data / workers_num

        # Synchronization
        g_quare_sum = 0.0  # for the threshold on the server
        for p_idx, param in enumerate(models[0].parameters()):
            global_g[p_idx].data += g_change_average[p_idx].data  # average delta
            param.data -= global_g[p_idx].data
            for w in workers:
                # push the updated global model back to worker w
                list(models[w].parameters())[p_idx].data = param.data + torch.zeros_like(param.data)
            g_quare_sum += torch.sum(global_g[p_idx].data * global_g[p_idx].data)  # dense norm on the server
        g_quare_sum = torch.sqrt(g_quare_sum)
        threshold = g_quare_sum.data.item()

        epoch_train_loss += iteration_loss
        epoch = int(iteration / iterations_epoch)
        print('Epoch {}, Loss:{}'.format(epoch, loss.data.item()))

        if (iteration + 1) % iterations_epoch == 0:
            # run the test after the epoch finishes
            test_loss, test_acc = test_model(0, models[0], test_data, criterion=criterion)
            epoch_train_loss = 0.0
        f_trainloss.write(str(args.this_rank) +
                          "\t" + str(epoch_train_loss / float(iterations_epoch)) +
                          "\t" + str(iteration_loss) +
                          "\t" + str(0) +
                          "\t" + str(epoch) +
                          "\t" + str(0) +
                          "\t" + str(iteration) +
                          "\t" + str(sparsification_ratio) +  # sparsification ratio
                          "\t" + str(global_clock) +  # global clock
                          "\t" + str(test_loss) +  # test_loss
                          "\t" + str(test_acc) +  # test_acc
                          '\n')
        f_trainloss.flush()

        # decay the sparsification ratio at the specified epochs (iterations)
        if (iteration + 1) % iterations_epoch == 0:
            if (epoch + 1) in [0, 1000]:
                ratio = ratio * 0.1
                print('--------------------------------')
                print(ratio)
            for i in workers:
                models[i].train()
                if (epoch + 1) % decay_period == 0:
                    for param_group in optimizers_list[i - 1].param_groups:
                        param_group['lr'] *= 0.1
                        print('LR Decreased! Now: {}'.format(param_group['lr']))
    f_trainloss.close()
def run(rank, model, train_data, test_data, queue, param_q, stop_flag):
    # Get the initial model parameters from the parameter server
    while True:
        if not param_q.empty():
            param_dict = param_q.get()
            tmp = OrderedDict(map(lambda item: (item[0], torch.from_numpy(item[1])),
                                  param_dict.items()))
            model.load_state_dict(tmp)
            break
    print('Model received successfully!')

    if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
        optimizer = MySGD(model.parameters(), lr=0.1)
    else:
        optimizer = MySGD(model.parameters(), lr=0.01)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 50
    else:
        decay_period = 100

    print('Begin!')
    time_logs = open("./record" + str(rank), 'w')
    for epoch in range(int(args.epochs)):
        batch_interval = 0.0
        batch_comp_interval = 0.0
        batch_comm_interval = 0.0
        batch_push_interval = 0.0
        batch_pull_interval = 0.0
        model.train()

        # Decay the learning rate at the specified epochs
        if (epoch + 1) % decay_period == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.1
                print('LR Decreased! Now: {}'.format(param_group['lr']))

        epoch_train_loss = 0
        for batch_idx, (data, target) in enumerate(train_data):
            batch_start_time = time.time()
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizer.get_delta_w()
            batch_comp_time = time.time()

            # noinspection PyBroadException
            try:
                # catch the exception raised when the ps process stops
                if delta_ws:
                    queue.put({rank: [loss.data.numpy(), np.array(args.train_bsz), False]})
                    for delta in delta_ws:
                        dist.send(tensor=delta, dst=0)
                batch_push_time = time.time()
                for idx, param in enumerate(model.parameters()):
                    tmp_tensor = torch.zeros_like(param.data)
                    dist.recv(tensor=tmp_tensor, src=0)
                    param.data = tmp_tensor
                batch_tmp_time = time.time()
                batch_pull_time = time.time()
                # print('Rank {}, Epoch {}, Batch {}/{}, Loss:{}'
                #       .format(rank, epoch, batch_idx, len(train_data), loss.data[0]))
            except Exception as e:
                print(str(e))
                print('Should Stop: {}!'.format(stop_flag.value))
                break

            batch_interval += batch_pull_time - batch_start_time
            batch_comp_interval += batch_comp_time - batch_start_time
            batch_comm_interval += batch_pull_time - batch_comp_time
            batch_push_interval += batch_push_time - batch_comp_time
            batch_pull_interval += batch_pull_time - batch_push_time

            b_interval = batch_interval / (batch_idx + 1)
            b_comp_interval = batch_comp_interval / (batch_idx + 1)
            b_comm_interval = batch_comm_interval / (batch_idx + 1)
            b_push_interval = batch_push_interval / (batch_idx + 1)
            b_pull_interval = batch_pull_interval / (batch_idx + 1)
            logs = torch.tensor([0.0, b_interval, b_comp_interval, b_comm_interval,
                                 b_push_interval, b_pull_interval,
                                 batch_pull_time - batch_tmp_time])
            time_logs.write(str(logs) + '\n')
            time_logs.flush()

        # divide by the batch count (batch_idx + 1), matching the running averages above
        batch_interval /= batch_idx + 1
        batch_comp_interval /= batch_idx + 1
        batch_comm_interval /= batch_idx + 1
        batch_push_interval /= batch_idx + 1
        batch_pull_interval /= batch_idx + 1
        logs = torch.tensor([0.0, batch_interval, batch_comp_interval,
                             batch_comm_interval, batch_push_interval, batch_pull_interval])
        time_logs.write(str(epoch) + '\t' + str(logs) + '\n')
        time_logs.flush()

        # run the test after the epoch finishes
        print("test Model:", epoch)
        # test_model(rank, model, test_data, criterion=criterion)
        if stop_flag.value:
            break
    queue.put({rank: [[], [], True]})
    time_logs.close()
    print("Worker {} has completed epoch {}!".format(args.this_rank, epoch))