def get_accu_and_loss(ps, args):
    """Continuously evaluate the parameter server's weights on MNIST test data.

    Polls ``ps.get_begin`` until it returns a truthy value, then samples
    ``ps.get_weights`` roughly once per second. Each sample is queued with
    its elapsed time; once more than five seconds have elapsed, every
    iteration evaluates the oldest queued sample, prints the result, and
    rewrites the full history with ``np.save``.

    The sampling loop has no exit condition.

    Args:
        ps: Handle whose ``get_begin`` / ``get_weights`` methods are invoked
            through ``.remote()`` and resolved with ``ray.get``.
        args: Namespace providing ``save_dir``, ``num_workers`` and ``round``
            (also forwarded to ``model.SimpleCNN``).
    """
    net = model.SimpleCNN(args)
    mnist = model.download_mnist_retry(seed=1111)
    history = []
    pending = []

    # Spin until the parameter server reports that training has begun.
    while not ray.get(ps.get_begin.remote()):
        time.sleep(0.001)

    started = time.time()
    while True:
        elapsed = time.time() - started
        snapshot = ray.get(ps.get_weights.remote())
        pending.append((elapsed, snapshot))
        print('number of items in the cents', len(pending))
        time.sleep(1)

        if not elapsed > 5:
            continue

        # Evaluate the oldest queued snapshot (FIFO order).
        cent_time, cent = pending.pop(0)
        net.set_flat(cent)
        xs, xy = mnist.test.next_batch(10000)
        accu, loss = net.compute_accuracy_and_loss(xs, xy)
        print()
        print('centralized_time', cent_time, 'accu:', accu, 'loss:', loss)
        print()
        history.append((cent_time, accu, loss))

        # The whole history is re-saved on every evaluation.
        out_path = args.save_dir + 'centralized_num_worker%d, round %d' % (
            args.num_workers, args.round)
        np.save(out_path, np.array(history))
def __init__(self, worker_index, batch_size=50):
    """Store the worker settings, load MNIST and build a SimpleCNN.

    Args:
        worker_index: Identifier of this worker; also passed as ``seed`` to
            ``input_data.read_data_sets``.
        batch_size: Stored on the instance as ``self.batch_size``.
    """
    self.worker_index, self.batch_size = worker_index, batch_size
    # The dataset is read before the network is constructed, as in the
    # original statement order.
    dataset = input_data.read_data_sets(
        "MNIST_data", one_hot=True, seed=worker_index)
    self.mnist = dataset
    self.net = model.SimpleCNN()
def get_accu_and_loss(ps, args):
    """Evaluate the master weights on MNIST test data until ``args.stop_time``.

    About once per second the current master weights are fetched and queued
    together with the elapsed time. After more than five seconds have
    elapsed, each iteration also evaluates the oldest queued entry, prints
    the result, and rewrites the accumulated history with ``np.save``.

    Args:
        ps: Handle whose ``get_master_weight`` method is invoked through
            ``.remote()`` and resolved with ``ray.get``.
        args: Namespace providing ``stop_time``, ``save_dir``,
            ``num_workers``, ``k``, ``round``, ``net_lrn`` and ``lrns``
            (also forwarded to ``model.SimpleCNN``).
    """
    net = model.SimpleCNN(args)
    mnist = model.download_mnist_retry(seed=1111)
    t0 = time.time()
    records = []
    snapshots = []

    elapsed = time.time() - t0
    while elapsed < args.stop_time:
        fetched = ray.get(ps.get_master_weight.remote())
        snapshots.append((elapsed, fetched))

        if elapsed > 5:
            # Evaluate the oldest queued snapshot (FIFO order).
            stamp, flat = snapshots.pop(0)
            net.set_flat(flat)
            xs, xy = mnist.test.next_batch(2000)
            accu, loss = net.compute_accuracy_and_loss(xs, xy)
            print()
            print('master_time', stamp, 'accu:', accu, 'testing loss:', loss)
            print()
            records.append((stamp, accu, loss))

            # The whole history is re-saved on every evaluation.
            out_path = args.save_dir + 'federated_num_worker%d, k_%d, round_%d, net_lrn_%.6f, FL_lrn_%6f' % (
                args.num_workers, args.k, args.round, args.net_lrn,
                args.lrns[0])
            np.save(out_path, np.array(records))

        time.sleep(1)
        elapsed = time.time() - t0
def worker_task(ps, worker_index, batch_size=50):
    """Run an endless pull / compute / push training loop against ``ps``.

    Args:
        ps: Handle whose ``pull`` and ``push`` methods are invoked through
            ``.remote()``.
        worker_index: Passed as ``seed`` to ``model.download_mnist_retry``.
        batch_size: Number of training examples requested per step.
    """
    # Per-worker data source and local model replica.
    mnist = model.download_mnist_retry(seed=worker_index)
    net = model.SimpleCNN()
    weight_keys = net.get_weights()[0]

    while True:
        # Synchronise the local replica with the parameter server.
        latest = ray.get(ps.pull.remote(weight_keys))
        net.set_weights(weight_keys, latest)

        # Compute an update on one batch and send it back. The push is not
        # waited on.
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)
        update = net.compute_update(batch_xs, batch_ys)
        ps.push.remote(weight_keys, update)
def get_accu_and_loss(ps, args):
    """Evaluate the mean of all workers' weights on MNIST test data.

    First waits until ``ps.get_loss`` returns a collection containing no
    ``None``. Then, about once per second until ``args.stop_time`` has
    elapsed, the per-worker weights are fetched, averaged along axis 0 and
    queued with the elapsed time. After more than five seconds have elapsed,
    each iteration also evaluates the oldest queued average, prints the
    result, and rewrites the accumulated history with ``np.save``.

    Args:
        ps: Handle whose ``get_loss`` / ``get_weights_ids`` methods are
            invoked through ``.remote()`` and resolved with ``ray.get``.
        args: Namespace providing ``stop_time``, ``num_workers``,
            ``save_dir``, ``k``, ``round``, ``net_lrn``, ``lrns`` and ``a``
            (also forwarded to ``model.SimpleCNN``).
    """
    net = model.SimpleCNN(args)
    mnist = model.download_mnist_retry(seed=1111)

    # Before training starts, wait until every loss slot has been filled.
    while None in ray.get(ps.get_loss.remote()):
        time.sleep(0.0001)
    print("begin")
    t0 = time.time()

    records = []
    queued = []
    elapsed = time.time() - t0
    while elapsed < args.stop_time:
        refs = ray.get(ps.get_weights_ids.remote())
        per_worker = []
        for worker in range(args.num_workers):
            per_worker.append(ray.get(refs[worker]))
        center = np.mean(np.array(per_worker), axis=0)
        queued.append((elapsed, center))

        if elapsed > 5:
            # Evaluate the oldest queued average (FIFO order).
            cent_time, cent = queued.pop(0)
            net.set_flat(cent)
            xs, xy = mnist.test.next_batch(10000)
            accu, loss = net.compute_accuracy_and_loss(xs, xy)
            print()
            print('cent_time', cent_time, 'accu:', accu, 'testing loss:', loss)
            print()
            records.append((cent_time, accu, loss))

            # The whole history is re-saved on every evaluation.
            out_path = args.save_dir + 'flocking_num_worker%d, k_%d, round_%d, net_lrn_%.6f, node0_lrn_%6f, attraction_%.4f_center_v1' % (
                args.num_workers, args.k, args.round, args.net_lrn,
                args.lrns[0], args.a)
            np.save(out_path, np.array(records))

        time.sleep(1)
        elapsed = time.time() - t0
def main(args):
    """Start a parameter server and workers, then report accuracy forever.

    Args:
        args: Namespace providing ``num_workers``.
    """
    # Seed the parameter server with a freshly initialised network's weights.
    net = model.SimpleCNN()
    all_keys, all_values = net.get_weights()
    ps = ParameterServer.remote(all_keys, all_values)

    # Launch one training task per worker; the returned handles are kept
    # only so that they stay referenced.
    worker_tasks = []
    for worker_index in range(args.num_workers):
        worker_tasks.append(worker_task.remote(ps, worker_index))

    # Test data for the evaluation loop.
    mnist = model.download_mnist_retry()

    iteration = 0
    while True:
        # Load the server's current weights into the local model and score
        # it on one test batch.
        pulled = ray.get(ps.pull.remote(all_keys))
        net.set_weights(all_keys, pulled)
        test_xs, test_ys = mnist.test.next_batch(1000)
        accuracy = net.compute_accuracy(test_xs, test_ys)
        print("Iteration {}: accuracy is {}".format(iteration, accuracy))
        iteration += 1
        time.sleep(1)
def worker_task(ps, current_worker_index, args):
    """Train locally from the master weights and send back weight deltas.

    Each step loads the master weights, runs one ``net.minimize`` call on a
    training batch and passes ``new_weights - weights`` to
    ``ps.set_master_weight``. The loop ends when ``args.steps`` steps have
    run or ``args.stop_time`` seconds have elapsed.

    Args:
        ps: Handle whose ``get_master_weight`` / ``set_master_weight``
            methods are invoked through ``.remote()``.
        current_worker_index: Index used to look up this worker's entries in
            ``args.lrns``, ``args.batch_sizes`` and ``args.time_per_batch``.
        args: Namespace providing the fields above plus ``steps`` and
            ``stop_time``. ``args.lrn`` is overwritten with this worker's
            learning rate before the network is built.
    """
    idx = current_worker_index

    # Download MNIST.
    mnist = model.download_mnist_retry(seed=idx + 1)

    # Initialize the model with this worker's learning rate.
    args.lrn = args.lrns[idx]
    net = model.SimpleCNN(args)

    if idx == 1:
        # Worker 1 draws and scores one extra batch up front. The result is
        # not read again inside this function.
        xs, ys = mnist.train.next_batch(args.batch_sizes[idx])
        acc, loss = net.compute_accuracy_and_loss(xs, ys)
        stored_losses = [loss]

    step = 0
    start_time = time.time()
    pre_time = time.time()
    while step < args.steps and time.time() - start_time < args.stop_time:
        # Pace the loop: wait out whatever remains of this worker's
        # per-batch time budget since the previous step.
        remaining = args.time_per_batch[idx] - (time.time() - pre_time)
        time.sleep(max(0, remaining))
        pre_time = time.time()

        # Get the current weights from the parameter server.
        weights = ray.get(ps.get_master_weight.remote())
        net.set_flat(weights)

        # Compute an update and push it to the parameter server.
        xs, ys = mnist.train.next_batch(args.batch_sizes[idx])
        loss_value, new_weights = net.minimize(xs, ys)
        diff = new_weights - weights

        # Every worker except worker 0 waits an extra 0.1 s before pushing.
        if idx != 0:
            time.sleep(0.1)

        if step % 50 == 0:
            print("step", step, "current_worker_index", idx,
                  "elapsed time is", time.time() - start_time,
                  "loss is", loss_value)

        ps.set_master_weight.remote(diff)
        step += 1
def __init__(self, worker_index, args):
    """Store the worker settings, fetch MNIST and build a SimpleCNN.

    Args:
        worker_index: Identifier of this worker; also passed as ``seed`` to
            ``model.download_mnist_retry``.
        args: Namespace providing ``batch_size``; also forwarded to
            ``model.SimpleCNN``.
    """
    self.worker_index, self.batch_size = worker_index, args.batch_size
    # The dataset is fetched before the network is constructed, as in the
    # original statement order.
    dataset = model.download_mnist_retry(seed=worker_index)
    self.mnist = dataset
    self.net = model.SimpleCNN(args)
def __init__(self, args):
    """Build the network from ``args`` and clear the ``begin`` flag.

    Args:
        args: Forwarded unchanged to ``model.SimpleCNN``.
    """
    network = model.SimpleCNN(args)
    self.net = network
    # ``begin`` starts out False.
    self.begin = False
print('centralized_time', cent_time, 'accu:', accu, 'loss:', loss) print() value.append((cent_time, accu, loss)) np.save( args.save_dir + 'centralized_num_worker%d, round %d' % (args.num_workers, args.round), np.array(value)) if __name__ == "__main__": args = parser.parse_args() ray.init() args.save_dir = './centralized_log_%.1f/' % args.sleep_mean os.makedirs(args.save_dir) if not os.path.exists(args.save_dir) else None # Create a parameter server. net = model.SimpleCNN(args) ps = ParameterServer.remote(args) # Create workers. workers = [ Worker.remote(worker_index, args) for worker_index in range(args.num_workers) ] # Download MNIST. mnist = model.download_mnist_retry() i = 0 current_weights = ps.get_weights.remote() get_accu_and_loss.remote(ps, args) start_t = time.time()
# Inference script fragment: load one image, run it through the trained
# SimpleCNN and convert the output back to an 8-bit HWC image array.
# test_num = 100
args = sys.argv
img_file = args[1]

# Build the input with NumPy.
loaded_img = cv2.imread(img_file)
if loaded_img is None:
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a clear message rather than with a TypeError on the division.
    raise FileNotFoundError("could not read image file: " + str(img_file))
train_img = loaded_img / 255.0
# Move the channel axis first, then add a leading batch axis. The reshape
# requires the image to hold 3 * 128 * 128 values per batch entry.
train_img = np.transpose(train_img, (2, 0, 1))
x_np = np.reshape(train_img, (-1, 3, 128, 128))

# Convert the NumPy array into a tensor that PyTorch can work with.
x = torch.from_numpy(x_np).float()

# Instantiate the model defined in model.py and load its parameters.
# net = model.SimpleMLP()
net = model.SimpleCNN()
net.load_state_dict(
    torch.load("learning_result/parameters_epoch29",
               map_location=torch.device('cpu')))

if gpu_available:
    device = "cuda:" + str(gpu_number)
    net = net.to(device)
    # The input must live on the same device as the model, otherwise the
    # forward pass raises a device-mismatch error.
    x = x.to(device)
    print("cuda available")

y = net(x)
# .cpu() is a no-op for CPU tensors and is required before .numpy() when the
# output was produced on a CUDA device.
y = y.detach().cpu().numpy()

# Drop the batch axis, restore HWC order, clamp to [0, 1] and scale to uint8.
y = np.reshape(y, (3, 128, 128))
y = np.transpose(y, (1, 2, 0))
y = np.fmin(y, 1)
y = np.fmax(y, 0)
y = (y * 255).astype(np.uint8)
def __init__(self, learning_rate):
    """Build the network with the given learning rate.

    Args:
        learning_rate: Passed to ``model.SimpleCNN`` as the
            ``learning_rate`` keyword argument.
    """
    network = model.SimpleCNN(learning_rate=learning_rate)
    self.net = network
def worker_task(ps, current_worker_index, args):
    """Train one flocking worker and periodically save its weights.

    After a warm-up ``minimize`` call the worker loads its initial weights
    from the parameter server, reports its loss and waits until every
    worker has reported one. Each training step then runs ``net.minimize``
    on one batch, subtracts ``args.lrn`` times the flocking potential from
    the flat weights, and publishes the result to the parameter server. The
    loop ends when ``args.steps`` steps have run or ``args.stop_time``
    seconds have elapsed.

    Args:
        ps: Handle whose ``get_weights_ids``, ``set_weights_ids``,
            ``get_loss``, ``set_loss`` and ``get_graph`` methods are invoked
            through ``.remote()``.
        current_worker_index: Index used to look up this worker's entries in
            ``args.lrns``, ``args.batch_size`` and ``args.time_per_batch``,
            and its neighbour list in the graph.
        args: Namespace providing the fields above plus ``a``, ``steps``,
            ``stop_time``, ``save_dir``, ``num_workers``, ``k``, ``round``
            and ``net_lrn``. ``args.lrn`` is overwritten with this worker's
            learning rate before the network is built.
    """
    mnist = model.download_mnist_retry(seed=current_worker_index + 1)

    # Initialize the model.
    args.lrn = args.lrns[current_worker_index]
    net = model.SimpleCNN(args)

    # One warm-up step yields a loss to report; the weights it produced are
    # then replaced by the ones stored on the parameter server.
    xs, ys = mnist.train.next_batch(args.batch_size[current_worker_index])
    loss_value, _ = net.minimize(xs, ys)
    all_weights_ids = ray.get(ps.get_weights_ids.remote())
    new_weights = ray.get(all_weights_ids[current_worker_index])
    net.set_flat(new_weights)
    ps.set_loss.remote(current_worker_index, loss_value)

    # Before training starts, wait until every loss value is set, which
    # means all workers are ready.
    while True:
        losses = ray.get(ps.get_loss.remote())
        if None not in losses:
            print("begin")
            start_time = time.time()
            break
        else:
            time.sleep(0.0001)

    flocking_group = ray.get(ps.get_graph.remote())[current_worker_index]
    step = 0

    def get_flocking_potential(weights):
        """Return ``args.a`` times the summed offsets from each neighbour."""
        all_weights_ids = ray.get(ps.get_weights_ids.remote())
        flocking_dis = []
        for fw in flocking_group:
            w = ray.get(all_weights_ids[fw])
            # Check whether there is NaN in the weights (debugging aid):
            # if np.isnan(np.min(w)):
            #     print('\n\n\n\n\n\n\n\n\n\nthere is nan in weights')
            #     print(ray.get(all_weights_ids[fw]))
            #     print('fw is', fw)
            #     print(weights)
            #     print('current_worker_index is', current_worker_index)
            #     return
            flocking_dis.append(weights - w)
        return np.sum(np.array(flocking_dis), axis=0) * args.a

    start_time = time.time()
    pre_time = time.time()
    next_weigth_save_time = start_time
    while step < args.steps and time.time() - start_time < args.stop_time:
        # Pace the loop: wait out whatever remains of this worker's
        # per-batch time budget since the previous step.
        time.sleep(
            max(
                0, args.time_per_batch[current_worker_index] -
                (time.time() - pre_time)))
        pre_time = time.time()

        xs, ys = mnist.train.next_batch(args.batch_size[current_worker_index])
        loss_value, new_weights = net.minimize(xs, ys)
        ps.set_loss.remote(current_worker_index, loss_value)

        # Apply the flocking term on top of the gradient step.
        weights = new_weights
        f_p = get_flocking_potential(weights)
        new_weights = net.get_flat()
        new_weights -= args.lrn * f_p
        net.set_flat(new_weights)

        # The reference is wrapped in a list when handed to the server.
        # NOTE(review): presumably so the server stores the reference itself
        # rather than the resolved array — confirm against ParameterServer.
        weights_id = ray.put(new_weights)
        ps.set_weights_ids.remote(current_worker_index, [weights_id])
        step += 1

        # if step % 100 == 0 and current_worker_index == 0:
        if step % 100 == 1:
            print('step', step, 'current_worker_index', current_worker_index,
                  'elapsed_time',
                  time.time() - start_time, 'training loss is', loss_value)

        save = True
        if save:
            os.makedirs(args.save_dir + "saved_weight/", exist_ok=True)
            if time.time() > next_weigth_save_time:
                saved_weight = [time.time() - start_time, new_weights]
                # [float, ndarray] is a ragged sequence: without an explicit
                # object dtype, NumPy >= 1.24 raises ValueError here.
                np.save(
                    args.save_dir +
                    'saved_weight/flocking_num_worker%d, k_%d, round_%d, net_lrn_%.6f, node0_lrn_%6f, attraction_%.4f_worker_%d_time_%.2f'
                    % (args.num_workers, args.k, args.round, args.net_lrn,
                       args.lrns[0], args.a, current_worker_index,
                       time.time() - start_time),
                    np.array(saved_weight, dtype=object))
                next_weigth_save_time = time.time() + get_sleep_time(
                    time.time() - start_time)
# Driver fragment: configure per-worker settings, seed the parameter server
# with identical initial weights, then launch the workers and the evaluator.
ray.init(num_cpus=args.num_workers + 1)
args.save_dir = './network_regularized_log/'

# Per-worker step duration in seconds: 1 for everyone, 0.125 for worker 0.
args.time_per_batch = {i: 1 for i in range(args.num_workers)}
args.time_per_batch[0] = 0.125

args.lrns = {i: args.net_lrn for i in range(args.num_workers)}
# if args.personal_lrn:
#     args.lrns[0] = args.net_lrn/8
print(f'learning rates are {args.lrns}')

# Per-worker batch size: 64 for everyone, 1 for worker 0.
args.batch_size = {i: 64 for i in range(args.num_workers)}
args.batch_size[0] = 1

# exist_ok avoids the check-then-create race of a separate existence test.
os.makedirs(args.save_dir, exist_ok=True)

# NOTE: tmp_args is an alias of args, not a copy, so this also sets args.lrn.
tmp_args = args
tmp_args.lrn = args.net_lrn
net = model.SimpleCNN(tmp_args)
init_weight = net.get_flat()
print('\n\n shape is', init_weight.shape, '\n\n')

# Every worker starts from the same initial weights; each gets its own
# object-store entry.
weights = [init_weight for _ in range(args.num_workers)]
weights_ids = [ray.put(w) for w in weights]

graph = construct_graph_watts(args.num_workers,
                              args.k,
                              seed=args.graph_seed)
print(graph)

ps = ParameterServer.remote(num_workers=args.num_workers,
                            weights_ids=weights_ids,
                            graph=graph)
worker_tasks = [
    worker_task.remote(ps, i, args) for i in range(args.num_workers)
]
get_accu_and_loss.remote(ps, args)