def __init__(self, optimizer, interval, name=None, use_locking=False):
    super(AdaptiveSGDOptimizer, self).__init__(optimizer,
                                               name,
                                               use_locking=use_locking)
    self._num_workers = current_cluster_size()
    self._rank = current_rank()
    self._step = tf.Variable(0, trainable=False, dtype=tf.int32)
    self._interval = interval
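# A plain-Python sketch of how a _step counter and an _interval are
# typically combined (illustrative only; the real AdaptiveSGDOptimizer
# logic is not shown in this snippet): a synchronisation action fires
# once every `interval` steps.
class StepGate(object):
    def __init__(self, interval):
        self._step = 0
        self._interval = interval

    def should_sync(self):
        """Return True once every `interval` calls."""
        self._step += 1
        return self._step % self._interval == 0

gate = StepGate(interval=3)
print([gate.should_sync() for _ in range(6)])
# -> [False, False, True, False, False, True]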
def train_mnist(sess, x, y_, train_op, test_op, optimizer, dataset,
                n_epochs=1, batch_size=5000):
    log_period = 100

    # get the cluster size
    n_shards = current_cluster_size()
    # get the cluster rank of the node
    shard_id = current_rank()

    # calculate the number of datapoints per node
    training_set_size = dataset['training_set']['x'].shape[0]
    shard_size = training_set_size // n_shards
    step_per_epoch = shard_size // batch_size
    n_steps = step_per_epoch * n_epochs
    print('step_per_epoch: %d, %d steps in total' % (step_per_epoch, n_steps))

    # KUNGFU: Each replica is responsible for a data shard.
    offset = batch_size * shard_id

    sess.run(tf.global_variables_initializer())

    # KUNGFU: The KungFu initializer defines how model weights are
    # initialised on distributed devices.
    if hasattr(optimizer, 'distributed_initializer'):
        sess.run(optimizer.distributed_initializer())

    print('training')
    # train the model with all batches allocated to the node
    for step in range(n_steps):
        xs = dataset['training_set']['x'][offset:offset + batch_size]
        y_s = dataset['training_set']['y'][offset:offset + batch_size]
        offset = (offset + batch_size * n_shards) % training_set_size
        sess.run(train_op, {
            x: xs,
            y_: y_s,
        })
        # log the training and validation accuracy
        if step % log_period == 0:
            training_acc_dataset = dict()
            training_acc_dataset['x'] = xs
            training_acc_dataset['y'] = y_s
            result = test_mnist(sess, x, y_, test_op, training_acc_dataset)
            print('training accuracy: %f' % result)
            result = test_mnist(sess, x, y_, test_op,
                                dataset['validation_set'])
            print('validation accuracy: %f' % result)
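# A small self-contained check of the sharding arithmetic used in
# train_mnist (pure Python, no TensorFlow needed): each replica starts
# batch_size * shard_id into the training set and advances by
# batch_size * n_shards per step, so replicas read disjoint batches.
def shard_offsets(training_set_size, n_shards, batch_size, shard_id, n_steps):
    offset = batch_size * shard_id
    offsets = []
    for _ in range(n_steps):
        offsets.append(offset)
        offset = (offset + batch_size * n_shards) % training_set_size
    return offsets

for shard_id in range(4):
    print(shard_id, shard_offsets(60000, 4, 5000, shard_id, 3))
# shard 0 reads offsets 0, 20000, 40000; shard 1 reads 5000, 25000, 45000; ...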
def apply_gradients(self, grads_and_vars, **kwargs):
    """Calls this same method on the underlying optimizer."""
    np, rank = current_cluster_size(), current_rank()
    target = get_random_peer(np, rank)
    variables = [v for _g, v in grads_and_vars]
    other_peer_vars, save_model_op = self._build_request_and_save_ops(
        target, variables)

    # average the local model with the model requested from a random peer
    assign_ops = [
        tf.assign(v, 0.5 * (v + other_v))
        for v, other_v in zip(variables, other_peer_vars)
    ]

    apply_op = self._optimizer.apply_gradients(grads_and_vars, **kwargs)

    # order the ops: average first, then apply gradients, then save the model
    with tf.control_dependencies(assign_ops):
        with tf.control_dependencies([apply_op]):
            with tf.control_dependencies([save_model_op]):
                return tf.group(apply_op)
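# A NumPy sketch of the pairwise model-averaging step built above
# (illustrative only, not the KungFu runtime): each worker replaces its
# weights with the mean of its own weights and those fetched from a
# randomly chosen peer.
import numpy as np  # note: the snippets above use `np` for the cluster size

def average_with_peer(local_vars, peer_vars):
    return [0.5 * (v + other) for v, other in zip(local_vars, peer_vars)]

local = [np.ones(3), np.zeros(2)]
peer = [np.zeros(3), np.ones(2)]
print(average_with_peer(local, peer))
# -> [array([0.5, 0.5, 0.5]), array([0.5, 0.5])]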
def train_model(model, dataset, n_epochs=1, batch_size=5000):
    n_shards = current_cluster_size()
    shard_id = current_rank()
    train_data_size = len(dataset['x_train'])

    # calculate the offset of the data shard for this KungFu node
    # (the offset must be a multiple of the shard size, not the batch size,
    # so that the shards of different nodes do not overlap)
    shard_size = train_data_size // n_shards
    offset = shard_size * shard_id

    # extract the data shard of this KungFu node
    x = dataset['x_train'][offset:offset + shard_size]
    y = dataset['y_train'][offset:offset + shard_size]

    # train the model
    model.fit(x,
              y,
              batch_size=batch_size,
              epochs=n_epochs,
              callbacks=[InitalizationCallback()],
              validation_data=(dataset['x_val'], dataset['y_val']),
              verbose=2)
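# A hedged usage sketch for train_model: the MLP and the synthetic dataset
# below are illustrative stand-ins (the dict layout x_train/y_train/x_val/
# y_val follows the slicing above), and the KungFu environment plus
# InitalizationCallback are assumed to be defined alongside train_model;
# wrapping the optimizer with a KungFu distributed optimizer is omitted.
import numpy as np
import tensorflow as tf

dataset = {
    'x_train': np.random.rand(10000, 28, 28).astype('float32'),
    'y_train': np.random.randint(0, 10, 10000),
    'x_val': np.random.rand(1000, 28, 28).astype('float32'),
    'y_val': np.random.randint(0, 10, 1000),
}

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(10, activation='softmax'),
])
model.compile(optimizer='sgd',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
train_model(model, dataset, n_epochs=1, batch_size=5000)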
def all_reduce_benchmark(sizes, dtype=tf.float32):
    xs = [tf.Variable(tf.ones([n], dtype)) for n in sizes]
    tot_size = sum(_tensor_size(x) for x in xs)
    np = current_cluster_size()
    # multiplier used to estimate the equivalent all-reduce data volume
    multiplier = 4 * (np - 1)
    print('all reduce total size: %s among %d peers' %
          (show_size(tot_size), np))

    ys = group_all_reduce(xs)
    init = tf.global_variables_initializer()

    warmup_steps = 5
    bench_steps = 10

    with tf.Session() as sess:
        sess.run(init)
        # warm up before timing
        for step in range(warmup_steps):
            sess.run(ys)
        for step in range(bench_steps):
            t0 = time.time()
            sess.run(ys)
            d = time.time() - t0
            print('step %d, took %.2fs, equivalent data rate: %s' %
                  (step, d, show_rate(tot_size * multiplier, d)))
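# A hedged usage sketch for all_reduce_benchmark: the tensor sizes below are
# illustrative, and the call assumes the script is started by the KungFu
# launcher (e.g. kungfu-run) so that current_cluster_size() reports the
# real number of peers.
if __name__ == '__main__':
    all_reduce_benchmark([1 << 10, 1 << 20, 4 << 20])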
def fake_get_shard_info(use_kungfu):
    if use_kungfu:
        from kungfu.ops import current_cluster_size, current_rank
        return current_rank(), current_cluster_size()
    return 0, 1
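# Example use of fake_get_shard_info: the same sharding code path runs with
# and without KungFu; without it, the caller sees rank 0 in a cluster of 1.
shard_id, n_shards = fake_get_shard_info(use_kungfu=False)
print('shard %d of %d' % (shard_id, n_shards))  # -> shard 0 of 1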
def show_info_example():
    rank = current_rank()
    np = current_cluster_size()
    print('rank=%d, np=%d' % (rank, np))
def test_peer_info():
    rank = current_rank()
    np = current_cluster_size()
    print('rank=%d, np=%d' % (rank, np))
    return gs


ckpt = tf.placeholder(tf.string)
new_size = tf.placeholder(tf.int32)
resize_op = resize_cluster(ckpt, new_size)

init = tf.global_variables_initializer()
# barrier_op = barrier()

with tf.Session() as sess:
    sess.run(init)

    init_gs = restore(get_init_checkpoint())
    np = current_cluster_size()
    init_np = get_cluster_size(init_gs, cluster_size_schedule, np)
    if np != init_np:
        print('[W] init cluster size (np=%d) is not consistent with schedule (np=%d)'
              % (np, init_np))
    print('restored from %d, np=%d, init_np=%d, start took %s' %
          (init_gs, np, init_np, show_duration(time.time() - t0)))

    for gs in range(init_gs, max_step):
        t0 = time.time()
        v = sess.run(y)
        print('step %d, result: %d, np=%d, took %s' %
              (gs, v, np, show_duration(time.time() - t0)))
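# A minimal sketch of the checkpoint helpers the fragment above relies on
# (their real definitions are not shown in this snippet, so these bodies are
# assumptions): the checkpoint is taken to be the global step encoded as a
# string, and the schedule a list of (begin_step, np) pairs.
def restore(checkpoint):
    gs = int(checkpoint)
    print('restored from checkpoint: gs=%d' % gs)
    return gs

def get_cluster_size(gs, schedule, default_np):
    # look up the cluster size scheduled for step gs; fall back to the
    # current size when the step is not covered by the schedule
    for begin, np in reversed(schedule):
        if gs >= begin:
            return np
    return default_np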
def __init__(self, optimizer, name=None, use_locking=False):
    super(SyncModelAveragingSGDOptimizer, self).__init__(optimizer,
                                                         name,
                                                         use_locking=use_locking)
    self._num_workers = current_cluster_size()
    self._rank = current_rank()