def __init__(self, optimizer, interval, name=None, use_locking=False):
    """Wrap `optimizer` and record cluster information for this worker.

    Args:
        optimizer: The optimizer being wrapped; forwarded to the parent
            constructor.
        interval: Stored unchanged on the instance as `_interval`.
            NOTE(review): presumably the number of steps between
            synchronisation rounds -- confirm against the code that reads
            `self._interval`.
        name: Optional name forwarded to the parent constructor.
        use_locking: Forwarded to the parent constructor.
    """
    parent = super(AdaptiveSGDOptimizer, self)
    parent.__init__(optimizer, name, use_locking=use_locking)

    # Cluster size and rank are queried once, at construction time.
    cluster_size = current_cluster_size()
    self._num_workers = cluster_size
    worker_rank = current_rank()
    self._rank = worker_rank

    # Non-trainable int32 counter, initialised to zero.
    step_counter = tf.Variable(0, trainable=False, dtype=tf.int32)
    self._step = step_counter

    self._interval = interval
def train_mnist(sess, x, y_, train_op, test_op, optimizer, dataset, n_epochs=1, batch_size=5000):
    """Run the MNIST training loop for this worker and log accuracy.

    Args:
        sess: Session used to run the initializers and `train_op`.
        x: Feed key for the input batch.
        y_: Feed key for the label batch.
        train_op: Op run once per training step.
        test_op: Op handed to `test_mnist` when measuring accuracy.
        optimizer: If it exposes `distributed_initializer`, the op that
            method returns is run once after the global initializer.
        dataset: Mapping with a 'training_set' entry (holding 'x' and 'y')
            and a 'validation_set' entry.
        n_epochs: Number of passes over this worker's share of the data.
        batch_size: Number of samples fed per step.
    """
    report_every = 100

    # Cluster size and the rank of this node within the cluster.
    shard_count = current_cluster_size()
    shard_index = current_rank()

    # Work out how many steps this node performs.
    training_set = dataset['training_set']
    total_samples = training_set['x'].shape[0]
    samples_per_shard = total_samples // shard_count
    step_per_epoch = samples_per_shard // batch_size
    total_steps = step_per_epoch * n_epochs
    print('step_per_epoch: %d, %d steps in total' % (step_per_epoch, total_steps))

    # KUNGFU: Each replica is responsible for a data shard. Batches are
    # interleaved: a replica starts at its own batch and then strides over
    # the batches of all other replicas.
    cursor = batch_size * shard_index
    stride = batch_size * shard_count

    sess.run(tf.global_variables_initializer())

    # KUNGFU: the KungFu initializer defines how model weights are
    # initialised on distributed devices.
    if hasattr(optimizer, 'distributed_initializer'):
        sess.run(optimizer.distributed_initializer())

    print('training')
    # Train the model with all batches allocated to this node.
    for step in range(total_steps):
        batch_end = cursor + batch_size
        batch_x = training_set['x'][cursor:batch_end]
        batch_y = dataset['training_set']['y'][cursor:batch_end]
        # Advance to this node's next batch, wrapping around the data set.
        cursor = (cursor + stride) % total_samples

        feed = {x: batch_x, y_: batch_y}
        sess.run(train_op, feed)

        if step % report_every != 0:
            continue

        # Periodically log accuracy on the current batch and on the
        # validation set.
        current_batch = {'x': batch_x, 'y': batch_y}
        accuracy = test_mnist(sess, x, y_, test_op, current_batch)
        print('training accuracy: %f' % accuracy)
        accuracy = test_mnist(sess, x, y_, test_op, dataset['validation_set'])
        print('validation accuracy: %f' % accuracy)
def apply_gradients(self, grads_and_vars, **kwargs):
    """Average local variables with a random peer's and apply the gradients.

    Builds ops that (1) request the variables of a peer chosen by
    `get_random_peer`, (2) assign each local variable the mean of its own
    value and the peer's value, (3) call `apply_gradients` on the wrapped
    optimizer, and (4) run the save op returned by
    `_build_request_and_save_ops`.

    Args:
        grads_and_vars: Iterable of (gradient, variable) pairs. It is
            materialised once, so one-shot iterables such as `zip(...)`
            are supported.
        **kwargs: Forwarded unchanged to the wrapped optimizer's
            `apply_gradients`.

    Returns:
        A grouped op that carries control dependencies on the averaging
        assignments, the gradient update and the save op.
    """
    # The pairs are read twice below (to collect the variables and to apply
    # the gradients); a one-shot iterable would reach the wrapped optimizer
    # already exhausted unless it is materialised first.
    grads_and_vars = list(grads_and_vars)

    cluster_size, rank = current_cluster_size(), current_rank()
    target = get_random_peer(cluster_size, rank)
    variables = [v for _g, v in grads_and_vars]
    other_peer_vars, save_model_op = self._build_request_and_save_ops(
        target, variables)

    # Move every local variable halfway towards the peer's copy.
    assign_ops = [
        tf.assign(v, 0.5 * (v + other_v))
        for v, other_v in zip(variables, other_peer_vars)
    ]

    apply_op = self._optimizer.apply_gradients(grads_and_vars, **kwargs)

    # NOTE(review): apply_op and save_model_op are constructed before these
    # scopes are entered, so the scopes only attach them (and assign_ops) as
    # control dependencies of the returned group op; on their own they do
    # not order the averaging before the gradient step. Confirm that this is
    # the intended ordering.
    with tf.control_dependencies(assign_ops):
        with tf.control_dependencies([apply_op]):
            with tf.control_dependencies([save_model_op]):
                return tf.group(apply_op)
def train_model(model, dataset, n_epochs=1, batch_size=5000):
    """Fit `model` on this worker's contiguous shard of the training data.

    The training set is divided into `current_cluster_size()` equally sized
    shards and the worker with rank `r` trains on shard number `r`. Samples
    left over when the data does not divide evenly are not used.

    Args:
        model: Object exposing a Keras-style `fit` method.
        dataset: Mapping with 'x_train', 'y_train', 'x_val' and 'y_val'
            entries; the training entries must support `len()` and slicing.
        n_epochs: Number of epochs passed to `model.fit`.
        batch_size: Batch size passed to `model.fit`.
    """
    n_shards = current_cluster_size()
    shard_id = current_rank()
    train_data_size = len(dataset['x_train'])

    # Each worker takes shard_size consecutive samples, so its shard has to
    # start at a multiple of shard_size. Starting at batch_size * shard_id
    # (as before) made neighbouring shards overlap and could push the
    # highest ranks past the end of the data, leaving them with truncated
    # or empty training sets.
    shard_size = train_data_size // n_shards
    offset = shard_size * shard_id

    # Extract the data this worker learns from.
    x = dataset['x_train'][offset:offset + shard_size]
    y = dataset['y_train'][offset:offset + shard_size]

    # Train the model.
    # NOTE(review): InitalizationCallback is defined elsewhere; presumably
    # it performs the distributed initialisation -- confirm.
    model.fit(x,
              y,
              batch_size=batch_size,
              epochs=n_epochs,
              callbacks=[InitalizationCallback()],
              validation_data=(dataset['x_val'], dataset['y_val']),
              verbose=2)
def fake_get_shard_info(use_kungfu):
    """Return a `(rank, cluster_size)` pair.

    Args:
        use_kungfu: When truthy, the pair is obtained from `kungfu.ops`;
            otherwise the fixed pair `(0, 1)` is returned and KungFu is
            never imported.

    Returns:
        A 2-tuple `(rank, cluster_size)`.
    """
    if not use_kungfu:
        return 0, 1

    # Imported lazily so the non-KungFu path does not need the package.
    from kungfu.ops import current_cluster_size, current_rank

    rank = current_rank()
    cluster_size = current_cluster_size()
    return rank, cluster_size
def show_info_example():
    """Print the values of `current_rank()` and `current_cluster_size()`."""
    # Tuple elements are evaluated left to right, so the rank is queried
    # before the cluster size.
    message = 'rank=%d, np=%d' % (current_rank(), current_cluster_size())
    print(message)
def test_peer_info():
    """Print the rank and cluster size reported by the KungFu helpers."""
    # The rank is queried first, then the cluster size.
    peer_info = (current_rank(), current_cluster_size())
    print('rank=%d, np=%d' % peer_info)
def __init__(self, optimizer, name=None, use_locking=False):
    """Wrap `optimizer` and record cluster information for this worker.

    Args:
        optimizer: The optimizer being wrapped; forwarded to the parent
            constructor.
        name: Optional name forwarded to the parent constructor.
        use_locking: Forwarded to the parent constructor.
    """
    parent = super(SyncModelAveragingSGDOptimizer, self)
    parent.__init__(optimizer, name, use_locking=use_locking)

    # Cluster size and rank are queried once, at construction time.
    cluster_size = current_cluster_size()
    self._num_workers = cluster_size
    worker_rank = current_rank()
    self._rank = worker_rank