コード例 #1
0
ファイル: ada_sgd.py プロジェクト: zhz44/KungFu
 def __init__(self, optimizer, interval, name=None, use_locking=False):
     """Wrap ``optimizer`` and record cluster information and the interval.

     Args:
         optimizer: Forwarded as the first argument of the parent
             constructor.
         interval: Stored unchanged as ``self._interval``; how it is
             consumed is not visible from this method.
         name: Forwarded positionally to the parent constructor.
         use_locking: Forwarded to the parent constructor as a keyword.
     """
     parent = super(AdaptiveSGDOptimizer, self)
     parent.__init__(optimizer, name, use_locking=use_locking)

     # Query the cluster size first and the rank second, then store both.
     self._num_workers, self._rank = current_cluster_size(), current_rank()

     # Non-trainable int32 variable initialised to zero.
     self._step = tf.Variable(0, dtype=tf.int32, trainable=False)
     self._interval = interval
コード例 #2
0
ファイル: mnist_slp.py プロジェクト: zhz44/KungFu
def train_mnist(sess,
                x,
                y_,
                train_op,
                test_op,
                optimizer,
                dataset,
                n_epochs=1,
                batch_size=5000):
    """Run the training loop for this worker and log accuracy periodically.

    Args:
        sess: Session object whose ``run`` method executes the ops.
        x: Feed key for the input batch.
        y_: Feed key for the label batch.
        train_op: Op executed once per training step.
        test_op: Op handed to ``test_mnist`` when measuring accuracy.
        optimizer: If it has a ``distributed_initializer`` attribute, the
            op returned by calling it is run once before training.
        dataset: Mapping with a ``'training_set'`` entry (holding ``'x'``
            and ``'y'``) and a ``'validation_set'`` entry.
        n_epochs: Number of passes over this worker's share of the data.
        batch_size: Number of samples fed per step.
    """
    log_every = 100

    # Cluster size first, then this worker's rank.
    cluster_size = current_cluster_size()
    rank = current_rank()

    # Work out how many steps this worker performs.
    training_set = dataset['training_set']
    total_samples = training_set['x'].shape[0]
    samples_per_worker = total_samples // cluster_size
    step_per_epoch = samples_per_worker // batch_size
    n_steps = step_per_epoch * n_epochs
    print('step_per_epoch: %d, %d steps in total' % (step_per_epoch, n_steps))

    # KUNGFU: workers start on consecutive batches and then advance by one
    # batch per worker, so each step the workers read distinct batches.
    cursor = batch_size * rank
    stride = batch_size * cluster_size

    sess.run(tf.global_variables_initializer())

    # KUNGFU: the optional distributed initializer defines how model weights
    # are initialised across the distributed devices.
    if hasattr(optimizer, 'distributed_initializer'):
        sess.run(optimizer.distributed_initializer())

    print('training')
    for step in range(n_steps):
        stop = cursor + batch_size
        xs = training_set['x'][cursor:stop]
        y_s = training_set['y'][cursor:stop]
        cursor = (cursor + stride) % total_samples

        feed = {
            x: xs,
            y_: y_s,
        }
        sess.run(train_op, feed)

        if step % log_every != 0:
            continue

        # Report accuracy on the batch just trained on, then on validation.
        current_batch = {'x': xs, 'y': y_s}
        result = test_mnist(sess, x, y_, test_op, current_batch)
        print('training accuracy: %f' % result)
        result = test_mnist(sess, x, y_, test_op, dataset['validation_set'])
        print('validation accuracy: %f' % result)
コード例 #3
0
ファイル: async_sgd.py プロジェクト: zhz44/KungFu
    def apply_gradients(self, grads_and_vars, **kwargs):
        """Build an op that applies gradients and averages with a peer.

        For every variable in ``grads_and_vars`` an assign op is created
        that sets the variable to the mean of its own value and the value
        obtained for the peer selected by ``get_random_peer``. The
        gradients themselves are applied by the wrapped optimizer.

        Args:
            grads_and_vars: Iterable of ``(gradient, variable)`` pairs. It
                may be a one-shot iterable (e.g. ``zip(...)``); it is
                materialised into a list before use.
            **kwargs: Forwarded unchanged to the wrapped optimizer's
                ``apply_gradients``.

        Returns:
            A grouped op created under control dependencies on the
            averaging assign ops, the wrapped optimizer's apply op and the
            save-model op.
        """
        # The pairs are consumed twice below (once to collect the variables
        # and once by the wrapped optimizer), so a one-shot iterable must be
        # materialised first or the wrapped optimizer would see it empty.
        grads_and_vars = list(grads_and_vars)

        cluster_size, rank = current_cluster_size(), current_rank()
        target = get_random_peer(cluster_size, rank)
        variables = [v for _g, v in grads_and_vars]
        other_peer_vars, save_model_op = self._build_request_and_save_ops(
            target, variables)

        # Average each local variable with its counterpart from the peer.
        assign_ops = [
            tf.assign(v, 0.5 * (v + other_v))
            for v, other_v in zip(variables, other_peer_vars)
        ]

        apply_op = self._optimizer.apply_gradients(grads_and_vars, **kwargs)

        # NOTE(review): apply_op is created outside these scopes, so the
        # nesting only makes the returned group wait for all three sets of
        # ops; it does not appear to order them relative to each other.
        # Confirm that this is intended.
        with tf.control_dependencies(assign_ops):
            with tf.control_dependencies([apply_op]):
                with tf.control_dependencies([save_model_op]):
                    return tf.group(apply_op)
コード例 #4
0
def train_model(model, dataset, n_epochs=1, batch_size=5000):
    """Fit ``model`` on this worker's contiguous shard of the training data.

    The training set is split into ``current_cluster_size()`` equally
    sized, non-overlapping shards, and the worker with rank ``r`` trains on
    shard ``r``. Samples left over when the training set size is not a
    multiple of the cluster size are not used.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        dataset: Mapping with ``'x_train'``, ``'y_train'``, ``'x_val'`` and
            ``'y_val'`` entries.
        n_epochs: Number of epochs passed to ``fit``.
        batch_size: Batch size passed to ``fit``.
    """
    n_shards = current_cluster_size()
    shard_id = current_rank()
    train_data_size = len(dataset['x_train'])

    # Each worker owns one contiguous shard. The offset must be a multiple
    # of the shard size (not of the batch size); otherwise shards overlap
    # and the tail of the data is never seen by any worker.
    shard_size = train_data_size // n_shards
    offset = shard_size * shard_id

    # Extract the data this worker trains on.
    x = dataset['x_train'][offset:offset + shard_size]
    y = dataset['y_train'][offset:offset + shard_size]

    # Train the model, validating on the full validation set.
    model.fit(x,
              y,
              batch_size=batch_size,
              epochs=n_epochs,
              callbacks=[InitalizationCallback()],
              validation_data=(dataset['x_val'], dataset['y_val']),
              verbose=2)
コード例 #5
0
ファイル: test_mnist_slp.py プロジェクト: zhz44/KungFu
def fake_get_shard_info(use_kungfu):
    """Return ``(rank, cluster_size)`` for the current process.

    When ``use_kungfu`` is falsy, the single-process values ``(0, 1)`` are
    returned and KungFu is never imported. Otherwise both values are
    queried from ``kungfu.ops``.
    """
    if not use_kungfu:
        return 0, 1

    # Imported lazily so the non-KungFu path works without the package.
    from kungfu.ops import current_cluster_size, current_rank
    rank = current_rank()
    cluster_size = current_cluster_size()
    return rank, cluster_size
コード例 #6
0
def show_info_example():
    """Print this process's rank and the current cluster size."""
    # Query the rank first and the cluster size second.
    peer_rank = current_rank()
    cluster_size = current_cluster_size()
    message = 'rank=%d, np=%d' % (peer_rank, cluster_size)
    print(message)
コード例 #7
0
ファイル: test_python_apis.py プロジェクト: zhz44/KungFu
def test_peer_info():
    """Query the rank and the cluster size, then print both."""
    info = (current_rank(), current_cluster_size())
    print('rank=%d, np=%d' % info)
コード例 #8
0
 def __init__(self, optimizer, name=None, use_locking=False):
     """Wrap ``optimizer`` and record the cluster size and this peer's rank.

     Args:
         optimizer: Forwarded as the first argument of the parent
             constructor.
         name: Forwarded positionally to the parent constructor.
         use_locking: Forwarded to the parent constructor as a keyword.
     """
     parent = super(SyncModelAveragingSGDOptimizer, self)
     parent.__init__(optimizer, name, use_locking=use_locking)

     # Query the cluster size first and the rank second, then store both.
     self._num_workers, self._rank = current_cluster_size(), current_rank()