Example no. 1
        def __init__(self, name, device_dense, device_sparse, compression, sparse_as_dense,
                     config, aggregation_frequency, grad_updated_sizes_dict, profile_frequency,
                     profile_filename, average_aggregated_gradients):
            if name is None:
                name = "Distributed%s" % self.__class__.__base__.__name__
            self._name = name
            self._device_dense = device_dense
            self._device_sparse = device_sparse
            self._compression = compression
            self._sparse_as_dense = sparse_as_dense
            self._aggregated_gradients = False

            # We save the result of this because `get_gradients` and
            # `apply_gradients` do not execute eagerly.
            self._executing_eagerly = hvd._executing_eagerly()

            if not self._executing_eagerly:
                self._agg_helper = LocalGradientAggregationHelper(
                    aggregation_frequency,
                    _make_allreduce_grads_fn(device_dense, device_sparse, compression),
                    sparse_as_dense,
                    grad_updated_sizes_dict,
                    average_aggregated_gradients
                )
                self._profile_helper = TFProfileHelper(profile_frequency, profile_filename)

        # The optimizer class is created dynamically at runtime (subclassing
        # the wrapped optimizer), so super(self.__class__, ...) is safe here.
        super(self.__class__, self).__init__(**config)
Example no. 2
        def __init__(self, **kwargs):
            # `name`, `device_dense`, `compression`, etc. are free variables
            # captured from the enclosing factory function's closure, not
            # parameters of this __init__.
            self._name = name or "Distributed%s" % self.__class__.__base__.__name__
            self._aggregated_gradients = False

            self._allreduce_grads = hvd._make_allreduce_grads_fn(
                self._name, device_dense, device_sparse, compression,
                sparse_as_dense, op, gradient_predivide_factor)

            self._agg_helper = None
            if backward_passes_per_step > 1:
                if hvd._executing_eagerly():
                    self._agg_helper = LocalGradientAggregationHelperEager(
                        backward_passes_per_step=backward_passes_per_step,
                        allreduce_func=self._allreduce_grads,
                        sparse_as_dense=sparse_as_dense,
                        average_aggregated_gradients=average_aggregated_gradients,
                    )
                else:
                    self._agg_helper = LocalGradientAggregationHelper(
                        backward_passes_per_step=backward_passes_per_step,
                        allreduce_func=self._allreduce_grads,
                        sparse_as_dense=sparse_as_dense,
                        average_aggregated_gradients=average_aggregated_gradients,
                        rank=rank(),
                        optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_KERAS,
                    )

            super(self.__class__, self).__init__(**kwargs)
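
This factory-built __init__ is what backs the public hvd.DistributedOptimizer wrapper. A minimal usage sketch, assuming horovod.tensorflow.keras is imported as hvd (the learning-rate scaling is a common convention, not a requirement):

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

# Scale the learning rate by the number of workers, then wrap the
# optimizer so gradients are allreduced across ranks on each step.
opt = tf.keras.optimizers.Adam(0.001 * hvd.size())
opt = hvd.DistributedOptimizer(
    opt,
    backward_passes_per_step=1,
    average_aggregated_gradients=True,
)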
Example no. 3
    def on_batch_end(self, batch, logs=None):
        if self.broadcast_done:
            return

        with tf.device(self.device):
            if hvd._executing_eagerly() and hasattr(self.model, 'variables'):
                # TensorFlow 2.0 or TensorFlow eager
                hvd.broadcast_variables(self.model.variables,
                                        root_rank=self.root_rank)
                hvd.broadcast_variables(self.model.optimizer.variables(),
                                        root_rank=self.root_rank)
            else:
                bcast_op = hvd.broadcast_global_variables(self.root_rank)
                self.backend.get_session().run(bcast_op)

        self.broadcast_done = True
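
This method belongs to BroadcastGlobalVariablesCallback, which is also used in Example no. 8. A hedged usage sketch with a hypothetical toy model:

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer=hvd.DistributedOptimizer(tf.keras.optimizers.Adam()),
              loss='mse')

x = tf.random.normal((32, 4))
y = tf.random.normal((32, 1))

# Broadcast rank 0's initial weights on the first batch so all
# workers start training from identical state.
model.fit(x, y,
          callbacks=[hvd.callbacks.BroadcastGlobalVariablesCallback(0)],
          verbose=1 if hvd.rank() == 0 else 0)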
Example no. 4
    def test_broadcast_object_fn(self):
        if LooseVersion(tf.__version__) < LooseVersion('1.15.0'):
            self.skipTest(
                "Broadcasting object requires TensorFlow 1.15 or above")

        if hvd._executing_eagerly() or _IS_TF2:
            # broadcast_object_fn is only exercised in TF 1.x graph mode
            return

        hvd.init()

        with tf.device("/cpu:0"):
            expected_obj = {'hello': 123, 0: [1, 2]}
            obj = expected_obj if hvd.rank() == 0 else {}

            bcast = hvd.broadcast_object_fn(root_rank=0)
            obj = bcast(obj)
            self.assertDictEqual(obj, expected_obj)
Example no. 5
    def _average_metrics_in_place(self, logs):
        logs = logs or {}
        reduced_logs = {}
        # Reduce every metric among workers. Sort metrics by name
        # to ensure consistent order.
        for metric, value in sorted(logs.items()):
            if hvd._executing_eagerly():
                reduced_logs[metric] = \
                    hvd.allreduce(tf.constant(value, name=metric)).numpy()
            else:
                if metric not in self.variables:
                    self.variables[metric], self.allreduce_ops[metric] = \
                        self._make_variable(metric, value)
                else:
                    self.backend.set_value(self.variables[metric], value)
                reduced_logs[metric] = \
                    self.backend.get_session().run(self.allreduce_ops[metric])
        # Write the reduced values back into the logs dictionary
        # for other callbacks to use.
        for metric, value in reduced_logs.items():
            logs[metric] = value
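
This is the core of MetricAverageCallback (used in Example no. 8). In eager mode the underlying primitive is a plain allreduce, which averages across ranks by default; a minimal sketch:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Each rank contributes its local value; the default allreduce op
# returns the mean over all ranks.
local_loss = tf.constant(0.1 * (hvd.rank() + 1), name='loss')
mean_loss = hvd.allreduce(local_loss).numpy()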
Example no. 6
def _eval(backend, op_or_result):
    if hvd._executing_eagerly():
        return op_or_result
    else:
        return backend.get_session().run(op_or_result)
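
A hedged usage sketch for the helper above, assuming a TF 1.x-style backend where tf.keras.backend provides get_session (in TF 2 it lives under tf.compat.v1.keras.backend):

import tensorflow as tf
import horovod.tensorflow as hvd
from tensorflow.keras import backend as K

hvd.init()

value = hvd.allreduce(tf.constant(1.0))
# In eager mode the tensor is returned as-is; in graph mode it is
# evaluated in the Keras backend session.
result = _eval(K, value)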
Example no. 7
    def test_elastic_state(self):
        if LooseVersion(tf.__version__) < LooseVersion('1.15.0'):
            self.skipTest(
                "Broadcasting object requires TensorFlow 1.15 or above")

        if not hvd._executing_eagerly() and _IS_TF2:
            # TF 2 is only supported in eager mode
            return

        hvd.init()

        with tf.device("/cpu:0"):
            v = 1.0 if hvd.rank() == 0 else 2.0
            weights1 = [np.array([[v, v], [v, v]]), np.array([v, v])]
            vars1 = [tf.Variable(arr) for arr in weights1]

            weights2 = [
                np.array([[1.0, 2.0], [3.0, 4.0]]),
                np.array([0.0, 0.0])
            ]

            if not hvd._executing_eagerly():
                init = tf.global_variables_initializer()
                self.evaluate(init)

            state = hvd.elastic.TensorFlowState(vars1,
                                                batch=20 + hvd.rank(),
                                                epoch=10 + hvd.rank())
            state.sync()

            weights1 = [np.ones_like(w) for w in weights1]

            # After sync, all values should match the root rank
            for w in self.evaluate(vars1):
                self.assertAllClose(w, np.ones_like(w))
            assert state.batch == 20
            assert state.epoch == 10

            # Partially modify then restore
            self.assign(vars1, weights2)
            state.batch = 21
            state.epoch = 11

            state.restore()

            for w1, w2 in zip(self.evaluate(vars1), weights1):
                self.assertAllClose(w1, w2)
            assert state.batch == 20
            assert state.epoch == 10

            # Partially modify then commit
            self.assign(vars1, weights2)
            state.batch = 21
            state.epoch = 11

            state.commit()
            state.restore()

            for w1, w2 in zip(self.evaluate(vars1), weights2):
                self.assertAllClose(w1, w2)
            assert state.batch == 21
            assert state.epoch == 11
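
Outside of tests, TensorFlowState is typically paired with the hvd.elastic.run decorator, which syncs state when workers join and rolls back to the last commit after a failure; a minimal sketch:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

v = tf.Variable([0.0, 0.0])
state = hvd.elastic.TensorFlowState([v], batch=0, epoch=0)

@hvd.elastic.run
def train(state):
    # On (re)start the decorator restores committed state; commit
    # periodically so a failure loses at most one epoch of work.
    for state.epoch in range(state.epoch, 5):
        v.assign_add([1.0, 1.0])
        state.commit()

train(state)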
Example no. 8
def main(events, make_model_fn, div, dataset, default_verbosity, data_dir, checkpoint_dir, log_to, log_info):
    world = MPI.COMM_WORLD
    rank = world.Get_rank()
    size = world.Get_size()

    wrh.open(str(log_to) % {
        'rank': rank,
        'size': size,
        'rank+1': rank + 1,
    }, 'a')

    if log_info is not None:
        wrh.load(log_info % {
            'rank': rank,
            'rank+1': rank + 1,
            'size': size,
        })
    else:
        if rank == 0:
            wrh.push('master')
            for i in range(1, size):
                wrh.push('worker')
                info = wrh.save()
                world.send(info, dest=i, tag=i)
                wrh.pop('worker')
            wrh.push('worker')
        else:
            info = world.recv(source=0, tag=rank)
            wrh.load(info)

    wrh.push('triple-r.py')
    wrh.log('rank', '%d', rank)
    wrh.log('size', '%d', size)
    wrh.log('model', '%s', make_model_fn)
    wrh.log('dataset', '%s', dataset)
    wrh.log('events', '%s', events)
    wrh.log('div', '%d', div)
    wrh.log('data_dir', '%s', data_dir)
    wrh.log('checkpoint_dir', '%s', checkpoint_dir)

    wrh.push('initialize horovod')
    hvd.init(world)
    wrh.pop('initialize horovod')

    wrh.log('hvd.mpi_threads_supported', '%r', hvd.mpi_threads_supported())
    assert hvd.mpi_threads_supported()

    wrh.log('_executing_eagerly', '%r', hvd._executing_eagerly())

    is_emnist = dataset in ('emnist',)
    is_tiny_imagenet = dataset in ('tiny-imagenet',)

    wrh.push('loading dataset')
    train_ds = None
    valid_ds = None
    if is_emnist:
        datasets, info = tfds.load(
            dataset,
            split=None,
            with_info=True,
            as_supervised=True,
            data_dir=str(data_dir),
            download=True,
        )
        wrh.log('datasets', '%r', datasets)
        wrh.log('info', '%r', info)
        input_shape = info.features['image'].shape
        output_shape = info.features['label'].num_classes
        train_ds = datasets['train']
        # tfds 'emnist' provides 'train' and 'test' splits
        valid_ds = datasets['test']

        train_ds = train_ds.map(lambda img, label: (tf.image.convert_image_dtype(img, dtype=tf.float32), label))
        valid_ds = valid_ds.map(lambda img, label: (tf.image.convert_image_dtype(img, dtype=tf.float32), label))

        num_train = info.splits['train'].num_examples
        num_valid = info.splits['test'].num_examples
    elif is_tiny_imagenet:
        # Training data iterator.
        input_shape = (224, 224, 3)
        output_shape = 200

        num_train = 100000
        num_valid = 10000

        train_dir = data_dir / "tiny-imagenet-200/train"
        valid_dir = data_dir / "tiny-imagenet-200/val"

        train_gen = tf.keras.preprocessing.image.ImageDataGenerator(
            width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True,
            preprocessing_function=tf.keras.applications.resnet50.preprocess_input)

        train_ds = tf.data.Dataset.from_generator(
            lambda: train_gen.flow_from_directory(train_dir,
                                                  batch_size=1,
                                                  target_size=input_shape[:-1]),
            # flow_from_directory yields one-hot encoded float labels,
            # so the label spec must be float32, not int32.
            output_signature=(tf.TensorSpec(shape=[1, *input_shape], dtype=tf.float32),
                              tf.TensorSpec(shape=(1, output_shape), dtype=tf.float32)),
        ).unbatch()

        # Validation data iterator.
        valid_gen = tf.keras.preprocessing.image.ImageDataGenerator(
            zoom_range=(0.875, 0.875), preprocessing_function=tf.keras.applications.resnet50.preprocess_input)
        valid_ds = tf.data.Dataset.from_generator(
            lambda: valid_gen.flow_from_directory(valid_dir,
                                                  batch_size=1,
                                                  target_size=input_shape[:-1]),
            output_signature=(tf.TensorSpec(shape=[1, *input_shape], dtype=tf.float32),
                              tf.TensorSpec(shape=(1, output_shape), dtype=tf.float32)),
        ).unbatch()

    wrh.pop('loading dataset')

    wrh.push('creating model')
    wrh.log('input_shape', '%r', input_shape)
    wrh.log('output_shape', '%r', output_shape)
    model = make_model_fn(
        input_shape=input_shape,
        output_shape=output_shape,
    )
    wrh.pop('creating model')

    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
        PreciseEarlyStopping(nepochs=-1, nbatches=-1),
    ]

    if rank == 0:
        wrh.push('checkpoint')
        weights = checkpoint_dir / 'checkpoint.h5'
        model.save(weights)
        wrh.pop('checkpoint')

    #events.insert(0, Event(nepochs=0, nworkers=size, batch=32, reload=False))

    initial_epoch = 0
    for event in events:
        wrh.push('event')
        wrh.log('event', '%r', event)

        opt = tf.keras.optimizers.Adam(0.001)
        print(f'{rank=} {opt.__class__ = }, {opt.__class__.__base__ = }')

        opt = hvd.DistributedOptimizer(
            opt,
            backward_passes_per_step=1,
            average_aggregated_gradients=True,
        )
        print(f'{rank=} {opt.__class__ = }, {opt.__class__.__base__ = }')

        if rank == -1:  # disabled debug path: ranks are never negative
            opt = create_no_op_optimizer(opt)
            print(f'{rank=} {opt.__class__ = }, {opt.__class__.__base__ = }')

        model.compile(
            optimizer=opt,
            metrics=['accuracy'],
            loss=tf.losses.CategoricalCrossentropy(from_logits=True),
            experimental_run_tf_function=False,
        )

        if event.reload:
            wrh.push('reload')
            print('Reloading weights')
            weights = checkpoint_dir / 'checkpoint.h5'
            # `weights` is a Path, so check for existence rather than None.
            if not weights.exists():
                print('Error! Could not load weights!')
                print(f'{checkpoint_dir = }')
                for path in checkpoint_dir.iterdir():
                    print(f'  {path = }')
                raise ValueError('Could not load weights')
            wrh.log('weights', '%r', weights)
            model = hvd.load_model(weights)
            wrh.pop('reload')

        wrh.push('train')
        model.fit(
            train_ds.repeat().batch(event.batch),
            steps_per_epoch=num_train // event.batch // event.nworkers // div,
            callbacks=callbacks,
            epochs=initial_epoch + event.nepochs,
            initial_epoch=initial_epoch,
            verbose=default_verbosity if hvd.rank() == 0 else 0,
        )
        wrh.pop('train')

        wrh.push('valid')
        stats = model.evaluate(
            valid_ds.repeat().batch(event.batch),
            steps=num_valid // event.batch // event.nworkers // div,
            callbacks=callbacks,
            verbose=default_verbosity if hvd.rank() == 0 else 0,
        )
        if rank == 0:
            print(f'stats = {" ".join(f"{name}={value}" for name, value in zip(model.metrics_names, stats))}')
        for name, value in zip(model.metrics_names, stats):
            wrh.log(name, '%r', value)
        wrh.pop('valid')

        if event.checkpoint and rank == 0:
            wrh.push('checkpoint')
            weights = checkpoint_dir / 'checkpoint.h5'
            model.save(weights)
            wrh.pop('checkpoint')

        world.Barrier()

        initial_epoch += event.nepochs

        wrh.pop('event')

    wrh.pop('triple-r.py')

    if rank == 0:
        wrh.pop('worker')
        wrh.pop('master')