Example #1
def allgather_object(obj):
    """
    Serializes and allgathers an object across all processes.

    Arguments:
        obj: An object capable of being serialized without losing any context.

    Returns:
        The list of objects that were allgathered across all ranks.
    """
    # Horovod and its dependencies are optional, so do not import them at the top of the module.
    # TensorFlow is imported here because certain modules (like comet) must be imported before TF.
    import io

    import cloudpickle
    import tensorflow as tf
    from horovod.tensorflow import allgather, size

    def load(byte_array):
        buf = io.BytesIO(byte_array.tobytes())
        return cloudpickle.load(buf)

    b = io.BytesIO()
    cloudpickle.dump(obj, b)

    t = tf.convert_to_tensor(bytearray(b.getvalue()), dtype=tf.uint8)
    sz = tf.convert_to_tensor([t.shape[0]], dtype=tf.int32)

    sizes = allgather(sz, name=type(obj).__name__ + '.sz').numpy()
    gathered = allgather(t, name=type(obj).__name__ + '.t').numpy()

    def select(i):
        # The start of rank i's slice is the sum of the sizes of all preceding ranks.
        start = int(sum(sizes[:i]))
        end = start + int(sizes[i])
        return gathered[start:end]

    return [load(select(i)) for i in range(size())]
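A minimal usage sketch (hypothetical values; assumes TensorFlow eager execution and a script launched with horovodrun): each rank contributes a picklable object and receives the ordered list of objects from every rank.

import horovod.tensorflow as hvd

hvd.init()
# Each rank contributes an arbitrary picklable object; every rank receives the full list.
local_stats = {'rank': hvd.rank(), 'num_samples': 100 + hvd.rank()}
all_stats = allgather_object(local_stats)
assert len(all_stats) == hvd.size()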
Example #2
    def _setup_graph(self):
        num_gpu = cfg.TRAIN.NUM_GPUS
        if cfg.TRAINER == 'replicated':
            # Use two predictor threads per GPU to get better throughput
            self.num_predictor = num_gpu * 2
            self.predictors = [
                self._build_coco_predictor(k % num_gpu)
                for k in range(self.num_predictor)
            ]
            self.dataflows = [
                get_eval_dataflow(shard=k, num_shards=self.num_predictor)
                for k in range(self.num_predictor)
            ]
        else:
            self.predictor = self._build_coco_predictor(0)
            self.dataflow = get_eval_dataflow(shard=hvd.rank(),
                                              num_shards=hvd.size())

            # use uint8 to aggregate strings
            self.local_result_tensor = tf.placeholder(
                tf.uint8, shape=[None], name='local_result_string')
            self.concat_results = hvd.allgather(self.local_result_tensor,
                                                name='concat_results')
            local_size = tf.expand_dims(tf.size(self.local_result_tensor), 0)
            self.string_lens = hvd.allgather(local_size, name='concat_sizes')
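The gathered sizes make it possible to slice the concatenated uint8 buffer back into one serialized result per rank. A minimal sketch of that inverse step (a hypothetical helper, not part of the original callback), assuming concat_bytes and string_lens are the NumPy results of running self.concat_results and self.string_lens:

import numpy as np

def split_gathered_results(concat_bytes, string_lens):
    # Offsets into the concatenated buffer are the cumulative sums of the per-rank lengths.
    offsets = np.cumsum(np.concatenate([[0], string_lens]))
    return [concat_bytes[offsets[i]:offsets[i + 1]].tobytes()
            for i in range(len(string_lens))]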
Example #3
def hvd_gather_parameters_gradients(list_of_weights, list_of_gradients):
    '''Gather the parameters and gradients from each individual "train_svpg" function call.

    Input:  List of "weights" from all agents for one batch (each entry is a list of weight/bias Tensors)
            List of "gradients" from all agents for one batch (each entry is a list of gradient Tensors)

    Output: 2D tf Tensor stacked from the flattened "trainable_variables" Tensors
            2D tf Tensor stacked from the flattened "gradients" Tensors
            List of the shapes of the weights/gradients
    '''

    # assume 1 mpi process per agent

    # flatten
    weight_list = list(chain.from_iterable(list_of_weights))
    shape_list = [itm.shape for itm in weight_list]
    grad_list = list(chain.from_iterable(list_of_gradients))

    # Convert single list of gradients Tensors to list of numpy arrays
    flat_gradients = tf.concat([tf.reshape(g, [-1]) for g in grad_list],
                               axis=0)
    flat_gradients = tf.expand_dims(flat_gradients, axis=0)

    flat_weights = tf.concat([tf.reshape(vec, [-1]) for vec in weight_list],
                             axis=0)
    flat_weights = tf.expand_dims(flat_weights, axis=0)

    gather_weight_tensor = hvd.allgather(flat_weights)
    gather_gradient_tensor = hvd.allgather(flat_gradients)

    return gather_weight_tensor, gather_gradient_tensor, shape_list
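Because each rank contributes a single row of flattened values, the gathered tensors have shape [hvd.size(), total_params]. A sketch of the inverse operation (a hypothetical helper, not in the original module), assuming shape_list is the list returned above:

import numpy as np
import tensorflow as tf

def unflatten_row(flat_row, shape_list):
    # Split one gathered row back into per-variable tensors using the recorded shapes.
    tensors, offset = [], 0
    for shape in shape_list:
        size = int(np.prod([int(d) for d in shape]))
        tensors.append(tf.reshape(flat_row[offset:offset + size], shape))
        offset += size
    return tensors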
Example #4
    def send_receive(self, tensors, ctx):
        tensors_size = []
        tensors_shape = []
        tensors_ag = []
        for tensor in tensors:
            tensors_size.append(tf.reshape(tf.size(tensor), [-1]))
            tensors_shape.append(tf.shape(tensor))
            tensors_1d = tf.reshape(tensor, [-1])
            tensors_ag.append(allgather(tensors_1d))
        tensors_size = tf.concat(tensors_size, 0)

        if self.compressor.tensors_size_are_same:
            tensors_size_list = [tensors_size] * self.world_size
            tensors_size_ag = tf.concat(tensors_size_list, 0)
        else:
            tensors_size_ag = allgather(tensors_size)

        index = [0] * self.world_size
        num = len(tensors)
        decompressed_tensors = []
        for ranki in range(self.world_size):
            tensors_size = tensors_size_ag[num * ranki:num * (ranki + 1)]
            ranki_tensors = []
            for i, (tensor, shape) in enumerate(zip(tensors_ag,
                                                    tensors_shape)):
                a = index[i]
                b = a + tensors_size[i]
                ranki_tensors.append(tf.reshape(tensor[a:b], shape))
                index[i] = b

            ranki_decompressed = self.compressor.decompress(ranki_tensors, ctx)
            decompressed_tensors.append(ranki_decompressed)

        aggregated_tensor = self.compressor.aggregate(decompressed_tensors)
        return aggregated_tensor
Example #5
    def test_horovod_allgather_grad_cpu(self):
        """Test the correctness of the allgather gradient on CPU."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
            tensor_sizes = tensor_sizes[:size]

            if _executing_eagerly():
                with tf.GradientTape() as tape:
                    tensor = self.tfe.Variable(
                        tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) *
                        rank)
                    if dtype == tf.bool:
                        tensor = tensor % 2
                    tensor = tf.cast(tensor, dtype=dtype)
                    gathered = hvd.allgather(tensor)
                    grad_list = []
                    for r, tensor_size in enumerate(tensor_sizes):
                        g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                        grad_list.append(g)
                    grad_ys = tf.concat(grad_list, axis=0)
                with tf.device("/cpu:0"):
                    grad_out = tape.gradient(gathered, tensor, grad_ys)
            else:
                tensor = tf.ones([tensor_sizes[rank]] + [17] *
                                 (dim - 1)) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                grad_list = []
                for r, tensor_size in enumerate(tensor_sizes):
                    g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                    grad_list.append(g)
                grad_ys = tf.concat(grad_list, axis=0)

                with tf.device("/cpu:0"):
                    grad = tf.gradients(gathered, tensor, grad_ys)[0]
                grad_out = self.evaluate(grad)

            expected = np.ones([tensor_sizes[rank]] + [17] *
                               (dim - 1)) * rank * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(
                err, 0.00000001, "gradient %s differs from expected %s, "
                "error: %s" % (grad_out, expected, str(err)))
Example #6
    def __init__(self, name):
        self.name = name

        with tf.name_scope("horovod_python_ops/" + name):
            self.allgather_obj_size_inp = tf.placeholder(
                name="allgather_obj_size", dtype=tf.int32, shape=[None])
            self.allgather_obj_inp = tf.placeholder(name="allgather_obj",
                                                    dtype=tf.uint8,
                                                    shape=[None])

            self.allgather_obj_size_result = hvd.allgather(
                self.allgather_obj_size_inp)
            self.allgather_obj_result = hvd.allgather(self.allgather_obj_inp)
Example #7
def all_gather(tensors, axis=0, comm_options=None):
    if tf.distribute.has_strategy():
        replica_ctx = tf.distribute.get_replica_context()
        return replica_ctx.all_gather(tensors, axis=axis, options=comm_options)
    else:
        import horovod.tensorflow as hvd
        return [hvd.allgather(tensor) for tensor in tensors]
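A usage sketch with hypothetical tensors (assumes either an active tf.distribute replica context or an initialized Horovod job). Note that the Horovod fallback ignores the axis argument and always concatenates along the first dimension.

import tensorflow as tf

# Pool per-replica predictions and labels before computing a global metric.
local_preds = tf.random.uniform([8, 1])     # this replica's predictions
local_labels = tf.random.uniform([8, 1])
global_preds, global_labels = all_gather([local_preds, local_labels], axis=0)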
Example #8
    def test_horovod_allgather(self):
        """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        with self.test_session() as session:
            dtypes = [
                tf.uint8, tf.int8, tf.uint16, tf.int16, tf.int32, tf.int64,
                tf.float32, tf.float64
            ]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                tensor = tf.ones([17] * dim, dtype=dtype) * rank
                gathered = hvd.allgather(tensor)

                gathered_tensor = session.run(gathered)
                self.assertEqual(list(gathered_tensor.shape),
                                 [17 * size] + [17] * (dim - 1))

                for i in range(size):
                    rank_tensor = tf.slice(gathered_tensor,
                                           [i * 17] + [0] * (dim - 1),
                                           [17] + [-1] * (dim - 1))
                    self.assertEqual(list(rank_tensor.shape), [17] * dim)
                    # tf.equal() does not support tf.uint16 as of TensorFlow 1.2,
                    # so need to cast rank_tensor to tf.int32.
                    self.assertTrue(
                        session.run(
                            tf.reduce_all(
                                tf.equal(tf.cast(rank_tensor, tf.int32), i))),
                        "hvd.allgather produces incorrect gathered tensor")
Example #9
    def test_horovod_allgather_variable_size_fused(self):
        """Test that the allgather correctly gathers 1D, 2D, 3D tensors with
        Tensor Fusion, even if those tensors have different sizes along the
        first dim."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        dtypes = [
            tf.uint8, tf.int8, tf.uint16, tf.int16, tf.int32, tf.int64,
            tf.float16, tf.float32, tf.float64, tf.bool
        ]
        dims = [1, 2, 3]
        tests = []
        shape_tests = []

        for dtype, dim in itertools.product(dtypes, dims):
            # Support tests up to MPI Size of 35
            if size > 35:
                break

            tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5
            tensor_sizes = tensor_sizes[:size]

            tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)
            shape_tests.append(
                tf.reduce_all(
                    tf.equal(tf.shape(gathered),
                             [sum(tensor_sizes)] + [17] * (dim - 1))))

            for i in range(size):
                rank_size = [tensor_sizes[i]] + [17] * (dim - 1)
                rank_tensor = tf.slice(gathered, [sum(tensor_sizes[:i])] +
                                       [0] * (dim - 1), rank_size)
                self.assertEqual(list(rank_tensor.shape), rank_size)
                if dtype != tf.bool:
                    value = i
                else:
                    value = i % 2

                # tf.equal() does not support tf.uint16 as of TensorFlow 1.2,
                # so need to cast rank_tensor to tf.int32.
                tests.append(
                    tf.reduce_all(
                        tf.equal(tf.cast(rank_tensor, tf.int32), value)))

            shape_tests_passed, value_tests_passed = \
                self.evaluate([tf.reduce_all(shape_tests), tf.reduce_all(tests)])

            self.assertTrue(
                shape_tests_passed,
                "hvd.allgather produces incorrect gathered tensor")

            self.assertTrue(
                value_tests_passed,
                "hvd.allgather produces incorrect gathered tensor")
Example #10
def evaluate(validation_pipeline,
             dlrm,
             timer,
             auc_thresholds,
             data_parallel_splitter,
             max_steps=None,
             cast_dtype=None):

    auc, test_loss = 0, 0
    latencies, all_test_losses = [], []
    distributed = hvd.size() != 1

    pipe = iter(validation_pipeline.op())

    auc_metric = tf.keras.metrics.AUC(num_thresholds=auc_thresholds,
                                      curve='ROC',
                                      summation_method='interpolation',
                                      from_logits=True)
    bce_op = tf.keras.losses.BinaryCrossentropy(
        reduction=tf.keras.losses.Reduction.NONE, from_logits=True)
    for eval_step in range(len(validation_pipeline)):
        begin = time.time()

        (numerical_features, categorical_features), labels = pipe.get_next()

        if hasattr(
                dlrm,
                'data_parallel_bottom_mlp') and dlrm.data_parallel_bottom_mlp:
            numerical_features = data_parallel_splitter(numerical_features)

        if cast_dtype is not None:
            numerical_features = tf.cast(numerical_features, cast_dtype)

        if max_steps is not None and eval_step >= max_steps:
            break

        inputs = _create_inputs_dict(numerical_features, categorical_features)
        y_pred = dlrm(inputs, sigmoid=False, training=False)
        end = time.time()
        latency = end - begin
        latencies.append(latency)

        if distributed:
            y_pred = hvd.allgather(y_pred)

        timer.step_test()
        if hvd.rank() == 0 and auc_metric is not None:
            update_auc_metric(auc_metric, labels, y_pred)
            test_loss = compute_bce_loss(bce_op, labels, y_pred)
            all_test_losses.append(test_loss)

    if hvd.rank() == 0 and auc_metric is not None:
        auc = auc_metric.result().numpy().item()
        test_loss = tf.reduce_mean(all_test_losses).numpy().item()

    auc_metric.reset_state()
    return auc, test_loss, latencies
Example #11
def hvd_gather_scores(list_of_epoch_scores):
    score_list = list(list_of_epoch_scores)

    flat_scores = tf.concat([tf.reshape(vec, [-1]) for vec in score_list],
                            axis=0)
    flat_scores = tf.expand_dims(flat_scores, axis=0)

    gather_score_tensor = hvd.allgather(flat_scores)

    return gather_score_tensor
Example #12
def evaluate(validation_pipeline, dlrm, timer, auc_thresholds,
             data_parallel_splitter, max_steps=None, cast_dtype=None):

    auc, test_loss = 0, 0
    latencies, all_test_losses = [], []
    distributed = hvd.size() != 1
    iterator = enumerate(validation_pipeline)

    if hasattr(dlrm, 'auc_metric') and isinstance(dlrm.auc_metric, tf.keras.metrics.AUC):
        auc_metric = dlrm.auc_metric
        bce_op = dlrm.compute_bce_loss
    else:
        auc_metric = tf.keras.metrics.AUC(num_thresholds=auc_thresholds,
                                          curve='ROC', summation_method='interpolation',
                                          from_logits=True)
        bce_op = tf.keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.NONE,
                                                    from_logits=True)
    while True:
        begin = time.time()

        try:
            eval_step, ((numerical_features, categorical_features), labels) = next(iterator)
        except StopIteration:
            break

        if hasattr(dlrm, 'data_parallel_bottom_mlp') and dlrm.data_parallel_bottom_mlp:
            numerical_features = data_parallel_splitter(numerical_features)

        if cast_dtype is not None:
            numerical_features = tf.cast(numerical_features, cast_dtype)

        if max_steps is not None and eval_step >= max_steps:
            break

        inputs = _create_inputs_dict(numerical_features, categorical_features)
        y_pred = dlrm(inputs, False)
        end = time.time()
        latency = end - begin
        latencies.append(latency)

        if distributed:
            y_pred = hvd.allgather(y_pred)

        timer.step_test()
        if hvd.rank() == 0 and auc_metric is not None:
            auc_metric.update_state(labels, y_pred)
            test_loss = bce_op(labels, y_pred)
            all_test_losses.append(test_loss)

    if hvd.rank() == 0 and auc_metric is not None:
        auc = auc_metric.result().numpy().item()
        test_loss = tf.reduce_mean(all_test_losses).numpy().item()

    auc_metric.reset_state()
    return auc, test_loss, latencies
Example #13
def evaluate(validation_pipeline,
             dlrm,
             timer,
             auc_thresholds,
             data_parallel_splitter,
             max_steps=None):

    if auc_thresholds is not None:
        auc_metric = tf.keras.metrics.AUC(num_thresholds=auc_thresholds,
                                          curve='ROC',
                                          summation_method='interpolation',
                                          name='my_auc')
    else:
        auc_metric = None

    bce_op = tf.keras.losses.BinaryCrossentropy(
        reduction=tf.keras.losses.Reduction.NONE, from_logits=False)
    auc, test_loss = 0, 0
    latencies, all_test_losses = [], []
    distributed = hvd.size() != 1
    iterator = enumerate(validation_pipeline)
    while True:
        begin = time.time()

        try:
            eval_step, ((numerical_features, categorical_features),
                        labels) = next(iterator)
        except StopIteration:
            break

        if dlrm.data_parallel_bottom_mlp:
            numerical_features = data_parallel_splitter(numerical_features)

        if max_steps is not None and eval_step >= max_steps:
            break

        y_pred = dlrm((numerical_features, categorical_features), sigmoid=True)
        end = time.time()
        latency = end - begin
        latencies.append(latency)

        if distributed:
            y_pred = hvd.allgather(y_pred)

        timer.step_test()
        if hvd.rank() == 0 and auc_metric is not None:
            auc_metric.update_state(labels, y_pred)
            test_loss = bce_op(labels, y_pred)
            all_test_losses.append(test_loss)

    if hvd.rank() == 0 and auc_metric is not None:
        auc = auc_metric.result().numpy().item()
        test_loss = tf.reduce_mean(all_test_losses).numpy().item()

    return auc, test_loss, latencies
Example #14
def allgather(value, name=None):
    """
    Perform an allgather on a tensor-compatible value.

    The concatenation is done on the first dimension, so the input values on the
    different processes must have the same rank and shape, except for the first
    dimension, which is allowed to be different.

    Arguments:
        value: A tensor-compatible value to gather.
        name: Optional name prefix for the constants created by this operation.
    """
    allgather_op = hvd.allgather(tf.constant(value, name=name))
    return K.get_session().run(allgather_op)
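A usage sketch with a hypothetical array (assumes hvd.init() has been called and a Keras session is active): each worker passes its local NumPy predictions and receives the stacked rows from every worker. Note that each call builds a new constant and allgather op in the graph, so for tight loops the placeholder-based pattern of Example #6 avoids unbounded graph growth.

import numpy as np

local_preds = np.random.rand(32, 1).astype(np.float32)   # this worker's predictions
all_preds = allgather(local_preds, name='val_preds')      # shape: (32 * hvd.size(), 1)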
Example #15
    def test_horovod_allgather_fused(self):
        """Test that the allgather correctly gathers 1D, 2D, 3D tensors
        with Tensor Fusion."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        dtypes = [
            tf.uint8, tf.int8, tf.uint16, tf.int16, tf.int32, tf.int64,
            tf.float16, tf.float32, tf.float64, tf.bool
        ]
        dims = [1, 2, 3]
        tests = []
        shape_tests = []
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = tf.ones([17] * dim) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)

            shape_tests.append(
                tf.reduce_all(
                    tf.equal(tf.shape(gathered),
                             [17 * size] + [17] * (dim - 1))))

            for i in range(size):
                rank_tensor = tf.slice(gathered, [i * 17] + [0] * (dim - 1),
                                       [17] + [-1] * (dim - 1))
                if dtype != tf.bool:
                    value = i
                else:
                    value = i % 2

                # tf.equal() does not support tf.uint16 as of TensorFlow 1.2,
                # so need to cast rank_tensor to tf.int32.
                tests.append(
                    tf.reduce_all(
                        tf.equal(tf.cast(rank_tensor, tf.int32), value)))

            shape_tests_passed, value_tests_passed = \
                self.evaluate([tf.reduce_all(shape_tests), tf.reduce_all(tests)])

            self.assertTrue(
                shape_tests_passed,
                "hvd.allgather produces incorrect gathered tensor")

            self.assertTrue(
                value_tests_passed,
                "hvd.allgather produces incorrect gathered tensor")
Example #16
def allgather(value, name=None):
    """
    Perform an allgather on a tensor-compatible value.

    The concatenation is done on the first dimension, so the input values on the
    different processes must have the same rank and shape, except for the first
    dimension, which is allowed to be different.

    Arguments:
        value: A tensor-compatible value to gather.
        name: Optional name prefix for the constants created by this operation.
    """
    allgather_op = hvd.allgather(tf.constant(value, name=name))
    return K.get_session().run(allgather_op)
Example #17
    def test_horovod_allgather_error(self):
        """Test that the allgather returns an error if any dimension besides
        the first is different among the tensors being gathered."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        tensor_size = [17] * 3
        tensor_size[1] = 10 * (rank + 1)
        tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            self.evaluate(hvd.allgather(tensor))
Example #18
    def test_horovod_allgather_type_error(self):
        """Test that the allgather returns an error if the types being gathered
        differ among the processes"""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        tensor_size = [17] * 3
        dtype = tf.int32 if rank % 2 == 0 else tf.float32
        tensor = tf.ones(tensor_size, dtype=dtype) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            self.evaluate(hvd.allgather(tensor))
Example #19
    def test_horovod_allgather_variable_size(self):
        """Test that the allgather correctly gathers 1D, 2D, 3D tensors,
        even if those tensors have different sizes along the first dim."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        with self.test_session() as session:
            dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                      tf.int32, tf.int64, tf.float32, tf.float64,
                      tf.bool]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                # Support tests up to MPI Size of 35
                if size > 35:
                    break

                tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5
                tensor_sizes = tensor_sizes[:size]

                tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                gathered_tensor = session.run(gathered)
                expected_size = sum(tensor_sizes)
                self.assertEqual(list(gathered_tensor.shape),
                                 [expected_size] + [17] * (dim - 1))

                for i in range(size):
                    rank_size = [tensor_sizes[i]] + [17] * (dim - 1)
                    rank_tensor = tf.slice(
                        gathered, [sum(tensor_sizes[:i])] + [0] * (dim - 1),
                        rank_size)
                    self.assertEqual(list(rank_tensor.shape), rank_size)
                    # tf.equal() does not support tf.uint16 as of TensorFlow 1.2,
                    # so need to cast rank_tensor to tf.int32.
                    if dtype != tf.bool:
                        value = i
                    else:
                        value = i % 2
                    self.assertTrue(
                        session.run(tf.reduce_all(
                            tf.equal(tf.cast(rank_tensor, tf.int32), value))),
                        "hvd.allgather produces incorrect gathered tensor")
Example #20
    def test_horovod_allgather_error(self):
        """Test that the allgather returns an error if any dimension besides
        the first is different among the tensors being gathered."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            tensor_size = [17] * 3
            tensor_size[1] = 10 * (rank + 1)
            tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allgather(tensor))
Example #21
    def test_horovod_allgather_type_error(self):
        """Test that the allgather returns an error if the types being gathered
        differ among the processes"""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            tensor_size = [17] * 3
            dtype = tf.int32 if rank % 2 == 0 else tf.float32
            tensor = tf.ones(tensor_size, dtype=dtype) * rank
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allgather(tensor))
Example #22
def test_horovod_allgather():
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    with tf.Session(config=config) as session:

        tensor = tf.ones([1]) * rank
        tensor = tf.cast(tensor, dtype=tf.float32)
        gathered = hvd.allgather(tensor)
        gathered_tensor = session.run(gathered)

        print('gathered_tensor = ', gathered_tensor)
Example #23
    def test_horovod_allgather_variable_size(self):
        """Test that the allgather correctly gathers 1D, 2D, 3D tensors,
        even if those tensors have different sizes along the first dim."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        with self.test_session() as session:
            dtypes = [
                tf.uint8, tf.int8, tf.uint16, tf.int16, tf.int32, tf.int64,
                tf.float32, tf.float64
            ]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                # Support tests up to MPI Size of 35
                if size > 35:
                    break

                tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5
                tensor_sizes = tensor_sizes[:size]

                tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1),
                                 dtype=dtype) * rank
                gathered = hvd.allgather(tensor)

                gathered_tensor = session.run(gathered)
                expected_size = sum(tensor_sizes)
                self.assertEqual(list(gathered_tensor.shape),
                                 [expected_size] + [17] * (dim - 1))

                for i in range(size):
                    rank_size = [tensor_sizes[i]] + [17] * (dim - 1)
                    rank_tensor = tf.slice(gathered, [sum(tensor_sizes[:i])] +
                                           [0] * (dim - 1), rank_size)
                    self.assertEqual(list(rank_tensor.shape), rank_size)
                    # tf.equal() does not support tf.uint16 as of TensorFlow 1.2,
                    # so need to cast rank_tensor to tf.int32.
                    self.assertTrue(
                        session.run(
                            tf.reduce_all(
                                tf.equal(tf.cast(rank_tensor, tf.int32), i))),
                        "hvd.allgather produces incorrect gathered tensor")
Example #24
def evaluate_wilcoxon(model, dataset):
    @tf.function
    def _step(samples, labels):
        probs = model(samples, training=False)
        return tf.concat([probs, labels], axis=1)

    results = []
    for idx, (samples, labels) in enumerate(dataset):
        result = _step(samples, labels)
        results.append(result)
    results = tf.concat(results, axis=0)

    results = hvd.allgather(results, name='wilcoxon_AUC')

    sort_order = tf.argsort(results[:, 0])
    sorted_label = tf.gather(results[:, 1], sort_order)
    rank = tf.cast(tf.range(1, sorted_label.shape[0]+1), tf.float32)
    num_true = tf.reduce_sum(sorted_label)
    num_false = sorted_label.shape[0] - num_true
    auc = (tf.reduce_sum(rank * sorted_label) - (num_true * (num_true + 1) / 2)) / (num_true * num_false)
    return auc.numpy()
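The closing expression is the rank-sum (Mann-Whitney) form of the AUC, which is why the scores and labels are gathered across all ranks before sorting: the ranks must be computed over the full validation set. A small self-contained sanity check of that formula against the pairwise definition (illustrative values, no tied scores):

import numpy as np

scores = np.array([0.1, 0.4, 0.35, 0.8])
labels = np.array([0., 0., 1., 1.])

order = np.argsort(scores)
ranks = np.empty(len(scores))
ranks[order] = np.arange(1, len(scores) + 1)
n_pos = labels.sum()
n_neg = len(labels) - n_pos
auc_ranksum = (ranks[labels == 1].sum() - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)

# Pairwise definition: fraction of (positive, negative) pairs ranked correctly.
auc_pairwise = np.mean([s_p > s_n for s_p in scores[labels == 1]
                                  for s_n in scores[labels == 0]])
assert np.isclose(auc_ranksum, auc_pairwise)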
Example #25
    def test_horovod_allgather_grad(self):
        """Test the correctness of the allgather gradient."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        with self.test_session(config=self.config) as session:
            # As of TensorFlow v1.9, gradients are not supported on
            # integer tensors
            dtypes = [tf.float32, tf.float64]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
                tensor_sizes = tensor_sizes[:size]

                tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                grad_list = []
                for r, tensor_size in enumerate(tensor_sizes):
                    g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                    grad_list.append(g)
                grad_ys = tf.concat(grad_list, axis=0)

                grad = tf.gradients(gathered, tensor, grad_ys)[0]
                grad_out = session.run(grad)

                expected = np.ones(
                    [tensor_sizes[rank]] + [17] * (dim - 1)
                ) * rank * size
                err = np.linalg.norm(expected - grad_out)
                self.assertLess(err, 0.00000001,
                                "gradient %s differs from expected %s, "
                                "error: %s" %
                                (grad_out, expected, str(err)))
Example #26
    def test_horovod_allgather_grad(self):
        """Test the correctness of the allgather gradient."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        with self.test_session(config=self.config) as session:
            # As of TensorFlow v1.9, gradients are not supported on
            # integer tensors
            dtypes = [tf.float32, tf.float64]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
                tensor_sizes = tensor_sizes[:size]

                tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                grad_list = []
                for r, tensor_size in enumerate(tensor_sizes):
                    g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                    grad_list.append(g)
                grad_ys = tf.concat(grad_list, axis=0)

                grad = tf.gradients(gathered, tensor, grad_ys)[0]
                grad_out = session.run(grad)

                expected = np.ones(
                    [tensor_sizes[rank]] + [17] * (dim - 1)
                ) * rank * size
                err = np.linalg.norm(expected - grad_out)
                self.assertLess(err, 0.00000001,
                                "gradient %s differs from expected %s, "
                                "error: %s" %
                                (grad_out, expected, str(err)))
Example #27
    def test_horovod_allgather(self):
        """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        with self.test_session() as session:
            dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                      tf.int32, tf.int64, tf.float32, tf.float64,
                      tf.bool]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                tensor = tf.ones([17] * dim) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                gathered_tensor = session.run(gathered)
                self.assertEqual(list(gathered_tensor.shape),
                                 [17 * size] + [17] * (dim - 1))

                for i in range(size):
                    rank_tensor = tf.slice(gathered_tensor,
                                           [i * 17] + [0] * (dim - 1),
                                           [17] + [-1] * (dim - 1))
                    self.assertEqual(list(rank_tensor.shape), [17] * dim)
                    # tf.equal() does not support tf.uint16 as of TensorFlow 1.2,
                    # so need to cast rank_tensor to tf.int32.
                    if dtype != tf.bool:
                        value = i
                    else:
                        value = i % 2
                    self.assertTrue(
                        session.run(tf.reduce_all(
                            tf.equal(tf.cast(rank_tensor, tf.int32), value))),
                        "hvd.allgather produces incorrect gathered tensor")
Example #28
        f"Creating graph for KNN of {num_train_images} training images ...")
    local_train_files = [(idx, fname, label)
                         for idx, (fname, label) in enumerate(all_train_files)
                         if idx % hvd.size() == hvd.rank()]

    image_input = tf.placeholder(tf.uint8, [None, 224, 224, 3], "image")
    idx_input = tf.placeholder(tf.int64, [None], "image_idx")

    feat_buffer = tf.get_variable("feature_buffer",
                                  shape=[num_train_images, 128],
                                  trainable=False)
    net = ResNetModel(num_output=(2048, 128) if args.v2 else (128, ))
    with TowerContext("", is_training=False):
        feat = net.forward(image_input)
        feat = tf.math.l2_normalize(feat, axis=1)  # Nx128
    all_feat = hvd.allgather(feat)  # GN x 128
    all_idx_input = hvd.allgather(idx_input)  # GN
    update_buffer = tf.scatter_update(feat_buffer, all_idx_input, all_feat)

    dist = tf.matmul(feat, tf.transpose(feat_buffer))  # N x #DS
    _, topk_indices = tf.math.top_k(dist, k=args.top_k)  # Nxtopk

    train_ds = build_dataflow(local_train_files)

    config = get_default_sess_config()
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    def evaluate(checkpoint_file):
        result_file = get_checkpoint_path(
            checkpoint_file) + f".knn{args.top_k}.txt"
        if os.path.isfile(result_file):
Example #29
    def test_horovod_allgather_grad_gpu(self):
        """Test the correctness of the allgather gradient on GPU."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            self.skipTest("No GPUs available")

        if os.environ.get('HOROVOD_MIXED_INSTALL'):
            # Skip if compiled with CUDA but without HOROVOD_GPU_ALLREDUCE.
            self.skipTest("Not compiled with HOROVOD_GPU_ALLREDUCE")

        hvd.init()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        size = hvd.size()

        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
            tensor_sizes = tensor_sizes[:size]

            if _executing_eagerly():
                with tf.GradientTape() as tape:
                    tensor = self.tfe.Variable(
                        tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) *
                        rank)
                    if dtype == tf.bool:
                        tensor = tensor % 2
                    tensor = tf.cast(tensor, dtype=dtype)
                    gathered = hvd.allgather(tensor)
                    grad_list = []
                    for r, tensor_size in enumerate(tensor_sizes):
                        g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                        grad_list.append(g)
                    grad_ys = tf.concat(grad_list, axis=0)
                with tf.device("/gpu:%d" % local_rank):
                    grad_out = tape.gradient(gathered, tensor, grad_ys)
            else:
                tensor = tf.ones([tensor_sizes[rank]] + [17] *
                                 (dim - 1)) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                grad_list = []
                for r, tensor_size in enumerate(tensor_sizes):
                    g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                    grad_list.append(g)
                grad_ys = tf.concat(grad_list, axis=0)

                with tf.device("/gpu:%d" % local_rank):
                    grad = tf.gradients(gathered, tensor, grad_ys)[0]
                grad_out = self.evaluate(grad)

            expected = np.ones([tensor_sizes[rank]] + [17] *
                               (dim - 1)) * rank * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(
                err, 0.00000001, "gradient %s differs from expected %s, "
                "error: %s" % (grad_out, expected, str(err)))
Example #30
xTrainTensor = tf.placeholder(tf.float32)
yTrainTensor = tf.placeholder(tf.float32)
xTestTensor = tf.placeholder(tf.float32)
indexTensor = tf.placeholder(tf.int32)

inputTensor = [xTrainTensor, yTrainTensor, xTestTensor, indexTensor]
tree_fit_predict_tensor = tf.py_func(tree_fit_predict, inputTensor, tf.float32)

currSess.run(tf.global_variables_initializer())

indices = indicesBroadCast.eval()

result = tree_fit_predict(xTrain, yTrain, xTest, indices[rank])

allgatherOp = hvd.allgather(result)

#print("rank is : ",rank, "result  " , result.eval())
print("rank is : ", rank, "indices  ", indices[0][0])
#print(allgatherOp.eval())

AllPreds = allgatherOp.eval()
#print("juggled up : ", np.sum(AllPreds[rank*len(y_test):rank*len(y_test)+len(y_test)] != result.eval() ),"rank:  ",rank)
#print("rank : ",rank,"allpreds :" ,AllPreds)
if rank == 0:
    #print("yTest is ",yTest)

    AllPreds = AllPreds.reshape(-1, len(y_test)).T
    print("Shape is : ", AllPreds.shape)
    MajorityPreds = stats.mode(AllPreds, axis=1)[0]
Example #31
    def get_image_labels(self):
        if self.is_all_shared:
            ### ALL SHARED ###
            img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, 
                                                                 is_training=True)
            with tf.device("/cpu:0"):
                with tf.name_scope("reading"):
                    data_provider = slim.dataset_data_provider.DatasetDataProvider(
                        self.dataset, num_readers=self.FLAGS.num_data_readers,
                        common_queue_capacity=20*self.FLAGS.batch_size,
                        common_queue_min=10*self.FLAGS.batch_size,
                        seed=self.rank)
                    [image, label] = data_provider.get(['image', 'label'])
                with tf.name_scope("to-preprocessing"):
                    capacity = 20 * self.FLAGS.batch_size
                    to_pre_queue = data_flow_ops.FIFOQueue(capacity=capacity,
                                                           dtypes=[image.dtype, label.dtype],
                                                           shapes=None,
                                                           name="to_pre_queue")
                    to_pre_op = to_pre_queue.enqueue([image, label])
                    queue_runner.add_queue_runner(queue_runner.QueueRunner(to_pre_queue, [to_pre_op] * Pipeline.QR_THREADS))
                    tf.summary.scalar("to_pre_fraction_of_%d_full" % capacity,
                                    math_ops.to_float(to_pre_queue.size()) * (1. / capacity))
                    image, label = to_pre_queue.dequeue()
                with tf.name_scope("preprocessing"):#TODO
                    image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode)
                with tf.name_scope("to-allgather"):
                    capacity = 20 * self.FLAGS.batch_size
                    to_allg_queue = data_flow_ops.FIFOQueue(capacity=capacity,
                                                            dtypes=[image.dtype, label.dtype],
                                                            shapes=[[self.train_image_size, self.train_image_size, 3], []],
                                                            name="to_allgather_queue")#[image.get_shape(), label.get_shape()])
                    queue_runner.add_queue_runner(queue_runner.QueueRunner(to_allg_queue, [to_allg_queue.enqueue([image, label])] * Pipeline.QR_THREADS))
                    tf.summary.scalar("to_allgather_fraction_of_%d_full" % capacity,
                                   math_ops.to_float(to_allg_queue.size()) * (1. / capacity))

                # num_preprocessors = tf.placeholder(tf.int32, shape=[], name="num_preprocessors)
                # self.num_hvd_send_tensor = 
                send_images, send_labels = to_allg_queue.dequeue_many(self.num_hvd_send)
                # if rank == #TODO
                all_images = hvd.allgather(send_images, name="hvd_allgather")
                all_labels = hvd.allgather(send_labels, name="hvd_allgather")
                #TODO: Remove extra queues
                with tf.name_scope("to-compute"):
                    capacity = 30 * self.FLAGS.batch_size
                    to_compute_queue = data_flow_ops.FIFOQueue(capacity=capacity,
                                                               dtypes=[image.dtype, label.dtype],
                                                               shapes=[[self.train_image_size, self.train_image_size, 3], []],#TODO
                                                               name="to_compute_queue")#[image.get_shape(), label.get_shape()])
                    queue_runner.add_queue_runner(queue_runner.QueueRunner(to_compute_queue, [to_compute_queue.enqueue_many([all_images, all_labels])]))#1 thread!
                    tf.summary.scalar("to_compute_fraction_of_%d_full" % capacity,
                                   math_ops.to_float(to_compute_queue.size()) * (1. / capacity))
                image, label = to_compute_queue.dequeue()
        elif self.is_single_bcast:
            ### SINGLE BROADCAST ###
            img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, 
                                                                 is_training=True)
            allg_images_name = "allgather-images-op"
            allg_labels_name = "allgather-labels-op"
            bcast_images_name = "bcast-images-op"
            bcast_labels_name = "bcast-labels-op"
            if 0 in self.member_of_group: #If we belong to group 0, initialize the reading and preprocessing pipeline
                with tf.device("/cpu:0"):
                    with tf.name_scope("reading"):
                        data_provider = slim.dataset_data_provider.DatasetDataProvider(
                            self.dataset, num_readers=self.FLAGS.num_data_readers,
                            common_queue_capacity=20*self.FLAGS.batch_size,
                            common_queue_min=10*self.FLAGS.batch_size,
                            seed=self.rank)
                        [image, label] = data_provider.get(['image', 'label'])
                    image, label = create_qr("to-pre", 10 * self.FLAGS.batch_size, [image, label], None, [image.dtype, label.dtype], Pipeline.QR_THREADS, False, False)

                    with tf.name_scope("preprocessing"):
                        image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode)

                    send_images, send_labels = create_qr("to-allg", 10 * self.FLAGS.batch_size, [image, label], [[self.train_image_size, self.train_image_size, 3], []], [image.dtype, label.dtype], Pipeline.QR_THREADS, False, True, self.num_hvd_send)
                all_images = hvd.allgather(send_images, group=0, name=allg_images_name)
                all_labels = hvd.allgather(send_labels, group=0, name=allg_labels_name)
                all_images, all_labels = create_qr("to-bcast", 20 * self.FLAGS.batch_size, [all_images, all_labels], [[self.train_image_size, self.train_image_size, 3], []], [post_pre_image_dtype, post_pre_label_dtype], 1, True, True, self.images_per_bcast)
            if 1 in self.member_of_group:
                # For the middle man rank, reset all_images and all_labels
                # names to their broadcasted tensors so that the bcast is
                # performed. Note that the bcast root is rank 0 since the
                # group1 sent to init had this rank listed first, meaning that
                # the resulting mpi group comm has this rank has rank 0
                if len(self.member_of_group) == 1:
                    # Then not middle man, so construct holder variable WITH CORRECT NAME!
                    # tf.Variable(self.num_hvd_send?
                    all_images = tf.zeros([self.images_per_bcast, self.train_image_size, self.train_image_size, 3], dtype=post_pre_image_dtype)
                    all_labels = tf.zeros([self.images_per_bcast]                                       , dtype=post_pre_label_dtype) #shape of [] turns into 1D instead of 0D
                all_images = hvd.broadcast(all_images, 0, group=1, name=bcast_images_name)
                all_labels = hvd.broadcast(all_labels, 0, group=1, name=bcast_labels_name)
            image, label = create_qr("to-compute", 20 * self.FLAGS.batch_size, [all_images, all_labels], [[self.train_image_size, self.train_image_size, 3], []], [post_pre_image_dtype, post_pre_label_dtype], 1, True, False)
        elif self.is_multi_bcast:
            ### MULTIPLE BROADCAST
            # print("Rank:", rank, member_of_group, group_rank_list)
            img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, 
                                                                 is_training=True)
            # allg_image_name = "allgathered-image" # need some naming commonalities
            # allg_label_name = "allgathered-label"
            allg_images_name = "allgather-images-op"
            allg_labels_name = "allgather-labels-op"
            bcast_images_name = "bcast-images-op"
            bcast_labels_name = "bcast-labels-op"
            # if 0 in member_of_group: #If we belong to group 0, initialize the reading and preprocessing pipeline
            if self.rank < self.FLAGS.num_pre:
                with tf.device("/cpu:0"):
                    with tf.name_scope("reading"):
                        data_provider = slim.dataset_data_provider.DatasetDataProvider(
                            self.dataset, num_readers=self.FLAGS.num_data_readers,
                            common_queue_capacity=20*self.FLAGS.batch_size,
                            common_queue_min=10*self.FLAGS.batch_size,
                            seed=self.rank)
                        [image, label] = data_provider.get(['image', 'label'])

                    image, label = create_qr("to-pre", 10 * self.FLAGS.batch_size, [image, label], None, [image.dtype, label.dtype], Pipeline.QR_THREADS, False, False)

                    with tf.name_scope("preprocessing"):
                        image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode)
                        # image = tf.Print(image, ["using preprocessed image"])
                    send_images, send_labels = create_qr("to-bcast", 20 * self.FLAGS.batch_size, [image, label], [[self.train_image_size, self.train_image_size, 3], []], [image.dtype, label.dtype], 2 * Pipeline.QR_THREADS, False, True, self.images_per_bcast)
            else:
                send_images = tf.zeros([self.images_per_bcast, self.train_image_size, self.train_image_size, 3], dtype=post_pre_image_dtype)
                send_labels = tf.zeros([self.images_per_bcast]                                                 , dtype=post_pre_label_dtype)
            with tf.device("/cpu:0"):
                bcast_images_root = "broadcast-images-"
                bcast_labels_root = "broadcast-labels-"
                bcast_images_per_group = [hvd.broadcast(send_images, i, group=i, name=bcast_images_root + str(i)) for i in range(self.FLAGS.num_pre)]
                bcast_labels_per_group = [hvd.broadcast(send_labels, i, group=i, name=bcast_labels_root + str(i)) for i in range(self.FLAGS.num_pre)]
                
                with tf.name_scope("to-compute"):
                    capacity = 30 * self.FLAGS.batch_size
                    to_compute_q = data_flow_ops.FIFOQueue(capacity=capacity,
                                                    dtypes=[post_pre_image_dtype, post_pre_label_dtype],
                                                    shapes=[[self.train_image_size, self.train_image_size, 3], []], 
                                                    name="to-compute-queue")
                    to_comp_ops = [to_compute_q.enqueue_many([bcast_images_per_group[i], bcast_labels_per_group[i]]) for i in range(self.FLAGS.num_pre)]
                    queue_runner.add_queue_runner(queue_runner.QueueRunner(to_compute_q, to_comp_ops))
                    tf.summary.scalar("to_compute_fraction_of_%d_full" % capacity,
                                      math_ops.to_float(to_compute_q.size()) * (1. / capacity))
                    image, label = to_compute_q.dequeue()
        return image, label
Example #32
    def gather(self, tensor, axis):
        import horovod.tensorflow as hvd
        return hvd.allgather(tensor)
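Note that this wrapper ignores its axis argument, since hvd.allgather always concatenates along the first dimension. A sketch of one way to honor a non-zero axis (hypothetical, assumes the tensor has a static rank) is to swap the target axis to the front before gathering and swap it back afterwards:

import horovod.tensorflow as hvd
import tensorflow as tf

def gather_along_axis(tensor, axis=0):
    # hvd.allgather concatenates on dim 0, so rotate `axis` to the front first.
    if axis == 0:
        return hvd.allgather(tensor)
    perm = list(range(tensor.shape.rank))
    perm[0], perm[axis] = perm[axis], perm[0]
    gathered = hvd.allgather(tf.transpose(tensor, perm))
    return tf.transpose(gathered, perm)  # the swap is its own inverse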
Example #33
def train(*,
          flow_constructor,
          logdir,
          lr_schedule,
          dropout_p,
          seed,
          init_bs,
          total_bs,
          ema_decay,
          steps_per_log,
          epochs_per_val,
          max_grad_norm,
          dtype=tf.float32,
          scale_loss=None,
          restore_checkpoint=None,
          scale_grad=None,
          dataset='cifar10',
          steps_per_extra_samples=None):
    hvd, MPI, is_root, mpi_average = setup_horovod()

    # Seeding and logging setup
    seed_all(hvd.rank() + hvd.size() * seed)
    assert total_bs % hvd.size() == 0
    local_bs = total_bs // hvd.size()

    logger = None
    logdir = '{}_mpi{}_{}'.format(os.path.expanduser(logdir), hvd.size(),
                                  time.time())
    checkpointdir = os.path.join(logdir, 'checkpoints')
    if is_root:
        print('Floating point format:', dtype)
        pprint(locals())
        os.makedirs(logdir)
        os.makedirs(checkpointdir)
        logger = TensorBoardOutput(logdir)

    # Load data
    if is_root:
        # Load once on root first to prevent downloading conflicts
        print('Loading data')
        load_data(dataset=dataset, dtype=dtype.as_numpy_dtype)
    MPI.COMM_WORLD.Barrier()
    data_train, data_val = load_data(dataset=dataset,
                                     dtype=dtype.as_numpy_dtype)
    img_shp = list(data_train.shape[1:])
    if is_root:
        print('Training data: {}, Validation data: {}'.format(
            data_train.shape[0], data_val.shape[0]))
        print('Image shape:', img_shp)
    bpd_scale_factor = 1. / (np.log(2) * np.prod(img_shp))

    # Build graph
    if is_root: print('Building graph')
    dequant_flow, flow = flow_constructor()
    # Data-dependent init
    if is_root: print('===== Init graph =====')
    x_init_sym = tf.placeholder(dtype, [init_bs] + img_shp)
    _, _, init_loss_sym, _ = build_forward(x=x_init_sym,
                                           dequant_flow=dequant_flow,
                                           flow=flow,
                                           flow_kwargs=dict(
                                               vcfg=VarConfig(init=True,
                                                              ema=None,
                                                              dtype=dtype),
                                               dropout_p=dropout_p,
                                               verbose=is_root))
    # Training
    if is_root: print('===== Training graph =====')
    x_sym = tf.placeholder(dtype, [local_bs] + img_shp)
    _, y_sym, loss_sym, _ = build_forward(x=x_sym,
                                          dequant_flow=dequant_flow,
                                          flow=flow,
                                          flow_kwargs=dict(vcfg=VarConfig(
                                              init=False,
                                              ema=None,
                                              dtype=dtype),
                                                           dropout_p=dropout_p,
                                                           verbose=is_root))

    # EMA
    params = tf.trainable_variables()
    if is_root:
        # for p in params:
        #     print(p.name, p.shape)
        print('Parameters',
              sum(np.prod(p.get_shape().as_list()) for p in params))
    ema = tf.train.ExponentialMovingAverage(decay=ema_decay)
    maintain_averages_op = tf.group(ema.apply(params))
    # Op for setting the ema params to the current non-ema params (for use after data-dependent init)
    name2var = {v.name: v for v in tf.global_variables()}
    copy_params_to_ema = tf.group([
        name2var[p.name.replace(':0', '') +
                 '/ExponentialMovingAverage:0'].assign(p) for p in params
    ])

    # Validation and sampling (with EMA)
    if is_root: print('===== Validation graph =====')
    val_flow_kwargs = dict(vcfg=VarConfig(init=False, ema=ema, dtype=dtype),
                           dropout_p=0,
                           verbose=is_root)
    val_dequant_x_sym, val_y_sym, val_loss_sym, _ = build_forward(
        x=x_sym,
        dequant_flow=dequant_flow,
        flow=flow,
        flow_kwargs=val_flow_kwargs)
    # for debugging invertibility
    val_inverr_sym = tf.reduce_max(
        tf.abs(val_dequant_x_sym -
               flow.inverse(val_y_sym, **val_flow_kwargs)[0]))

    if is_root: print('===== Sampling graph =====')
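    # Invert the flow on Gaussian noise to generate samples; allgather concatenates
    # every worker's samples so the root process can log a single image grid.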
    samples_sym, _ = flow.inverse(
        tf.random_normal(y_sym.shape.as_list(), dtype=dtype),
        **val_flow_kwargs)
    allgathered_samples_sym = hvd.allgather(tf.to_float(samples_sym))
    assert len(tf.trainable_variables()) == len(params)

    def run_validation(sess, i_step):
        data_val_shard = np.array_split(data_val, hvd.size(),
                                        axis=0)[hvd.rank()]
        shard_losses, shard_inverrs = zip(*[
            sess.run([val_loss_sym, val_inverr_sym], {x_sym: val_batch})
            for val_batch, in iterbatches([data_val_shard],
                                          batch_size=local_bs,
                                          include_final_partial_batch=False)
        ])
        val_loss, total_count = mpi_average(shard_losses)
        inv_err, _ = mpi_average(shard_inverrs)
        samples = sess.run(allgathered_samples_sym)
        if is_root:
            logger.writekvs(
                [('val_bpd', bpd_scale_factor * val_loss),
                 ('val_inverr', inv_err),
                 ('num_val_examples', total_count * local_bs),
                 ('samples',
                  tile_imgs(np.clip(samples, 0, 255).astype(np.uint8)))],
                i_step)

    def run_sampling_only(sess, i_step):
        samples = sess.run(allgathered_samples_sym)
        if is_root:
            logger.writekvs(
                [('samples',
                  tile_imgs(np.clip(samples, 0, 255).astype(np.uint8)))],
                i_step)

    # Optimization
    lr_sym = tf.placeholder(dtype, [], 'lr')
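    # DistributedOptimizer wraps Adam so gradients are allreduce-averaged across workers before being applied.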
    optimizer = hvd.DistributedOptimizer(tf.train.AdamOptimizer(lr_sym))

    if scale_loss is None:
        grads_and_vars = optimizer.compute_gradients(loss_sym, var_list=params)
    else:
        grads_and_vars = [(g / scale_loss, v)
                          for (g, v) in optimizer.compute_gradients(
                              loss_sym * scale_loss, var_list=params)]

    if scale_grad is not None:
        grads_and_vars = [(g / scale_grad, v) for (g, v) in grads_and_vars]
    if max_grad_norm is not None:
        clipped_grads, grad_norm_sym = tf.clip_by_global_norm(
            [g for (g, _) in grads_and_vars], max_grad_norm)
        grads_and_vars = [
            (cg, v) for (cg, (_, v)) in zip(clipped_grads, grads_and_vars)
        ]
    else:
        grad_norm_sym = tf.constant(0.)
    opt_sym = tf.group(optimizer.apply_gradients(grads_and_vars),
                       maintain_averages_op)

    def loop(sess: tf.Session):
        i_step = 0

        if is_root: print('Initializing')
        sess.run(tf.global_variables_initializer())
        if restore_checkpoint is not None:
            # Restore from checkpoint. Set the step counter on every rank (not
            # just root) so the LR schedule and logging/sampling cadence stay in
            # sync; the restored parameters themselves are broadcast from rank 0 below.
            i_step = int(restore_checkpoint.split('-')[-1])
            if is_root:
                saver = tf.train.Saver()
                print('Restoring checkpoint:', restore_checkpoint)
                print('Restoring from step:', i_step)
                saver.restore(sess, restore_checkpoint)
            else:
                saver = None
        else:
            # No checkpoint: perform data dependent initialization
            if is_root: print('Data dependent init')
            init_loss = sess.run(
                init_loss_sym, {
                    x_init_sym:
                    data_train[np.random.randint(0, data_train.shape[0],
                                                 init_bs)]
                })
            if is_root: print('Init loss:', init_loss * bpd_scale_factor)
            sess.run(copy_params_to_ema)
            saver = tf.train.Saver() if is_root else None
        if is_root: print('Broadcasting initial parameters')
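        # Broadcast rank 0's variables so every worker starts from identical parameters,
        # whether they came from data-dependent init or from the restored checkpoint.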
        sess.run(hvd.broadcast_global_variables(0))
        sess.graph.finalize()

        if is_root:
            print('Training')

        loss_hist = deque(maxlen=steps_per_log)
        gnorm_hist = deque(maxlen=steps_per_log)
        for i_epoch in range(99999999999):
            if i_epoch % epochs_per_val == 0:
                run_validation(sess, i_step=i_step)
                if saver is not None:
                    saver.save(sess,
                               os.path.join(checkpointdir, 'model'),
                               global_step=i_step)

            epoch_start_t = time.time()
            for i_epoch_step, (batch, ) in enumerate(
                    iterbatches(  # non-sharded: each gpu goes through the whole dataset
                        [data_train],
                        batch_size=local_bs,
                        include_final_partial_batch=False,
                    )):

                if steps_per_extra_samples is not None and i_step % steps_per_extra_samples == 0:
                    run_sampling_only(sess, i_step)

                lr = lr_schedule(i_step)
                loss, gnorm, _ = sess.run([loss_sym, grad_norm_sym, opt_sym], {
                    x_sym: batch,
                    lr_sym: lr
                })
                loss_hist.append(loss)
                gnorm_hist.append(gnorm)

                # Skip timing the very first step, which will be unusually slow due to TF initialization
                if i_epoch == i_epoch_step == 0:
                    epoch_start_t = time.time()

                if i_step % steps_per_log == 0:
                    loss_hist_means = MPI.COMM_WORLD.gather(
                        float(np.mean(loss_hist)), root=0)
                    gnorm_hist_means = MPI.COMM_WORLD.gather(
                        float(np.mean(gnorm_hist)), root=0)
                    steps_per_sec = (i_epoch_step + 1) / (time.time() -
                                                          epoch_start_t)
                    if is_root:
                        kvs = [
                            ('iter', i_step),
                            ('epoch', i_epoch + i_epoch_step * local_bs /
                             data_train.shape[0]),  # epoch for this gpu
                            ('bpd',
                             float(
                                 np.mean(loss_hist_means) * bpd_scale_factor)),
                            ('gnorm', float(np.mean(gnorm_hist_means))),
                            ('lr', float(lr)),
                            ('fps', steps_per_sec * total_bs
                             ),  # fps calculated over all gpus (this epoch)
                            ('sps', steps_per_sec),
                        ]
                        logger.writekvs(kvs, i_step)
                i_step += 1
            # End of epoch

    # Train
    config = tf.ConfigProto()
    # config.log_device_placement = True
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(
        hvd.local_rank())  # Pin GPU to local rank (one GPU per process)
    with tf.Session(config=config) as sess:
        loop(sess)
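The validation code above relies on the mpi_average helper returned by setup_horovod(), which is not shown in these examples. Below is a minimal sketch of what such a helper could look like, under the assumption that it returns both the cross-rank mean and the total number of contributing values (as the (val_loss, total_count) unpacking suggests); the implementation here is illustrative, not the original.

def mpi_average(local_values):
    # Hypothetical helper: average per-shard scalars over all MPI ranks
    # and also report how many values contributed in total.
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    total_sum = comm.allreduce(float(sum(local_values)), op=MPI.SUM)
    total_count = comm.allreduce(len(local_values), op=MPI.SUM)
    return total_sum / max(total_count, 1), total_count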
Example no. 34
def evaluate(*,
             flow_constructor,
             seed,
             restore_checkpoint,
             total_bs=1024,
             iw_samples=4096,
             dtype=tf.float32,
             dataset='cifar10',
             samples_filename=None):
    hvd, MPI, is_root, mpi_average = setup_horovod()

    restore_checkpoint = os.path.expanduser(restore_checkpoint)

    # Seeding and logging setup
    seed_all(hvd.rank() + hvd.size() * seed)
    assert total_bs % hvd.size() == 0
    local_bs = total_bs // hvd.size()
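    # Each pass over all workers yields total_bs importance samples,
    # so iw_samples must be a multiple of total_bs.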
    assert iw_samples % total_bs == 0

    if is_root:
        print('===== EVALUATING {} ({} IW samples) ====='.format(
            restore_checkpoint, iw_samples))

    # Load data
    if is_root:
        # Load once on root first to prevent downloading conflicts
        print('Loading data')
        load_data(dataset=dataset, dtype=dtype.as_numpy_dtype)
    MPI.COMM_WORLD.Barrier()
    data_train, data_val = load_data(dataset=dataset,
                                     dtype=dtype.as_numpy_dtype)
    img_shp = list(data_train.shape[1:])
    if is_root:
        print('Training data: {}, Validation data: {}'.format(
            data_train.shape[0], data_val.shape[0]))
        print('Image shape:', img_shp)
    bpd_scale_factor = 1. / (np.log(2) * np.prod(img_shp))

    # Build graph
    if is_root: print('Building graph')
    dequant_flow, flow = flow_constructor()
    x_sym = tf.placeholder(dtype, [local_bs] + img_shp)
    # Dummy training graph: it only mirrors flow_training's variable structure so the saver can restore checkpoints into it
    build_forward(x=x_sym,
                  dequant_flow=dequant_flow,
                  flow=flow,
                  flow_kwargs=dict(vcfg=VarConfig(init=False,
                                                  ema=None,
                                                  dtype=dtype),
                                   dropout_p=0,
                                   verbose=is_root)
                  # note dropout is 0: it doesn't matter
                  )

    # EMA
    params = tf.trainable_variables()
    if is_root:
        print('Parameters',
              sum(np.prod(p.get_shape().as_list()) for p in params))
    ema = tf.train.ExponentialMovingAverage(
        decay=0.9999999999999)  # ema turned off
    maintain_averages_op = tf.group(ema.apply(params))

    # Validation and sampling (with EMA)
    if is_root: print('===== Validation graph =====')
    val_flow_kwargs = dict(vcfg=VarConfig(init=False, ema=ema, dtype=dtype),
                           dropout_p=0,
                           verbose=is_root)
    val_dequant_x_sym, val_y_sym, val_loss_sym, val_logratio_sym = build_forward(
        x=x_sym,
        dequant_flow=dequant_flow,
        flow=flow,
        flow_kwargs=val_flow_kwargs)
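    # Gather each worker's per-example log importance ratios; used below for the IWAE bound.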
    allgathered_val_logratios_sym = hvd.allgather(val_logratio_sym)
    # for debugging invertibility
    val_inverr_sym = tf.reduce_max(
        tf.abs(val_dequant_x_sym -
               flow.inverse(val_y_sym, **val_flow_kwargs)[0]))

    if is_root: print('===== Sampling graph =====')
    samples_sym, _ = flow.inverse(
        tf.random_normal(val_y_sym.shape.as_list(), dtype=dtype),
        **val_flow_kwargs)
    allgathered_samples_sym = hvd.allgather(tf.to_float(samples_sym))
    assert len(tf.trainable_variables()) == len(params)

    def run_iw_eval(sess):
        if is_root:
            print('Running IW eval with {} samples...'.format(iw_samples))
        # Go through one example at a time
        all_val_losses = []
        for i_example in (trange if is_root else range)(len(data_val)):
            # take this single example and tile it
            batch_x = np.tile(data_val[i_example, None, ...],
                              (local_bs, 1, 1, 1))
            # repeatedly evaluate logd for the IWAE bound
            batch_logratios = np.concatenate([
                sess.run(allgathered_val_logratios_sym, {x_sym: batch_x})
                for _ in range(iw_samples // total_bs)
            ]).astype(np.float64)
            assert batch_logratios.shape == (iw_samples, )
            # log [1/n \sum_i exp(r_i)] = log [exp(-b) 1/n \sum_i exp(r_i + b)] = -b + log [1/n \sum_i exp(r_i + b)]
            shift = batch_logratios.max()
            all_val_losses.append(
                -bpd_scale_factor *
                (shift + np.log(np.mean(np.exp(batch_logratios - shift)))))
            if i_example % 100 == 0 and is_root:
                print(i_example, np.mean(all_val_losses))
        if is_root:
            print(f'Final ({len(data_val)}):', np.mean(all_val_losses))

    def run_standard_eval(sess):
        if is_root:
            print('Running standard eval...')
        # Standard validation (single sample)
        data_val_shard = np.array_split(data_val, hvd.size(),
                                        axis=0)[hvd.rank()]
        shard_losses, shard_inverrs = zip(*[
            sess.run([val_loss_sym, val_inverr_sym], {x_sym: val_batch})
            for val_batch, in iterbatches([data_val_shard],
                                          batch_size=local_bs,
                                          include_final_partial_batch=False)
        ])
        val_loss, total_count = mpi_average(shard_losses)
        inv_err, _ = mpi_average(shard_inverrs)
        if is_root:
            for k, v in [
                ('val_bpd', bpd_scale_factor * val_loss),
                ('val_inverr', inv_err),
                ('num_val_examples', total_count * local_bs),
            ]:
                print(k, v)

    def run_sampling_only(sess):
        samples = sess.run(allgathered_samples_sym)
        # # warmup a few times
        # for _ in range(10):
        #     sess.run(allgathered_samples_sym)
        # # start timing
        # trials = 100
        # tstart = time.time()
        # for _ in range(trials):
        #     samples = sess.run(allgathered_samples_sym)
        # sample_time = (time.time() - tstart) / trials

        if is_root:
            from PIL import Image
            Image.fromarray(
                tile_imgs(np.clip(samples, 0, 255).astype(
                    np.uint8))).save(samples_filename)
            print('Saved {} samples to {}'.format(len(samples),
                                                  samples_filename))
            # print('Sampled in {} seconds'.format(sample_time))

    # Run
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(
        hvd.local_rank())  # Pin GPU to local rank (one GPU per process)
    with tf.Session(config=config) as sess:
        if is_root: print('Initializing')
        sess.run(tf.global_variables_initializer())
        # Restore from checkpoint
        if is_root:
            print('Restoring checkpoint:', restore_checkpoint)
            saver = tf.train.Saver()
            saver.restore(sess, restore_checkpoint)
            print('Broadcasting initial parameters')
        sess.run(hvd.broadcast_global_variables(0))
        sess.graph.finalize()

        if samples_filename:
            run_sampling_only(sess)

        # Make sure data is the same on all MPI processes
        tmp_inds = [0, 183, 3, 6, 20, 88]
        check_batch = np.ascontiguousarray(data_val[tmp_inds])
        gathered_batches = np.zeros(
            (hvd.size(),
             *check_batch.shape), check_batch.dtype) if is_root else None
        MPI.COMM_WORLD.Gather(check_batch, gathered_batches, root=0)
        if is_root:
            assert all(
                np.allclose(check_batch, b)
                for b in gathered_batches), 'data must be in the same order!'
            print('data ordering ok')

        # Run validation
        run_standard_eval(sess)
        run_iw_eval(sess)
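The comment inside run_iw_eval spells out the usual log-mean-exp shift used to keep the importance-weighted bound numerically stable. Here is a small standalone sketch of that computation on dummy log-ratios; the function and variable names are illustrative, not part of the original code.

import numpy as np

def log_mean_exp(logratios):
    # log(1/n * sum_i exp(r_i)), computed stably by subtracting the max first
    shift = logratios.max()
    return shift + np.log(np.mean(np.exp(logratios - shift)))

r = np.random.randn(4096).astype(np.float64)  # e.g. 4096 IW log-ratios for one example
iwae_nats = log_mean_exp(r)                   # importance-weighted log-likelihood bound in nats
# bits per dim would then be -iwae_nats * bpd_scale_factor, as in run_iw_eval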
Example no. 35
def allgather(backend, value, name):
    return _eval(backend, hvd.allgather(tf.constant(value, name=name)))
Example no. 36
def allgather(backend, value, name):
    allgather_op = hvd.allgather(tf.constant(value, name=name))
    return backend.get_session().run(allgather_op)
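For reference, a minimal standalone sketch of what these two wrappers boil down to in graph-mode Horovod, assuming hvd.init() has been called and the script is launched with horovodrun/mpirun with one process per GPU (all names below are illustrative):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())  # pin one GPU per process
with tf.Session(config=config) as sess:
    # Each rank contributes a 1-element tensor; allgather concatenates along axis 0,
    # so every rank receives [0, 1, ..., size - 1].
    gathered = sess.run(hvd.allgather(tf.constant([hvd.rank()], name='rank_id')))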