Example #1
def default_strategy(in_cross_check=False):
    # Return the current tf.distribute strategy (this is the implicit default
    # strategy when no explicit strategy scope is active).
    from tensorflow.distribute import get_strategy, in_cross_replica_context
    if in_cross_check:
        # Only return the strategy when running in cross-replica context.
        if in_cross_replica_context():
            return get_strategy()
    else:
        return get_strategy()
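A minimal usage sketch (not part of the original example; it assumes TensorFlow 2.x and uses `MirroredStrategy` purely for illustration) showing how `default_strategy` behaves outside and inside a strategy scope:

import tensorflow as tf

# Outside any scope, the implicit default strategy is returned.
strategy = default_strategy()

# Inside a MirroredStrategy scope, the active strategy is returned instead.
mirrored = tf.distribute.MirroredStrategy()
with mirrored.scope():
    assert default_strategy() is mirrored
    # Within scope() (and outside strategy.run) we are in cross-replica
    # context, so the in_cross_check=True path also returns the strategy.
    assert default_strategy(in_cross_check=True) is mirrored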
Example #2
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        summed_grads_and_vars = []
        for (grad, var) in grads_and_vars:
            if grad is None:
                summed_grads_and_vars.append((grad, var))
            else:
                with ops.colocate_with(grad):
                    # gradient accumulation
                    if self._gradients_to_accumulate > 1 and not self._pipelining:
                        grad = gen_poputil_ops.ipu_stateful_gradient_accumulate(
                            grad / self._gradients_to_accumulate,
                            num_mini_batches=self._gradients_to_accumulate)

                    # replication
                    if self._replicas > 1:
                        grad = gen_poputil_ops.ipu_replication_normalise(
                            cross_replica_ops.cross_replica_sum(grad))

                    # distribution
                    if distribute.has_strategy():
                        grad /= distribute.get_strategy().num_replicas_in_sync

                    grad = math_ops.cast(grad, var.dtype)
                    summed_grads_and_vars.append((grad, var))

        if self._pipelining:
            # can do weight decay here as apply_gradients is only called on last accumulation step
            summed_grads_and_vars = self.add_WD(summed_grads_and_vars)

        ret = self._optimizer.apply_gradients(summed_grads_and_vars,
                                              global_step, name)
        if self._sharded:
            sharding.propagate_sharding(ops.get_default_graph())
        return ret
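A simplified sketch of the "distribution" normalisation step above, using only public `tf.distribute` APIs and no IPU-specific ops (the helper name is illustrative, not part of the original wrapper):

import tensorflow as tf

def normalise_for_distribution(grads_and_vars):
    # Divide each gradient by the number of in-sync replicas, if a strategy is active.
    if not tf.distribute.has_strategy():
        return grads_and_vars
    num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
    return [(g if g is None else g / num_replicas, v) for g, v in grads_and_vars]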
Example #3
    def CreateInstances(cls, *args, **kwargs):
        if not has_strategy():
            return EmbeddingVariable(local_replica_id=0, *args, **kwargs)

        strategy = get_strategy()
        strategy_extended = strategy.extended
        devices = strategy_extended._devices

        value_list = []
        for i, d in enumerate(devices):
            with ops.device(d):
                if i > 0:
                    name = value_list[0].name.split(":")[0]
                    kwargs["name"] = "%s/replica_%d/" % (name, i)
                with context.device_policy(context.DEVICE_PLACEMENT_SILENT):
                    with tape.stop_recording():
                        v = EmbeddingVariable(local_replica_id=i,
                                              *args,
                                              **kwargs)
                value_list.append(v)

        # TODO: check whether the aggregation or synchronization setting impacts performance.
        return DistributedVariable(
            strategy=strategy,
            values=value_list,
            aggregation=VariableAggregation.ONLY_FIRST_REPLICA,
            var_policy=VariableSynchronization.NONE)
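The pattern above creates one copy of the variable per device and wraps the copies in a `DistributedVariable`. A simplified sketch of the per-device creation loop (illustrative only; a plain `tf.Variable` stands in for `EmbeddingVariable`, and the helper name is made up):

import tensorflow as tf

def create_per_device_values(initial_value, devices):
    # One variable copy per device; replicas after the first get distinct names.
    value_list = []
    for i, d in enumerate(devices):
        with tf.device(d):
            name = "var" if i == 0 else "var/replica_%d" % i
            value_list.append(tf.Variable(initial_value, name=name))
    return value_list

# Example: one copy per logical device reported by the runtime.
devices = [d.name for d in tf.config.list_logical_devices()]
values = create_per_device_values(tf.zeros([4, 8]), devices)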
Example #4
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        summed_grads_and_vars = []
        for (grad, var) in grads_and_vars:
            if grad is None:
                summed_grads_and_vars.append((grad, var))
            else:
                with ops.colocate_with(grad):
                    # gradient accumulation
                    if self._gradient_accumulation_count > 1 and not self._pipelining:
                        grad = gen_poputil_ops.ipu_stateful_gradient_accumulate(
                            grad,
                            num_mini_batches=self._gradient_accumulation_count)

                    # replication
                    if self._replicas > 1:
                        grad = gen_poputil_ops.ipu_replication_normalise(
                            cross_replica_ops.cross_replica_sum(grad))

                    # distribution with IPUMultiWorkerStrategy needs additional normalisation by the number of workers
                    if isinstance(
                            distribute.get_strategy(),
                            ipu_multi_worker_strategy.IPUMultiWorkerStrategy):
                        grad /= distribute.get_strategy().num_replicas_in_sync

                    grad = math_ops.cast(grad, var.dtype)
                    summed_grads_and_vars.append((grad, var))

        if self._pipelining:
            # can do weight decay here as apply_gradients is only called on last accumulation step
            summed_grads_and_vars = self.add_WD(summed_grads_and_vars)

        if self._grad_scale != 1.0:
            # don't rescale batch norm moving average statistics as they are not affected by loss scaling
            summed_grads_and_vars = [
                (grad, var) if 'batch_norm/moving_' in var.name else
                (grad / self._grad_scale, var)
                for grad, var in summed_grads_and_vars
            ]
        ret = self._optimizer.apply_gradients(summed_grads_and_vars,
                                              global_step, name)
        if self._sharded:
            sharding.propagate_sharding(ops.get_default_graph())
        return ret
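The main differences from Example #2 are the strategy-type check and the loss-scale handling. A standalone sketch of the unscaling step (hypothetical helper name; the batch-norm filter uses the same substring check as above):

def unscale_gradients(grads_and_vars, grad_scale):
    # Undo loss scaling, leaving batch-norm moving statistics untouched,
    # as they are not affected by loss scaling.
    if grad_scale == 1.0:
        return grads_and_vars
    return [(g, v) if 'batch_norm/moving_' in v.name else (g / grad_scale, v)
            for g, v in grads_and_vars]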
Example #5
def Init(**kwargs):
    """
    Abbreviated as ``sok.Init(**kwargs)``.

    This function initializes SparseOperationKit (SOK).

    SOK will leverage all available GPUs for the current CPU process. Please set
    `CUDA_VISIBLE_DEVICES` or `tf.config.set_visible_devices` to specify which
    GPU(s) are used in this process before launching the TensorFlow runtime
    and calling this function.

    In **TensorFlow 2.x**, SOK can be used with **tf.distribute.Strategy** or **Horovod**. 
    When it's used with tf.distribute.Strategy, it must be called under `strategy.scope()`. 
    For example,

    .. code-block:: python
    
        with strategy.scope():
            sok.Init(**kwargs)

    When it's used with Horovod, it must be called in each process. For example,

    .. code-block:: python
    
        import horovod.tensorflow as hvd

        hvd.init()

        sok.Init(**kwargs)

    In **TensorFlow 1.15**, SOK can only work with **Horovod**. The returned status
    must be evaluated with `sess.run`, and this must be the first step before
    evaluating any other SOK APIs.

    .. code-block:: python

        sok_init = sok.Init(global_batch_size=args.global_batch_size)
        with tf.Session() as sess:
            sess.run(sok_init)
            ...

    Parameters
    ----------
    kwargs: dictionary
            keyword arguments for this function.
            Currently, it must contain `global_batch_size`, which is used by all GPUs.

    Returns
    -------
    status: string
            a string is returned if this function executes successfully,
            and its content will be 'OK'.
    """
    def _get_visible_devices():
        gpus = config.get_visible_devices('GPU')
        assert (len(gpus) > 0)
        visible_devices = []
        for i in range(len(gpus)):
            visible_devices.append(int(gpus[i].name.split(':')[-1]))
        return array_ops.constant(visible_devices, dtype=int32)

    @function
    def _single_worker_init(**kwargs):
        replica_ctx = get_replica_context()
        replica_ctx.merge_call(lambda strategy: tf_print(
            "You are using the plugin with MirroredStrategy."))
        nccl_unique_id = replica_ctx.merge_call(
            lambda strategy: kit_lib.get_nccl_unique_id())
        global_random_seed = kwargs.get(
            "seed", None) or replica_ctx.merge_call(
                lambda strategy: kit_lib.gen_random_seed())

        global_id = replica_ctx.replica_id_in_sync_group
        visible_devices = _get_visible_devices()
        status = kit_lib.plugin_init(
            global_id,
            replica_ctx.num_replicas_in_sync,
            nccl_unique_id,
            global_random_seed,
            visible_devices,
            global_batch_size=kwargs['global_batch_size'])
        return status

    def _multi_worker_init(**kwargs):
        replica_ctx = get_replica_context()
        global_id = replica_ctx.replica_id_in_sync_group
        if global_id == 0:
            unique_id = kit_lib.get_nccl_unique_id()
            re = collective_ops.broadcast_send(
                unique_id,
                TensorShape([
                    32,
                ]),
                int32,
                group_size=replica_ctx.num_replicas_in_sync,
                group_key=1,
                instance_key=2)
        else:
            re = collective_ops.broadcast_recv(
                TensorShape([
                    32,
                ]),
                int32,
                group_size=replica_ctx.num_replicas_in_sync,
                group_key=1,
                instance_key=2)
        if global_id == 0:
            global_seed = kwargs.get("seed", None) or kit_lib.gen_random_seed()
            re_seed = collective_ops.broadcast_send(
                global_seed,
                TensorShape([
                    1,
                ]),
                int64,
                group_size=replica_ctx.num_replicas_in_sync,
                group_key=1,
                instance_key=3)
        else:
            global_seed = kwargs.get("seed", None)
            re_seed = collective_ops.broadcast_recv(
                TensorShape([
                    1,
                ]),
                int64,
                group_size=replica_ctx.num_replicas_in_sync,
                group_key=1,
                instance_key=3)

            if (global_seed and global_seed != re_seed):
                logging.warning(
                    "The seed {} is not consistent with the seed {} from the "
                    "chief node; the seed from the chief node will be used.".format(
                        global_seed, re_seed))

        visible_devices = _get_visible_devices()
        status = kit_lib.plugin_init(
            global_id,
            replica_ctx.num_replicas_in_sync,
            re,
            re_seed,
            visible_devices,
            global_batch_size=kwargs['global_batch_size'])
        return status

    # @function
    def _horovod_init(**kwargs):
        r"""
        This function uses horovod to broadcast nccl-id and random-seed which is used by sparse_operation_kit.
        Please note that the nccl-comm mentioned here is not the same one as the nccl-comm of horovod itself.

        After broadcasting, this function uses kit_lib.plugin_init and "nccl-id", "random-seed" to initialize 
        sparse_operation_kit.
        """
        local_rank = hvd.local_rank()

        unique_id = kit_lib.get_nccl_unique_id(
        ) if local_rank == 0 else array_ops.zeros([
            32,
        ], dtype=int32)
        unique_id = hvd.broadcast(unique_id,
                                  root_rank=0,
                                  name="nccl_unique_id")

        seed = kwargs.get("seed", None)
        if 0 == local_rank:
            global_seed = seed or kit_lib.gen_random_seed()
        else:
            global_seed = array_ops.zeros([
                1,
            ], dtype=int64)
        re_seed = hvd.broadcast(global_seed, root_rank=0, name="random_seed")
        if (seed and seed != re_seed):
            logging.warning(
                "The seed {} is not consistent with the seed {} from the "
                "chief node; the seed from the chief node will be used.".format(
                    global_seed, re_seed))

        visible_devices = _get_visible_devices()
        status = kit_lib.plugin_init(
            local_rank,
            hvd.size(),
            unique_id,
            re_seed,
            visible_devices,
            global_batch_size=kwargs["global_batch_size"])
        return status

    def _one_device_init(**kwargs):
        """
        This function use to initialize only one GPU for SOK.
        """
        local_rank = 0
        unique_id = kit_lib.get_nccl_unique_id()
        global_seed = kwargs.get("seed", None) or kit_lib.gen_random_seed()
        visible_devices = _get_visible_devices()
        status = kit_lib.plugin_init(
            local_rank,
            1,
            unique_id,
            global_seed,
            visible_devices,
            global_batch_size=kwargs["global_batch_size"])
        return status

    if has_strategy():
        strategy = get_strategy()

        @function
        def _init_wrapper(run_fn, init_fn, **kwargs):
            return run_fn(init_fn, kwargs=kwargs)

        if isinstance(strategy, MirroredStrategy):
            _init_fn = _single_worker_init
        elif isinstance(strategy, MultiWorkerMirroredStrategy):
            _init_fn = _multi_worker_init
        else:
            raise RuntimeError("This strategy type is not supported yet.")

        if not kit_lib.in_tensorflow2():
            _init_results = _init_wrapper(strategy.experimental_run_v2,
                                          _init_fn, **kwargs)
            if hasattr(_init_results, "values"):
                _init_results = _init_results.values
            return _init_results
        else:
            return _init_wrapper(strategy.run, _init_fn, **kwargs)

    elif "horovod.tensorflow" in sys.modules:
        # imported horovod
        import horovod.tensorflow as hvd

        if not kit_lib.in_tensorflow2():

            @function
            def _init_wrapper(**kwargs):
                return _horovod_init(**kwargs)

            return _init_wrapper(**kwargs)
        else:
            return _horovod_init(**kwargs)
    else:
        # horovod not imported
        return _one_device_init(**kwargs)
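The Horovod path above reduces to a chief-to-workers broadcast of the NCCL unique id and the random seed. A minimal sketch of that broadcast pattern, independent of SOK (assuming Horovod is installed and the script is launched with `horovodrun`):

import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()
if hvd.rank() == 0:
    seed = tf.constant([1234], dtype=tf.int64)  # the chief proposes a seed
else:
    seed = tf.zeros([1], dtype=tf.int64)        # placeholder with the same shape/dtype
# Every rank makes a matching broadcast call; afterwards all ranks hold rank 0's value.
seed = hvd.broadcast(seed, root_rank=0, name="random_seed")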
Example #6
    def __init__(self,
                 shape,
                 local_replica_id,
                 initializer=None,
                 trainable=True,
                 use_hashtable=True,
                 name="EmbeddingVariable",
                 dtype=None,
                 key_dtype=None,
                 *args,
                 **kwargs):
        if (not isinstance(shape, list)) or (len(shape) != 2):
            raise ValueError("shape_per_gpu must be a list which represents: "+\
                             "[vocabulary_size_per_gpu, embedding_vector_size].")
        self.m_shape_per_gpu = TensorShape(shape)
        self.m_local_replica_id = local_replica_id
        self.m_initializer = initializer or InPlaceInitializer(
            name="random_uniform")
        if not isinstance(self.m_initializer, InPlaceInitializer):
            self.m_initializer = tf_initializers.get(self.m_initializer)
        self.m_trainable = trainable
        self.m_use_hashtable = use_hashtable
        self.m_embedding_layer = None
        self.m_dtype = dtype or dtypes.float32
        self.m_key_dtype = key_dtype or dtypes.int64
        # produce initial_value
        if isinstance(self.m_initializer, InPlaceInitializer):
            # TODO: serialize it
            self.m_initial_value = self.m_initializer.name
        else:
            self.m_initial_value = self.m_initializer(
                shape=self.m_shape_per_gpu, dtype=self.m_dtype)

        collections = [ops.GraphKeys.GLOBAL_VARIABLES]
        if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
            collections = list(collections) + [
                ops.GraphKeys.TRAINABLE_VARIABLES
            ]

        with ops.init_scope():
            self._in_graph_mode = not context.executing_eagerly()
            with ops.name_scope(name) as var_name_scope:
                # TODO: use a regular expression
                while var_name_scope[-1] == r"/":
                    var_name_scope = var_name_scope[:-1]
                var_name = var_name_scope
                self.m_var_name = var_name
                self.m_unique_id = "%s_%d" % (var_name, ops.uid())

                # attr = resource_variable_ops.attr_value_pb2.AttrValue(
                #     list=resource_variable_ops.attr_value_pb2.AttrValue.ListValue(
                #         s=[resource_variable_ops.compat.as_bytes("loc:@%s" % self.m_var_name)]))

                # with ops.get_default_graph()._attr_scope({"_class": attr}):
                with ops.NullContextmanager():
                    # m_handle is the handle to EmbeddingVariable, tf_handle is the handle to TF Var.
                    self.m_handle, self.tf_handle = kit_lib.create_var(
                        var_name=var_name,
                        dtype=self.m_dtype,
                        shape=self.m_shape_per_gpu)

                    if self._in_graph_mode:
                        with ops.name_scope("IsInitialized"):
                            self._is_initialized_op = ops.convert_to_tensor(
                                True)  # TODO: should not be hard-coded???

                            if (isinstance(self.m_initial_value, ops.Tensor)
                                    and not self.m_initial_value.shape.
                                    is_compatible_with(self.m_shape_per_gpu)):
                                raise ValueError(
                                    "The initial value's shape (%s) is not compatible with "
                                    "the explicitly supplied `shape` argument (%s)."
                                    % (self.m_initial_value.shape,
                                       self.m_shape_per_gpu))

                            _init_op = kit_lib.assign_embedding_variable(
                                emb_var_handle=self.m_handle,
                                tf_var_handle=self.tf_handle,
                                var_name=var_name,
                                initial_value=self.m_initial_value,
                                local_replica_id=self.m_local_replica_id,
                                trainable=self.m_trainable,
                                shape=self.m_shape_per_gpu,
                                use_hashtable=self.m_use_hashtable,
                                dtype=self.m_dtype,
                                key_dtype=self.m_key_dtype)
                            self._initializer_op = control_flow_ops.group(
                                (_init_op))
                    else:
                        raise RuntimeError(
                            "Currently, EmbeddingVariable does not support Eager mode."
                        )

                    if not context.executing_eagerly():
                        ops.add_to_collections(collections, self)

            super(EmbeddingVariable, self).__init__(
                trainable=self.m_trainable,
                shape=self.m_shape_per_gpu,
                dtype=self.m_dtype,
                handle=self.m_handle,
                handle_name=var_name,
                distribute_strategy=get_strategy() if has_strategy() else None,
                synchronization=VariableSynchronization.NONE,
                aggregation=VariableAggregation.ONLY_FIRST_REPLICA,
                unique_id=self.m_unique_id,
                initializer_op=self._initializer_op,
                is_initialized_op=self._is_initialized_op,
                *args,
                **kwargs)
            handle_data = resource_variable_ops.cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData(
            )
            handle_data.is_set = True
            handle_data.shape_and_type.append(
                resource_variable_ops.cpp_shape_inference_pb2.
                CppShapeInferenceResult.HandleShapeAndType(
                    shape=self.shape.as_proto(),
                    dtype=self.dtype.as_datatype_enum))
            resource_variable_ops._set_handle_shapes_and_types(
                self.m_handle,
                handle_data,
                graph_mode=False if context.executing_eagerly() else True)
            resource_variable_ops._set_handle_shapes_and_types(
                self.tf_handle,
                handle_data,
                graph_mode=False if context.executing_eagerly() else True)
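A small aside on the shape check performed in the constructor (a sketch only, using the public `tf.TensorShape` API): `is_compatible_with` treats unknown dimensions as wildcards, so only concrete mismatches trigger the `ValueError` above.

import tensorflow as tf

shape_per_gpu = tf.TensorShape([1024, 16])
print(tf.TensorShape([1024, 16]).is_compatible_with(shape_per_gpu))  # True
print(tf.TensorShape([None, 16]).is_compatible_with(shape_per_gpu))  # True  (unknown dim matches)
print(tf.TensorShape([1024, 32]).is_compatible_with(shape_per_gpu))  # False (concrete mismatch)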
Example #7
    def __init__(self,
                 shape,
                 local_replica_id,
                 initializer=None,
                 trainable=True,
                 use_hashtable=True,
                 name="EmbeddingVariable",
                 dtype=None,
                 key_dtype=None,
                 *args,
                 **kwargs):
        if (not isinstance(shape, list)) or (len(shape) != 2):
            raise ValueError("shape_per_gpu must be a list which represents: "+\
                             "[vocabulary_size_per_gpu, embedding_vector_size].")
        self.m_shape_per_gpu = TensorShape(shape)
        self.m_local_replica_id = local_replica_id
        self.m_initializer = initializer or InPlaceInitializer(name="random_uniform")
        if not isinstance(self.m_initializer, InPlaceInitializer):
            self.m_initializer = tf_initializers.get(self.m_initializer)
        self.m_trainable = trainable
        self.m_use_hashtable = use_hashtable
        self.m_embedding_layer = None
        self.m_dtype = dtype or dtypes.float32
        self.m_key_dtype = key_dtype or dtypes.int64
        # produce initial_value
        if isinstance(self.m_initializer, InPlaceInitializer):
            # TODO: serialize it
            self.m_initial_value = self.m_initializer.name
        else:
            self.m_initial_value = self.m_initializer(shape=self.m_shape_per_gpu, dtype=self.m_dtype)

        with ops.init_scope():
            with ops.name_scope(name):
                self.m_var_name = self._gen_unique_name(name)
                self.m_unique_id = "%s_%d" %(self.m_var_name, ops.uid())

                # m_handle is the handle to EmbeddingVariable, tf_handle is the handle to TF Var.
                self.m_handle, self.tf_handle = kit_lib.create_var(
                                            var_name=self.m_var_name,
                                            dtype=self.m_dtype,
                                            shape=self.m_shape_per_gpu)

                with ops.name_scope("IsInitialized"):
                    self._is_initialized_op = ops.convert_to_tensor(True)

                    if (isinstance(self.m_initial_value, ops.Tensor) and
                        not self.m_initial_value.shape.is_compatible_with(self.m_shape_per_gpu)):
                        raise ValueError("The initial value's shape (%s) is not compatible with "
                                         "the explicitly supplied `shape` argument (%s)." %
                                         (self.m_initial_value.shape, self.m_shape_per_gpu))

                    _init_op = kit_lib.assign_embedding_variable(emb_var_handle=self.m_handle,
                                                            tf_var_handle=self.tf_handle,
                                                            var_name=self.m_var_name,
                                                            initial_value=self.m_initial_value,
                                                            local_replica_id=self.m_local_replica_id,
                                                            trainable=self.m_trainable,
                                                            shape=self.m_shape_per_gpu,
                                                            use_hashtable=self.m_use_hashtable,
                                                            dtype=self.m_dtype,
                                                            key_dtype=self.m_key_dtype)
                    self._initializer_op = control_flow_ops.group((_init_op))

            super(EmbeddingVariable, self).__init__(trainable=self.m_trainable,
                                                    shape=self.m_shape_per_gpu,
                                                    dtype=self.m_dtype,
                                                    handle=self.m_handle,
                                                    handle_name=self.m_var_name,
                                                    distribute_strategy=get_strategy() if has_strategy() else None,
                                                    synchronization=VariableSynchronization.NONE,
                                                    aggregation=VariableAggregation.ONLY_FIRST_REPLICA,
                                                    unique_id=self.m_unique_id,
                                                    *args, **kwargs)

            handle_data = resource_variable_ops.cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData()
            handle_data.is_set = True
            handle_data.shape_and_type.append(
                resource_variable_ops.cpp_shape_inference_pb2.CppShapeInferenceResult.HandleShapeAndType(
                    shape=self.shape.as_proto(), dtype=self.dtype.as_datatype_enum))
            resource_variable_ops._set_handle_shapes_and_types(self.m_handle, handle_data, 
                graph_mode=False if context.executing_eagerly() else True)
            resource_variable_ops._set_handle_shapes_and_types(self.tf_handle, handle_data, 
                graph_mode=False if context.executing_eagerly() else True)