Python allreduce Examples, npu_bridge.hccl.hccl_ops.allreduce Python Examples

Example #1

0

Show file

File: layers.py Project: zjuptian/GoogleNet_Modelarts

 def get_accuracy(self, labels, predicted_classes, logits, args):
     """Build the top-1 and top-5 validation metrics.

     A ``tf.metrics.accuracy`` metric is created over ``predicted_classes``
     and a ``tf.metrics.mean`` metric over the float-cast result of
     ``tf.nn.in_top_k(logits, labels, 5)``.  When ``args.rank_size`` is not
     1, the value tensor of each metric is summed across ranks with
     ``hccl_ops.allreduce`` and divided by ``args.rank_size``; the update
     ops are passed through unchanged.

     Returns:
         dict mapping ``'val-top1acc'`` and ``'val-top5acc'`` to
         ``(value, update_op)`` tuples.
     """
     top1_value, top1_update = tf.metrics.accuracy(
         labels=labels, predictions=predicted_classes)
     top5_hits = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
     top5_value, top5_update = tf.metrics.mean(top5_hits)

     if args.rank_size != 1:
         # Imported lazily so single-rank runs do not need npu_bridge.
         from npu_bridge.hccl import hccl_ops
         top1_value = hccl_ops.allreduce(top1_value, "sum") / args.rank_size
         top5_value = hccl_ops.allreduce(top5_value, "sum") / args.rank_size

     return {
         'val-top1acc': (top1_value, top1_update),
         'val-top5acc': (top5_value, top5_update),
     }

Example #2

0

Show file

File: npu_optimizer.py Project: Judithsq/tensorflow

def allreduce(tensor, average=True):
    """
    Reduce `tensor` across all ranks, as a sum or as an average.

    Arguments:
        tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce.
        The shape of the input must be identical across all ranks.
        average: If True, the reduced result is divided by the rank count
                 reported by `NPUBasics("").size()`. Otherwise the
                 undivided result is returned.

    Dense inputs are summed with `hccl_ops.allreduce(tensor, "sum")`. A
    tf.IndexedSlices input is handled differently: its values and indices
    are each all-gathered, and a new tf.IndexedSlices is built from the
    gathered pieces with the original `dense_shape`.

    Raises:
        ValueError: if any of the HCCL collectives returns None.
    """
    world_size = NPUBasics("").size()

    if not isinstance(tensor, tf.IndexedSlices):
        # Dense path: one allreduce, then an optional division.
        logging.debug("HcomAllReduce...")
        reduced = hccl_ops.allreduce(tensor, "sum")
        if reduced is None:
            raise ValueError(
                'the result of tf.DavinciAllreduce([tensor]) is empty')

        divisor = tf.cast(world_size, dtype=tensor.dtype)
        if average:
            return tf.div(reduced, divisor)
        return reduced

    # Sparse path: two allgathers (values, indices) instead of an allreduce.
    logging.debug("HcomAllgather...")
    gathered_values = hccl_ops.allgather(tensor.values, world_size)
    gathered_indices = hccl_ops.allgather(tensor.indices, world_size)

    if gathered_values is None:
        raise ValueError(
            'the result of tf.HcomAllgather([tensor.values]) is empty')
    if gathered_indices is None:
        raise ValueError(
            'the result of tf.HcomAllgather([tensor.indices]) is empty')

    # Dividing every gathered value by the rank count turns the implied sum
    # into an average.
    divisor = tf.cast(world_size, tensor.values.dtype)
    if average:
        gathered_values = tf.div(gathered_values, divisor)

    return tf.IndexedSlices(gathered_values,
                            gathered_indices,
                            dense_shape=tensor.dense_shape)

Example #3

0

Show file

File: npu_optimizer.py Project: Ascend/tensorflow

def _npu_allreduce(values,
                   reduction="mean",
                   fusion=1,
                   fusion_id=-1,
                   group="hccl_world_group"):
    """Reduce every entry of `values` across the ranks of `group`.

    Arguments:
        values: iterable of tensors and/or tf.IndexedSlices.
        reduction: reduction name forwarded to `hccl_ops.allreduce`. The
            value "mean" is translated to "sum" followed by a division by
            the rank count.
        fusion, fusion_id, group: forwarded unchanged to
            `hccl_ops.allreduce`; `group` is also forwarded to
            `hccl_ops.allgather`.

    The rank count is read from the RANK_SIZE environment variable
    (default "1"). tf.IndexedSlices entries are not allreduced; their
    values and indices are all-gathered and re-wrapped with the original
    `dense_shape`.

    Returns:
        list with one reduced entry per input entry, in input order.

    Raises:
        ValueError: if an HCCL collective returns None.
    """
    want_mean = reduction == "mean"
    if want_mean:
        reduction = "sum"

    rank_count = int(os.getenv("RANK_SIZE", "1"))
    results = []
    for item in values:
        if not isinstance(item, tf.IndexedSlices):
            # Dense entry: a single allreduce, optionally averaged.
            reduced = hccl_ops.allreduce(item, reduction, fusion,
                                         fusion_id, group)
            if reduced is None:
                raise ValueError(
                    'the result of tf.DavinciAllreduce([tensor]) is empty')

            divisor = tf.cast(rank_count, dtype=item.dtype)
            if want_mean:
                reduced = tf.div(reduced, divisor)
            results.append(reduced)
            continue

        # Sparse entry: two allgathers instead of an allreduce.
        gathered_values = hccl_ops.allgather(item.values, rank_count, group)
        gathered_indices = hccl_ops.allgather(item.indices, rank_count, group)

        if gathered_values is None:
            raise ValueError(
                'the result of tf.HcomAllgather([value.values]) is empty')
        if gathered_indices is None:
            raise ValueError(
                'the result of tf.HcomAllgather([value.indices]) is empty')

        # Dividing the gathered values by the rank count yields the average.
        divisor = tf.cast(rank_count, item.values.dtype)
        if want_mean:
            gathered_values = tf.div(gathered_values, divisor)

        results.append(
            tf.IndexedSlices(gathered_values,
                             gathered_indices,
                             dense_shape=item.dense_shape))
    return results

Example #4

0

Show file

File: npu_optimizer.py Project: Judithsq/tensorflow

    def _reduce_all(self, grads):
        """Set `self._is_overall_finite` from the NPU float-status buffer.

        After `grads` have been computed (enforced with a control
        dependency), the status held in `self._float_status` is fetched with
        `npu_get_float_status` and a reference value is produced with
        `npu_clear_float_status`. `self._is_overall_finite` becomes a scalar
        boolean tensor that is true when every element of the status equals
        the reference. In distributed mode the status is first summed across
        ranks with `hccl_ops.allreduce`.
        """
        graph = tf.get_default_graph()
        with graph.control_dependencies(grads):
            fetched_status = gen_npu_ops.npu_get_float_status(
                self._float_status)
            reference_status = gen_npu_ops.npu_clear_float_status(
                fetched_status)

        if not self._is_distributed:
            unchanged = tf.equal(self._float_status, reference_status)
            self._is_overall_finite = math_ops.reduce_all(unchanged)
            return

        # The cross-rank sum must not run before the local status was fetched.
        with graph.control_dependencies([fetched_status]):
            summed_status = hccl_ops.allreduce(
                [self._float_status], "sum", fusion=0)
            unchanged = tf.equal(summed_status, reference_status)
            self._is_overall_finite = math_ops.reduce_all(unchanged)

Example #5

0

Show file

    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Apply gradients. See base class `tf.compat.v1.train.Optimizer`.

        The wrapped optimizer's `apply_gradients` only runs when the NPU
        float status shows no change against its cleared reference (summed
        across ranks first when `self._is_distributed` is set); otherwise the
        variable update is a no-op. In both cases the loss-scale manager is
        updated with the finiteness flag.

        Args:
            grads_and_vars: iterable of (gradient, variable) pairs. One-shot
                iterables such as `zip(grads, variables)` are accepted.
            global_step: forwarded to the wrapped optimizer.
            name: forwarded to the wrapped optimizer.

        Returns:
            An op grouping the conditional variable update with the
            loss-scale update.
        """
        # The pairs are walked here to collect the gradients and again by the
        # wrapped optimizer. A one-shot iterable (e.g. `zip(...)` on Python 3)
        # would be exhausted by the first pass, so materialize it once.
        # Lists and tuples are passed through untouched.
        if not isinstance(grads_and_vars, (list, tuple)):
            grads_and_vars = list(grads_and_vars)

        grads = [g for g, _ in grads_and_vars if g is not None]

        with tf.get_default_graph().control_dependencies(grads):
            local_float_status = gen_npu_ops.npu_get_float_status(
                self._float_status)
            cleared_float_status = gen_npu_ops.npu_clear_float_status(
                local_float_status)

        if self._is_distributed:
            # Sum the status across ranks only after the local fetch has run.
            with tf.get_default_graph().control_dependencies(
                [local_float_status]):
                aggregated_float_status = hccl_ops.allreduce(
                    [self._float_status], "sum", fusion=0)
                is_overall_finite = math_ops.reduce_all(
                    tf.equal(aggregated_float_status, cleared_float_status),
                    name="overflow_status_reduce_all")
        else:
            is_overall_finite = math_ops.reduce_all(
                tf.equal(self._float_status, cleared_float_status),
                name="overflow_status_reduce_all")

        # Only update gradients when all grads are finite.
        def true_apply_gradients_fn():
            # TODO: Check should allreduce before or after _down_scale() ?
            # for now we are calling allreduce before _down_scale
            return self._opt.apply_gradients(grads_and_vars, global_step,
                                             name)

        update_vars = control_flow_ops.cond(is_overall_finite,
                                            true_apply_gradients_fn,
                                            gen_control_flow_ops.no_op)

        # Potentially adjust gradient scale in case of finite gradients.
        return control_flow_ops.group(
            update_vars,
            self._loss_scale_manager.update_loss_scale(is_overall_finite))

Example #6

0

Show file

    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Apply gradients. See base class `tf.compat.v1.train.Optimizer`.

        When `self._enable_overflow_check()` is true, a float-status buffer
        is allocated and the wrapped optimizer's `apply_gradients` only runs
        if the status shows no change against its cleared reference (summed
        across ranks first when `self._is_distributed` is set). When the
        check is disabled, the update always runs. In both cases the
        loss-scale manager is updated with the finiteness flag.

        Args:
            grads_and_vars: iterable of (gradient, variable) pairs. One-shot
                iterables such as `zip(grads, variables)` are accepted.
            global_step: forwarded to the wrapped optimizer.
            name: forwarded to the wrapped optimizer.

        Returns:
            An op grouping the conditional variable update with the
            loss-scale update.
        """
        # On the overflow-check path the pairs are walked here and again by
        # the wrapped optimizer. A one-shot iterable (e.g. `zip(...)` on
        # Python 3) would be exhausted by the first pass, so materialize it
        # once. Lists and tuples are passed through untouched.
        if not isinstance(grads_and_vars, (list, tuple)):
            grads_and_vars = list(grads_and_vars)

        if self._enable_overflow_check():
            with tf.name_scope(self._name):
                self._float_status = gen_npu_ops.npu_alloc_float_status()
            grads = [g for g, _ in grads_and_vars if g is not None]
            with tf.get_default_graph().control_dependencies(grads):
                local_float_status = gen_npu_ops.npu_get_float_status(self._float_status)
                cleared_float_status = gen_npu_ops.npu_clear_float_status(local_float_status)

            if self._is_distributed:
                # Sum the status across ranks only after the local fetch has run.
                with tf.get_default_graph().control_dependencies([local_float_status]):
                    aggregated_float_status = hccl_ops.allreduce([self._float_status], "sum", fusion=0)
                    is_overall_finite = math_ops.reduce_all(tf.equal(aggregated_float_status,
                                                                     cleared_float_status),
                                                            name="overflow_status_reduce_all")
            else:
                is_overall_finite = math_ops.reduce_all(tf.equal(self._float_status,
                                                                 cleared_float_status),
                                                        name="overflow_status_reduce_all")
        else:
            # Overflow checking disabled: always apply the update.
            is_overall_finite = tf.constant(True, dtype=tf.bool)

        def true_apply_grads_fn():
            return self._opt.apply_gradients(grads_and_vars, global_step, name)

        update_variables = control_flow_ops.cond(is_overall_finite,
                                                 true_apply_grads_fn,
                                                 gen_control_flow_ops.no_op)

        # Potentially adjust gradient scale in case of finite gradients.
        return control_flow_ops.group(update_variables,
                                      self._loss_scale_manager.update_loss_scale(is_overall_finite))

Example #7

0

Show file

def main():
    main_graph = tf.Graph()

    model = Model(params)
    input_queues = []
    
    with main_graph.as_default():
        tf.set_random_seed(1)
        train_input_iterator = train_data_gen.make_iterator_initialize(training=True)
        print ('--- train make iterator -----')
        eval_input_iterator = eval_data_gen.make_iterator_initialize(training=False)
        print ('--- eval make iterator -----')
        train_input_init_op = train_input_iterator.initializer
        eval_input_init_op = eval_input_iterator.initializer
    
        train_sample = train_input_iterator.get_next()
        eval_sample = eval_input_iterator.get_next()
    
        # Flags to indicate train or eval      
        float_status = tf.constant([0.0], dtype=tf.float32)
        total_steps = tf.Variable(initial_value=tf.constant(0,tf.int32), trainable=False)
        train_steps = tf.train.get_or_create_global_step()
        total_steps = tf.assign_add(total_steps, 1)
        with tf.control_dependencies([total_steps]):
          eval_flag = tf.mod(total_steps - 1, params['total_steps_per_eval'] )
          init_local_flag = tf.equal( eval_flag, params['training_steps_between_evals']-1 )
    
 
        def train_fn():
            with tf.variable_scope('Res', reuse=False):
                train_op, predicted_label, base_loss, lr, training_s, labels = model.model_func(train_sample[0], train_sample[1], is_training=True, train_steps=train_steps)
              #  train_op, predicted_label, base_loss, labels = model.model_func(train_data, train_label, is_training=True)
                with tf.control_dependencies([train_op]):
                   # train_steps = tf.train.get_or_create_global_step()
                    increase_train_steps_op = tf.assign_add(train_steps, 1, name='NpuCompile')
                    with tf.control_dependencies([increase_train_steps_op]):
                        train_fn_op = tf.no_op(name='train_op_0')
            return train_fn_op, predicted_label, base_loss,lr, training_s, labels
        def eval_fn():
            with tf.variable_scope('Res', reuse=True):
                eval_op, predicted_label, base_loss, lr, training_s,  labels = model.model_func(eval_sample[0], eval_sample[1], is_training=False, train_steps=train_steps)
                with tf.control_dependencies([eval_op]):
                    eval_fn_op = tf.no_op(name='eval_op_0')
            return eval_fn_op, predicted_label, base_loss,lr, training_s, labels
    
    
        # choose to exe train or eval
        final_op, predicted_label, final_base_loss, lr, training_s, labels = tf.cond( eval_flag < params['training_steps_between_evals'], train_fn, eval_fn)
        with tf.control_dependencies([final_op]):
            final_op = tf.no_op(name='Final_op')
    
        # when eval, initial metric's local vars
        float_status = gen_npu_ops.npu_alloc_float_status() # when first step, avoid NaN
        weights = tf.greater(labels, -1)
        eval_value, metric_update_op = tf.metrics.accuracy( labels = labels, predictions=predicted_label, weights=weights )
        with tf.control_dependencies([metric_update_op]):
          #  local_float_status = gen_npu_ops.npu_get_float_status(float_status)
          #  cleared_float_status = gen_npu_ops.npu_clear_float_status(local_float_status)
          #  no_nan = tf.reduce_all( tf.equal( float_status, cleared_float_status ) )
          #  def allreduce_no_nan():
          #      return eval_accuracy
          #  def allreduce_nan():
          #      return tf.constant(0.0, tf.float32)
          #  eval_accuracy = tf.cond( no_nan, allreduce_no_nan, allreduce_nan )
            local_vars = tf.local_variables() # VAR total and count in Metric
            eval_accuracy = tf.divide(local_vars[0],local_vars[1])
            eval_accuracy = hccl_ops.allreduce( eval_accuracy, "sum", fusion=0 )
            print_op = tf.print(eval_accuracy, eval_flag, init_local_flag, total_steps, train_steps, local_vars[0], local_vars[1])
            with tf.control_dependencies([print_op]):
              print_op_2 = tf.identity( eval_accuracy )

        
        
        def clear_local_vars_true():
            clear_op_1 = tf.assign( local_vars[0], tf.constant(0, tf.float32) )
            clear_op_2 = tf.assign( local_vars[1], tf.constant(0, tf.float32) )
            with tf.control_dependencies([clear_op_1, clear_op_2]):
              clear_op = tf.no_op(name='clear_local_vars_true')
            return clear_op
        def clear_local_vars_false():
            clear_op = tf.no_op(name='clear_local_vars_false')
            return clear_op

        with tf.control_dependencies([print_op_2]):
            clear_op_final = tf.cond( init_local_flag, clear_local_vars_true, clear_local_vars_false )

        saver = tf.train.Saver()
    
    main_sess = tf.Session(graph=main_graph, config = config)
    
    
    with main_sess as sess:
            if device_id == 0:
                mllogger.start(key=mllog.constants.INIT_START)
                mllogger.event(key=mllog.constants.GLOBAL_BATCH_SIZE, value=params['global_batch_size'])
                mllogger.event(key="opt_name", value="lars")
                mllogger.event(key="lars_opt_weight_decay", value=params['weight_decay'])
                mllogger.event(key="lars_epsilon", value=0.0)
                mllogger.event(key="lars_opt_base_learning_rate", value=params['learning_rate'])
                mllogger.event(key="lars_opt_end_learning_rate", value=0.0001)
                mllogger.event(key="lars_opt_learning_rate_decay_poly_power", value=2)
                decay_steps = (params['train_epochs'] - params['warmup_epochs'])*params['training_steps_per_epoch']
                mllogger.event(key="lars_opt_learning_rate_decay_steps", value=decay_steps)
                mllogger.event(key="lars_opt_learning_rate_warmup_epochs", value=params['warmup_epochs'])
                mllogger.event(key="lars_opt_momentum", value=params['momentum'])

            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            sess.run(tf.tables_initializer())
            # ################# npu ##################
           # final_op = util.set_iteration_per_loop(sess, final_op, params['iterations_per_loop_train'])
            final_op = util.set_iteration_per_loop(sess, final_op, params['total_steps'])
            # ################# npu ##################
    
            time_start = time.time()
            real_steps = int( float(params['total_steps']) / float( params['iterations_per_loop_train'] ))
            
            # compile graph
            fetches = [final_op, clear_op_final,eval_accuracy, total_steps, train_steps, final_base_loss, eval_flag, lr, training_s]
            npu_compile.npu_compile( sess, fetches )
            sess.run(train_input_init_op)
            sess.run(eval_input_init_op)
            if device_id == 0:         
                mllogger.end(key=mllog.constants.INIT_STOP)
                mllogger.start(key=mllog.constants.RUN_START)
                mllogger.event(key='train_samples', value=params['num_training_samples'])
                mllogger.event(key='eval_samples', value=params['num_evaluate_samples'])
                       
            # start to train & eval
            sess.run( fetches )