def get_accuracy(self, labels, predicted_classes, logits, args):
    """Build the top-1 / top-5 validation metrics.

    Args:
        labels: ground-truth class ids, passed to `tf.metrics.accuracy` and
            `tf.nn.in_top_k`.
        predicted_classes: predicted class ids compared against `labels`.
        logits: per-class scores used for the top-5 membership test.
        args: object exposing `rank_size`; a value of 1 disables the
            cross-rank reduction.

    Returns:
        A dict with the keys 'val-top1acc' and 'val-top5acc'. Each entry is a
        `(value, update_op)` pair. When `args.rank_size` is not 1, the value
        tensor is summed across ranks with HCCL and divided by
        `args.rank_size`; the update op is passed through untouched.
    """
    top1_value, top1_update = tf.metrics.accuracy(
        labels=labels, predictions=predicted_classes)

    in_top5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
    top5_value, top5_update = tf.metrics.mean(in_top5)

    if args.rank_size != 1:
        # Imported lazily so single-rank runs never touch the HCCL module.
        from npu_bridge.hccl import hccl_ops

        top1_value = hccl_ops.allreduce(top1_value, "sum") / args.rank_size
        top5_value = hccl_ops.allreduce(top5_value, "sum") / args.rank_size

    return {
        'val-top1acc': (top1_value, top1_update),
        'val-top5acc': (top5_value, top5_update),
    }
def allreduce(tensor, average=True):
    """Combine `tensor` across all ranks using the HCCL collective ops.

    Arguments:
        tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce.
        average: If True, the combined values are divided by the rank count
            reported by `NPUBasics("").size()`. Otherwise they are returned
            undivided.

    Returns:
        For a `tf.IndexedSlices` input, a new `tf.IndexedSlices` built from
        the allgathered values and indices with the input's `dense_shape`.
        For any other input, the "sum" allreduce result (divided when
        `average` is True).

    Raises:
        ValueError: if an HCCL op returns None.
    """
    world_size = NPUBasics("").size()

    if not isinstance(tensor, tf.IndexedSlices):
        logging.debug("HcomAllReduce...")
        reduced = hccl_ops.allreduce(tensor, "sum")
        if reduced is None:
            raise ValueError(
                'the result of tf.DavinciAllreduce([tensor]) is empty')
        divisor = tf.cast(world_size, dtype=tensor.dtype)
        if average:
            return tf.div(reduced, divisor)
        return reduced

    # For IndexedSlices, do two allgathers instead of an allreduce.
    logging.debug("HcomAllgather...")
    gathered_values = hccl_ops.allgather(tensor.values, world_size)
    gathered_indices = hccl_ops.allgather(tensor.indices, world_size)
    if gathered_values is None:
        raise ValueError(
            'the result of tf.HcomAllgather([tensor.values]) is empty')
    if gathered_indices is None:
        raise ValueError(
            'the result of tf.HcomAllgather([tensor.indices]) is empty')

    # To turn the gather into an average, divide every gathered value by
    # the number of ranks.
    divisor = tf.cast(world_size, tensor.values.dtype)
    if average:
        gathered_values = tf.div(gathered_values, divisor)
    return tf.IndexedSlices(gathered_values, gathered_indices,
                            dense_shape=tensor.dense_shape)
def _npu_allreduce(values, reduction="mean", fusion=1, fusion_id=-1,
                   group="hccl_world_group"):
    """Reduce every entry of `values` across the ranks of an HCCL group.

    Args:
        values: iterable of tensors and/or `tf.IndexedSlices`.
        reduction: reduction name forwarded to `hccl_ops.allreduce`. The
            value "mean" is carried out as a "sum" followed by a division
            by the rank count.
        fusion: forwarded to `hccl_ops.allreduce`.
        fusion_id: forwarded to `hccl_ops.allreduce`.
        group: HCCL group name forwarded to the collective ops.

    Returns:
        A list with one reduced entry per input, in input order.

    Raises:
        ValueError: if an HCCL op returns None.
    """
    take_mean = reduction == "mean"
    if take_mean:
        reduction = "sum"

    # The rank count comes from the RANK_SIZE environment variable
    # (defaults to 1 when unset).
    world_size = int(os.getenv("RANK_SIZE", "1"))

    results = []
    for item in values:
        if not isinstance(item, tf.IndexedSlices):
            total = hccl_ops.allreduce(item, reduction, fusion, fusion_id,
                                       group)
            if total is None:
                raise ValueError(
                    'the result of tf.DavinciAllreduce([tensor]) is empty')
            divisor = tf.cast(world_size, dtype=item.dtype)
            results.append(tf.div(total, divisor) if take_mean else total)
            continue

        # For IndexedSlices, do two allgathers instead of an allreduce.
        gathered_values = hccl_ops.allgather(item.values, world_size, group)
        gathered_indices = hccl_ops.allgather(item.indices, world_size, group)
        if gathered_values is None:
            raise ValueError(
                'the result of tf.HcomAllgather([value.values]) is empty')
        if gathered_indices is None:
            raise ValueError(
                'the result of tf.HcomAllgather([value.indices]) is empty')

        # To turn the gather into an average, divide every gathered value
        # by the number of ranks.
        divisor = tf.cast(world_size, item.values.dtype)
        if take_mean:
            gathered_values = tf.div(gathered_values, divisor)
        results.append(
            tf.IndexedSlices(gathered_values, gathered_indices,
                             dense_shape=item.dense_shape))
    return results
def _reduce_all(self, grads):
    """Set `self._is_overall_finite` from the NPU float status.

    The status read via `npu_get_float_status` is gated on `grads` with
    control dependencies, so it is fetched only after the gradients have
    been computed. The flag is true when every element of the (optionally
    cross-rank summed) float status equals the output of
    `npu_clear_float_status`.

    Args:
        grads: gradient tensors the status read must wait for.
    """
    graph = tf.get_default_graph()

    with graph.control_dependencies(grads):
        local_status = gen_npu_ops.npu_get_float_status(self._float_status)
        cleared_status = gen_npu_ops.npu_clear_float_status(local_status)

    if not self._is_distributed:
        self._is_overall_finite = math_ops.reduce_all(
            tf.equal(self._float_status, cleared_status))
        return

    # Distributed run: sum the status over all ranks once the local status
    # has been read, then compare the aggregate.
    with graph.control_dependencies([local_status]):
        aggregated_status = hccl_ops.allreduce(
            [self._float_status], "sum", fusion=0)
        self._is_overall_finite = math_ops.reduce_all(
            tf.equal(aggregated_status, cleared_status))
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply gradients. See base class `tf.compat.v1.train.Optimizer`.

    The wrapped optimizer's update runs only when the float-status check
    passes; otherwise a no-op is executed. In both cases the loss-scale
    manager is updated with the outcome.

    Args:
        grads_and_vars: iterable of (gradient, variable) pairs. It is
            materialized once, so one-shot iterables such as
            `zip(grads, variables)` are supported.
        global_step: forwarded to the wrapped optimizer.
        name: forwarded to the wrapped optimizer.

    Returns:
        An op grouping the conditional variable update with the loss-scale
        update.
    """
    # The pairs are traversed here to collect the gradients and again by
    # the wrapped optimizer. Without materializing, a generator/zip would
    # be exhausted by the first pass and the optimizer would see no pairs.
    grads_and_vars = list(grads_and_vars)
    grads = [g for (g, _) in grads_and_vars if g is not None]

    # Read the float status only after all gradients have been computed.
    with tf.get_default_graph().control_dependencies(grads):
        local_float_status = gen_npu_ops.npu_get_float_status(
            self._float_status)
        cleared_float_status = gen_npu_ops.npu_clear_float_status(
            local_float_status)

    if self._is_distributed:
        # Sum the status over all ranks so every rank takes the same
        # apply/skip decision.
        with tf.get_default_graph().control_dependencies(
                [local_float_status]):
            aggregated_float_status = hccl_ops.allreduce(
                [self._float_status], "sum", fusion=0)
            is_overall_finite = math_ops.reduce_all(
                tf.equal(aggregated_float_status, cleared_float_status),
                name="overflow_status_reduce_all")
    else:
        is_overall_finite = math_ops.reduce_all(
            tf.equal(self._float_status, cleared_float_status),
            name="overflow_status_reduce_all")

    # Only update variables when all grads are finite.
    def true_apply_gradients_fn():
        # TODO: Check should allreduce before or after _down_scale() ?
        # for now we are calling allreduce before _down_scale
        return self._opt.apply_gradients(grads_and_vars, global_step, name)

    update_vars = control_flow_ops.cond(is_overall_finite,
                                        true_apply_gradients_fn,
                                        gen_control_flow_ops.no_op)

    # Potentially adjust gradient scale in case of finite gradients.
    return control_flow_ops.group(
        update_vars,
        self._loss_scale_manager.update_loss_scale(is_overall_finite))
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply gradients. See base class `tf.compat.v1.train.Optimizer`.

    When `self._enable_overflow_check()` is true, a float-status tensor is
    allocated and the wrapped optimizer's update runs only if the status
    check passes. Otherwise the update is always applied. In both cases the
    loss-scale manager is updated with the outcome.

    Args:
        grads_and_vars: iterable of (gradient, variable) pairs. It is
            materialized once, so one-shot iterables such as
            `zip(grads, variables)` are supported.
        global_step: forwarded to the wrapped optimizer.
        name: forwarded to the wrapped optimizer.

    Returns:
        An op grouping the conditional variable update with the loss-scale
        update.
    """
    # The pairs may be traversed twice (gradient collection below, then the
    # wrapped optimizer). Without materializing, a generator/zip would be
    # exhausted by the first pass and the optimizer would see no pairs.
    grads_and_vars = list(grads_and_vars)

    if self._enable_overflow_check():
        with tf.name_scope(self._name):
            self._float_status = gen_npu_ops.npu_alloc_float_status()

        grads = [g for (g, _) in grads_and_vars if g is not None]

        # Read the float status only after all gradients have been computed.
        with tf.get_default_graph().control_dependencies(grads):
            local_float_status = gen_npu_ops.npu_get_float_status(
                self._float_status)
            cleared_float_status = gen_npu_ops.npu_clear_float_status(
                local_float_status)

        if self._is_distributed:
            # Sum the status over all ranks so every rank takes the same
            # apply/skip decision.
            with tf.get_default_graph().control_dependencies(
                    [local_float_status]):
                aggregated_float_status = hccl_ops.allreduce(
                    [self._float_status], "sum", fusion=0)
                is_overall_finite = math_ops.reduce_all(
                    tf.equal(aggregated_float_status, cleared_float_status),
                    name="overflow_status_reduce_all")
        else:
            is_overall_finite = math_ops.reduce_all(
                tf.equal(self._float_status, cleared_float_status),
                name="overflow_status_reduce_all")
    else:
        # No overflow check: always take the "apply" branch.
        is_overall_finite = tf.constant(True, dtype=tf.bool)

    def true_apply_grads_fn():
        return self._opt.apply_gradients(grads_and_vars, global_step, name)

    update_variables = control_flow_ops.cond(is_overall_finite,
                                             true_apply_grads_fn,
                                             gen_control_flow_ops.no_op)

    # Potentially adjust gradient scale in case of finite gradients.
    return control_flow_ops.group(
        update_variables,
        self._loss_scale_manager.update_loss_scale(is_overall_finite))
def main():
    """Build one graph that interleaves training and evaluation, then run it.

    A step counter selects, via `tf.cond`, between a training step and an
    evaluation step on every graph execution. An accuracy metric is updated
    after each step, reduced across ranks with HCCL, printed, and its local
    variables are reset when `init_local_flag` is true.

    Relies on names defined outside this function: `params`, `Model`,
    `train_data_gen`, `eval_data_gen`, `config`, `device_id`, `mllogger`,
    `mllog`, `util`, `npu_compile`, `gen_npu_ops`, `hccl_ops`.
    """
    main_graph = tf.Graph()
    model = Model(params)
    input_queues = []  # NOTE(review): not referenced again in the visible code
    with main_graph.as_default():
        tf.set_random_seed(1)
        train_input_iterator = train_data_gen.make_iterator_initialize(training=True)
        print ('--- train make iterator -----')
        eval_input_iterator = eval_data_gen.make_iterator_initialize(training=False)
        print ('--- eval make iterator -----')
        train_input_init_op = train_input_iterator.initializer
        eval_input_init_op = eval_input_iterator.initializer
        train_sample = train_input_iterator.get_next()
        eval_sample = eval_input_iterator.get_next()

        # Flags to indicate train or eval
        # NOTE(review): float_status is reassigned below and only referenced
        # from commented-out code.
        float_status = tf.constant([0.0], dtype=tf.float32)
        total_steps = tf.Variable(initial_value=tf.constant(0,tf.int32), trainable=False)
        train_steps = tf.train.get_or_create_global_step()
        # total_steps is rebound to the assign_add result, so every graph
        # execution increments the counter by one.
        total_steps = tf.assign_add(total_steps, 1)
        with tf.control_dependencies([total_steps]):
            # Position inside the current train+eval cycle (0-based).
            eval_flag = tf.mod(total_steps - 1, params['total_steps_per_eval'] )
            # True on the last training step of a cycle; used below to reset
            # the metric's local variables.
            init_local_flag = tf.equal( eval_flag, params['training_steps_between_evals']-1 )

        def train_fn():
            # Training branch of the tf.cond below: runs the model in
            # training mode, then bumps the global step.
            with tf.variable_scope('Res', reuse=False):
                train_op, predicted_label, base_loss, lr, training_s, labels = model.model_func(train_sample[0], train_sample[1], is_training=True, train_steps=train_steps)
            # train_op, predicted_label, base_loss, labels = model.model_func(train_data, train_label, is_training=True)
            with tf.control_dependencies([train_op]):
                # train_steps = tf.train.get_or_create_global_step()
                increase_train_steps_op = tf.assign_add(train_steps, 1, name='NpuCompile')
            with tf.control_dependencies([increase_train_steps_op]):
                train_fn_op = tf.no_op(name='train_op_0')
            return train_fn_op, predicted_label, base_loss,lr, training_s, labels

        def eval_fn():
            # Evaluation branch: reuses the 'Res' variables created by
            # train_fn and does not touch the global step.
            with tf.variable_scope('Res', reuse=True):
                eval_op, predicted_label, base_loss, lr, training_s, labels = model.model_func(eval_sample[0], eval_sample[1], is_training=False, train_steps=train_steps)
            with tf.control_dependencies([eval_op]):
                eval_fn_op = tf.no_op(name='eval_op_0')
            return eval_fn_op, predicted_label, base_loss,lr, training_s, labels

        # choose to exe train or eval
        final_op, predicted_label, final_base_loss, lr, training_s, labels = tf.cond( eval_flag < params['training_steps_between_evals'], train_fn, eval_fn)
        with tf.control_dependencies([final_op]):
            final_op = tf.no_op(name='Final_op')

        # when eval, initial metric's local vars
        float_status = gen_npu_ops.npu_alloc_float_status()
        # when first step, avoid NaN
        # Only samples whose label is greater than -1 get a non-zero weight.
        weights = tf.greater(labels, -1)
        eval_value, metric_update_op = tf.metrics.accuracy( labels = labels, predictions=predicted_label, weights=weights )
        with tf.control_dependencies([metric_update_op]):
            # local_float_status = gen_npu_ops.npu_get_float_status(float_status)
            # cleared_float_status = gen_npu_ops.npu_clear_float_status(local_float_status)
            # no_nan = tf.reduce_all( tf.equal( float_status, cleared_float_status ) )
            # def allreduce_no_nan():
            #     return eval_accuracy
            # def allreduce_nan():
            #     return tf.constant(0.0, tf.float32)
            # eval_accuracy = tf.cond( no_nan, allreduce_no_nan, allreduce_nan )
            # NOTE(review): assumes local_vars[0] / local_vars[1] are the
            # accuracy metric's total and count, i.e. that no other local
            # variable was created earlier in this graph - TODO confirm.
            local_vars = tf.local_variables() # VAR total and count in Metric
            eval_accuracy = tf.divide(local_vars[0],local_vars[1])
            # NOTE(review): the per-rank accuracy is summed across ranks and
            # not divided by the rank count in the visible code.
            eval_accuracy = hccl_ops.allreduce( eval_accuracy, "sum", fusion=0 )
            print_op = tf.print(eval_accuracy, eval_flag, init_local_flag, total_steps, train_steps, local_vars[0], local_vars[1])
        with tf.control_dependencies([print_op]):
            # Identity op whose only purpose is to sequence the reset below
            # after the print.
            print_op_2 = tf.identity( eval_accuracy )

        def clear_local_vars_true():
            # Reset the two metric accumulators to zero.
            clear_op_1 = tf.assign( local_vars[0], tf.constant(0, tf.float32) )
            clear_op_2 = tf.assign( local_vars[1], tf.constant(0, tf.float32) )
            with tf.control_dependencies([clear_op_1, clear_op_2]):
                clear_op = tf.no_op(name='clear_local_vars_true')
            return clear_op

        def clear_local_vars_false():
            # Leave the accumulators untouched.
            clear_op = tf.no_op(name='clear_local_vars_false')
            return clear_op

        with tf.control_dependencies([print_op_2]):
            clear_op_final = tf.cond( init_local_flag, clear_local_vars_true, clear_local_vars_false )
        # NOTE(review): saver is not referenced again in the visible code.
        saver = tf.train.Saver()

    main_sess = tf.Session(graph=main_graph, config = config)
    with main_sess as sess:
        # Only device 0 emits the MLPerf logging events.
        if device_id == 0:
            mllogger.start(key=mllog.constants.INIT_START)
            mllogger.event(key=mllog.constants.GLOBAL_BATCH_SIZE, value=params['global_batch_size'])
            mllogger.event(key="opt_name", value="lars")
            mllogger.event(key="lars_opt_weight_decay", value=params['weight_decay'])
            mllogger.event(key="lars_epsilon", value=0.0)
            mllogger.event(key="lars_opt_base_learning_rate", value=params['learning_rate'])
            mllogger.event(key="lars_opt_end_learning_rate", value=0.0001)
            mllogger.event(key="lars_opt_learning_rate_decay_poly_power", value=2)
            decay_steps = (params['train_epochs'] - params['warmup_epochs'])*params['training_steps_per_epoch']
            mllogger.event(key="lars_opt_learning_rate_decay_steps", value=decay_steps)
            mllogger.event(key="lars_opt_learning_rate_warmup_epochs", value=params['warmup_epochs'])
            mllogger.event(key="lars_opt_momentum", value=params['momentum'])

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        # ################# npu ##################
        # final_op = util.set_iteration_per_loop(sess, final_op, params['iterations_per_loop_train'])
        # NOTE(review): presumably makes a single sess.run execute
        # params['total_steps'] iterations on device - confirm against
        # util.set_iteration_per_loop.
        final_op = util.set_iteration_per_loop(sess, final_op, params['total_steps'])
        # ################# npu ##################

        # NOTE(review): time_start and real_steps are not referenced again
        # in the visible code.
        time_start = time.time()
        real_steps = int( float(params['total_steps']) / float( params['iterations_per_loop_train'] ))

        # compile graph
        fetches = [final_op, clear_op_final,eval_accuracy, total_steps, train_steps, final_base_loss, eval_flag, lr, training_s]
        npu_compile.npu_compile( sess, fetches )

        # Dataset iterators are initialized after compilation, right before
        # the timed run starts.
        sess.run(train_input_init_op)
        sess.run(eval_input_init_op)

        if device_id == 0:
            mllogger.end(key=mllog.constants.INIT_STOP)
            mllogger.start(key=mllog.constants.RUN_START)
            mllogger.event(key='train_samples', value=params['num_training_samples'])
            mllogger.event(key='eval_samples', value=params['num_evaluate_samples'])

        # start to train & eval
        sess.run( fetches )