def _get_aggregated_dense_grad(self, graph_item, grad_name, reduce_to_device, BFTaggregator):
    grad_op_name = strip_replica_prefix(get_op_name(grad_name))
    output_idx = get_index_from_tensor_name(grad_name)
    grad_ops = [
        graph_item.graph.get_operation_by_name(
            ops.prepend_name_scope(grad_op_name, replica_prefix(i)))
        for i in range(self.num_replicas)
    ]

    # Aggregate gradients on `reduce_to_device` (usually CPU).
    with ops.device(reduce_to_device):
        # Collect the per-replica gradient tensors and combine them with the
        # Byzantine-fault-tolerant aggregator (instead of a plain mean).
        gradients = [grad_op.outputs[output_idx] for grad_op in grad_ops]
        grad_avg = BFTaggregator.aggregate(gradients)
    return grad_avg

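# For reference, a minimal sketch of the plain (non-fault-tolerant) mean
# aggregation that the BFT aggregator above replaces. `_mean_aggregate_dense`
# is a hypothetical helper name; the ops are standard TensorFlow.
def _mean_aggregate_dense(gradients, num_replicas):
    from tensorflow.python.ops import math_ops
    # Sum the per-replica gradient tensors, then divide by the replica count.
    grad_sum = math_ops.add_n(gradients)
    return math_ops.realdiv(grad_sum, math_ops.cast(num_replicas, grad_sum.dtype))
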
def build_and_run_model():
    def my_net(x):
        return gen_sendrecv_ops.ipu_send_to_host(x,
                                                 tensor_name="test_tensor",
                                                 send_device="/device:IPU:0",
                                                 send_device_incarnation=0,
                                                 recv_device="/device:CPU:0")

    v = array_ops.placeholder(np.float32, shape=())
    with ipu.scopes.ipu_scope("/device:IPU:0"):
        send_op = ipu.ipu_compiler.compile(my_net, inputs=[v])

    with ops.device("/device:CPU:0"):
        recv_op = gen_sendrecv_ops.ipu_recv_at_host(T=np.float32,
                                                    tensor_name="test_tensor",
                                                    send_device="/device:IPU:0",
                                                    send_device_incarnation=0,
                                                    recv_device="/device:CPU:0")

    with session.Session() as sess:
        # `self` refers to the enclosing test case; this helper is defined
        # inside a test method.
        report = ReportJSON(self, sess)
        _, received = sess.run([send_op, recv_op], feed_dict={v: 1.0})
        events = report.get_event_trace(sess)
        return received, events

def get_all_update_ops(self, grad_apply_finished, worker_device=None):
    """Create and return new update ops for proxy vars.

    Args:
        grad_apply_finished (List[Operation]): ops with which to colocate the new ops.
        worker_device (DeviceSpecV2): the device on which to create the ops.

    Returns:
        List[Operation]: the list of update ops for each proxy variable.
    """
    with ops.device(worker_device):
        with ops.control_dependencies(grad_apply_finished):
            updated_value = gen_read_var_op(self._this_op, self._dtype)  # create new read var op
    update_ops = []
    for proxy_var in self._proxy_vars:
        with ops.device(proxy_var.device):
            update_ops.append(proxy_var.assign(updated_value))
    return update_ops

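# A minimal standalone sketch of the read-then-mirror pattern above, using
# plain TF1 resource variables instead of the raw gen_read_var_op wrapper.
# The names are hypothetical.
def _sketch_mirror_update(master_var, proxy_vars, grad_apply_finished):
    with ops.control_dependencies(grad_apply_finished):
        # Read the freshly updated master value exactly once...
        updated_value = master_var.read_value()
    # ...then assign it to every proxy (each assign runs on its proxy's device).
    return [proxy.assign(updated_value) for proxy in proxy_vars]
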
def _MyNet():
    with variable_scope.variable_scope("vs", use_resource=True):
        with ops.device("cpu"):
            inp = array_ops.placeholder(np.float32, [1] + [24] * ndims + [M * K], name="input")
            bias = array_ops.placeholder(np.float32, [N * K], name="bias")
        with ops.device("/device:IPU:0"):
            weights = variable_scope.get_variable("weights", [8] * ndims + [M, N * K])
            output = nn.convolution(inp,
                                    weights,
                                    strides=[1] + [4] * ndims + [1],
                                    padding="VALID",
                                    name='cnv')
            output = nn.bias_add(output, bias, name='bias_add')
            loss = math_ops.reduce_sum(math_ops.square(output))
            optimizer = gradient_descent.GradientDescentOptimizer(0.0005)
            train = optimizer.minimize(loss)
    return train, loss, inp, bias

def get_training_loss_and_op(self, compiled_training_loop):
    with ops.device(_HOST_DEVICE):
        with ops.control_dependencies([compiled_training_loop]):
            loss = self._outfeed_queue.dequeue()

        # Reduce loss over all dimensions (i.e. batch_size, gradient_accumulation_count).
        loss = math_ops.reduce_mean(math_ops.cast(loss, dtypes.float32))

    train_op = compiled_training_loop
    return loss, train_op

def begin(self):
    if not self._outfeed.enqueued:
        raise RuntimeError("This logging hook's outfeed was not enqueued. "
                           "Did you forget to call the log function?")

    assert self._dequeue_op is None
    assert self._deleter_op is None
    with ops.device("cpu"):
        self._dequeue_op = self._outfeed.dequeue()
        self._deleter_op = self._outfeed.deleter
    self._iter_count = 0

def get_predictions(self, compiled_prediction_loop):
    with ops.device(_HOST_DEVICE):
        with ops.control_dependencies([compiled_prediction_loop]):
            predictions = self._outfeed_queue.dequeue()

    if isinstance(predictions, dict):
        return predictions

    assert isinstance(predictions, list)
    if len(predictions) != 1:
        raise ValueError(
            ("The last computational stage must return exactly one prediction "
             "tensor, but got {}").format(len(predictions)))

    return predictions[0]

def _aggregate_sparse_gradients(self, var_op, reduce_to_device, indexed_slices_grads, values_op_name):
    with ops.device(reduce_to_device):
        grad_accum_op_name = ops.prepend_name_scope(values_op_name, u"%sAccum" % AUTODIST_PREFIX)
        grad_accum = data_flow_ops.SparseConditionalAccumulator(
            dtype=indexed_slices_grads[0].values.dtype,
            shape=var_op.outputs[0].shape,
            shared_name=grad_accum_op_name,
            name=grad_accum_op_name)
        accum_apply_ops = [
            grad_accum.apply_indexed_slices_grad(
                indexed_slices_grads[i],
                MAX_INT64,
                name=ops.prepend_name_scope(values_op_name,
                                            u"%s-Accum-Apply" % replica_prefix(i)))
            for i in range(self.num_replicas)
        ]
        take_grad_op_name = ops.prepend_name_scope(values_op_name, u"%sTake-Grad" % AUTODIST_PREFIX)
        with ops.control_dependencies(accum_apply_ops):
            take_grad = grad_accum.take_indexed_slices_grad(self.num_replicas, name=take_grad_op_name)

        new_indices = take_grad.indices
        new_values = take_grad.values
        new_dense_shape = take_grad.dense_shape
        # The accumulator may return indices/dense_shape with a different
        # integer dtype than the original IndexedSlices; cast back if needed.
        if indexed_slices_grads[0].indices.dtype != new_indices.dtype:
            new_indices = math_ops.cast(
                new_indices,
                indexed_slices_grads[0].indices.dtype,
                name=ops.prepend_name_scope(values_op_name,
                                            u"%sTake-Grad-Cast-Indices" % AUTODIST_PREFIX))
        if indexed_slices_grads[0].dense_shape.dtype != new_dense_shape.dtype:
            new_dense_shape = math_ops.cast(
                new_dense_shape,
                indexed_slices_grads[0].dense_shape.dtype,
                name=ops.prepend_name_scope(values_op_name,
                                            u"%sTake-Grad-Cast-Shape" % AUTODIST_PREFIX))
    return ops.IndexedSlices(new_values, new_indices, new_dense_shape)

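# A minimal standalone sketch of the accumulator pattern above, using the
# public TF1 API. tf.SparseConditionalAccumulator averages the gradients it
# has accumulated once take_indexed_slices_grad fires; the values here are
# illustrative only.
def _sketch_sparse_accumulation():
    import tensorflow.compat.v1 as tf
    accum = tf.SparseConditionalAccumulator(dtype=tf.float32, shape=[2, 2])
    grad = tf.IndexedSlices(values=tf.constant([[1.0, 2.0]]),
                            indices=tf.constant([0], dtype=tf.int64),
                            dense_shape=tf.constant([2, 2], dtype=tf.int64))
    apply_op = accum.apply_indexed_slices_grad(grad)
    with tf.control_dependencies([apply_op]):
        # Blocks until one gradient has been applied, then returns the mean.
        return accum.take_indexed_slices_grad(num_required=1)
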
def get_evaluation_loss_and_metrics(self, compiled_evaluation_loop):
    with ops.device(_HOST_DEVICE):
        with ops.control_dependencies([compiled_evaluation_loop]):
            inputs = self._outfeed_queue.dequeue()

    args, kwargs = loops._body_arguments(inputs)  # pylint: disable=protected-access
    metrics = self._captured_eval_metrics_fn(*args, **kwargs)

    if not isinstance(metrics, dict):
        raise TypeError(("The `eval_metrics_fn` must return a dict, "
                         "but got {}.").format(type(metrics)))

    if model_fn_lib.LOSS_METRIC_KEY not in metrics:
        raise KeyError(("The dict returned from `eval_metrics_fn` "
                        "must contain '{}'.").format(model_fn_lib.LOSS_METRIC_KEY))

    loss = metrics.pop(model_fn_lib.LOSS_METRIC_KEY)
    return loss, metrics

def _build_proxy_on(self, destination_device):
    """
    Build a proxy of the original variable on `destination_device`.

    Args:
        destination_device (DeviceSpecV2): the destination device where the proxy is on.
    """
    is_gpu = destination_device.device_type.upper() == 'GPU' \
        if destination_device.device_type else False
    prefix = replica_prefix(destination_device.device_index) if is_gpu else replica_prefix('CPU')
    with ops.device(destination_device):
        proxy_var = variable_scope.get_variable(
            ops.prepend_name_scope(self._this_op.name, prefix),
            dtype=self._dtype,
            initializer=self._initial_value,
            trainable=False)
    self._graph_item.info.update_variables([proxy_var], replace=False)  # Should we update graph_item.info?
    self._proxy_vars.append(proxy_var)
    self._proxy_var_init_ops.append(proxy_var.assign(get_read_var_tensor(self._this_op)))
    self._mirror_all_read_var_ops()
    self._update_all_consumers()

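# A minimal standalone sketch of the proxy-variable idea above: a
# non-trainable copy of a master variable is created on another device and
# kept in sync by assigning the master's current value. The helper name and
# "/proxy" suffix are hypothetical; the TF1 API calls are standard.
def _sketch_build_proxy(master_var, destination_device):
    with ops.device(destination_device):
        proxy = variable_scope.get_variable(
            master_var.op.name + "/proxy",
            initializer=master_var.initial_value,
            trainable=False)
    # Re-running this op refreshes the proxy from the master variable.
    sync_op = proxy.assign(master_var.read_value())
    return proxy, sync_op
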
def replicate(self, graph_item):
    """
    Replicate the entire graph as many times as num_replica.

    Args:
        graph_item: the original graph item.

    Returns:
        The new graph item.
    """
    item = GraphItem(graph=ops.Graph())
    fwd_ctx, bwd_ctx = self._collect_while_context(graph_item.graph)
    with item.graph.as_default():
        gdef = graph_item.graph.as_graph_def()
        for i in range(self._num_local_replicas):
            # Replicate ops
            with ops.device(self._replica_device_placer(replica_id=i)):
                import_graph_def(gdef, name=replica_prefix(i))

            # Replicate while_loop context (control_flow) if needed.
            # The order matters -- we must replicate the bwd context first, then the fwd context.
            # TODO(Zeya): To handle cases when there are nested while loops, we must replicate
            # the parent context first and then the child context. See:
            # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/control_flow_ops.py#L938
            if bwd_ctx:
                for ctx in bwd_ctx:
                    _ = WhileContext(context_def=ctx.to_proto(),
                                     grad_state=ctx._grad_state,
                                     import_scope=replica_prefix(i))
            if fwd_ctx:
                for ctx in fwd_ctx:
                    _ = WhileContext(context_def=ctx.to_proto(),
                                     grad_state=ctx._grad_state,
                                     import_scope=replica_prefix(i))

        # Update saver
        master_replica = 0
        if graph_item.info.savers:
            item.info.update_savers(
                [Saver.from_proto(proto, import_scope=replica_prefix(master_replica)).to_proto()
                 for proto in graph_item.info.savers],
                replace=False)

        # Update gradient info
        for i in range(self._num_local_replicas):
            for g_name, t_name in graph_item.grad_target_name_pairs.items():
                if isinstance(g_name, tuple):
                    # An IndexedSlices gradient is named by its (values, indices, dense_shape) triple.
                    new_g_name = (
                        ops.prepend_name_scope(g_name[0], replica_prefix(i)),
                        ops.prepend_name_scope(g_name[1], replica_prefix(i)),
                        ops.prepend_name_scope(g_name[2], replica_prefix(i)))
                else:
                    new_g_name = ops.prepend_name_scope(g_name, replica_prefix(i))
                new_t_name = ops.prepend_name_scope(t_name, replica_prefix(i))
                item.extend_gradient_info_by_names(grads=[new_g_name], targets=[new_t_name])
            item.info.update_variables(
                [_from_proto_fn(proto, import_scope=replica_prefix(i)).to_proto()
                 for proto in graph_item.info.variables],
                replace=False)
            item.info.update_table_initializers(
                [ops.prepend_name_scope(tb_init, replica_prefix(i))
                 for tb_init in graph_item.info.table_initializers],
                replace=False)
    return item

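# A minimal standalone sketch of the replication mechanism above: importing
# the same GraphDef several times under different name scopes yields one
# prefixed copy of every op per replica. The helper name is hypothetical;
# `import_graph_def` and `replica_prefix` are the same names used above.
def _sketch_replicate_by_import(gdef, num_replicas):
    replicated = ops.Graph()
    with replicated.as_default():
        for i in range(num_replicas):
            # Ops from the source graph appear as "<replica_prefix(i)>/<original_name>".
            import_graph_def(gdef, name=replica_prefix(i))
    return replicated
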
def _get_accumulation_ops(graph_item, gradient, target, num_accum_required):
    def _get_accum_apply_and_agg_grad(var_op, grad, indices, dense_shape):
        if indices is None:
            tensor = variable_utils.get_read_var_tensor(var_op)
            grad_accum = data_flow_ops.ConditionalAccumulator(
                grad.dtype,
                shape=tensor.get_shape(),
                shared_name=var_op.name + "/grad_accum")
            # Get a copy of consumers list before creating accum_apply_op
            grad_consumers = list(grad.consumers())
            accum_apply_op = grad_accum.apply_grad(
                grad, local_step=MAX_INT64, name=grad.op.name + '_accum_apply_grad')
            agg_grad = grad_accum.take_grad(num_accum_required, name=var_op.name + '_take_grad')
            update_consumers(grad_consumers, grad, agg_grad)
            update_control_consumers(get_control_consumers(grad.op), grad.op, agg_grad.op)
        else:
            grad_indexed_slices = ops.IndexedSlices(
                values=grad, indices=indices, dense_shape=dense_shape)
            grad_accum = data_flow_ops.SparseConditionalAccumulator(
                grad.dtype, shape=grad.shape, shared_name=var_op.name + "/grad_accum")
            # Get a copy of consumers list before creating accum_apply_op
            indices_consumers = list(indices.consumers())
            grad_consumers = list(grad.consumers())
            accum_apply_op = grad_accum.apply_indexed_slices_grad(
                grad_indexed_slices, local_step=MAX_INT64,
                name=grad.op.name + '_accum_apply_grad')
            agg_grad = grad_accum.take_indexed_slices_grad(
                num_accum_required, name=var_op.name + '_take_grad')
            agg_indices = agg_grad.indices
            if indices.dtype != agg_grad.indices.dtype:
                agg_indices = math_ops.cast(agg_grad.indices, indices.dtype)
            agg_grad = ops.IndexedSlices(values=agg_grad.values,
                                         indices=agg_indices,
                                         dense_shape=agg_grad.dense_shape)
            assert isinstance(agg_grad, ops.IndexedSlices)
            update_consumers(indices_consumers, indices, agg_grad.indices)
            update_consumers(grad_consumers, grad, agg_grad.values)
            update_control_consumers(get_control_consumers(indices.op),
                                     indices.op, agg_grad.indices.op)
            update_control_consumers(get_control_consumers(grad.op),
                                     grad.op, agg_grad.values.op)
        return accum_apply_op, agg_grad

    # Aggregate gradients from different workers using ConditionalAccumulator.
    # var_op_to_agg_grad and var_op_to_accum_apply_op are updated.
    var_op_to_agg_grad = {}
    var_op_to_accum_apply_op = {}

    if target.op not in graph_item.trainable_var_op_to_var:
        logging.debug(
            "Gradient for non-trainable variable %s is created, "
            "do not insert accumulator for aggregating this gradient" % target.op.name)
        return {}, {}

    var_op = target.op
    if isinstance(gradient, ops.Tensor):
        grad = gradient
        indices = None
        dense_shape = None
    else:
        grad = gradient.values
        indices = gradient.indices
        dense_shape = gradient.dense_shape

    with ops.device(var_op.device), ops.name_scope(""):
        accum_apply_op, agg_grad = _get_accum_apply_and_agg_grad(
            var_op, grad, indices, dense_shape)

    if indices is None:
        var_op_to_agg_grad[var_op] = (None, agg_grad)
    else:
        var_op_to_agg_grad[var_op] = (agg_grad.indices, agg_grad.values)
    var_op_to_accum_apply_op[var_op] = accum_apply_op
    return var_op_to_agg_grad, var_op_to_accum_apply_op

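# A minimal standalone sketch of the dense branch above, via the public TF1
# API. tf.ConditionalAccumulator returns the mean of the gradients it has
# accumulated once `num_required` of them have been applied; the helper name
# and values are illustrative only.
def _sketch_dense_accumulation(num_workers):
    import tensorflow.compat.v1 as tf
    accum = tf.ConditionalAccumulator(dtype=tf.float32, shape=[2])
    apply_ops = [
        accum.apply_grad(tf.constant([1.0, 2.0]) * float(w))
        for w in range(num_workers)
    ]
    with tf.control_dependencies(apply_ops):
        # Blocks until `num_workers` gradients are applied, then averages them.
        return accum.take_grad(num_required=num_workers)
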
def add_sync_op(self, graph_item, var_update_op, variable_replicator=None):
    """
    Add the ops needed for synchronous distributed training to the current graph.

    The additional ops serve three main purposes:
        1. Initialization
        2. Synchronization
        3. Gradient aggregation

    Args:
        graph_item (graph_item.GraphItem): the graph.
        var_update_op: the update op for the variable.
        variable_replicator: a dictionary mapping a master variable op name to the
            list of replicated variables; may be None.

    Returns:
        None
    """
    this_worker_cpu = device_spec.DeviceSpecV2.from_string(self.worker_device)
    this_worker_cpu = this_worker_cpu.replace(device_type='CPU', device_index=0)

    var_op = var_update_op.inputs[UPDATE_OP_VAR_POS].op
    is_trainable = var_op in graph_item.trainable_var_op_to_var
    source_op = self._get_optimizer_source_op(var_update_op)
    cc = get_control_consumers(source_op)

    with ops.device(var_op.device):
        if self._staleness == 0:
            queue_ops = self._get_queue_ops(var_update_op, source_op,
                                            self.is_chief, is_trainable)
        elif self._staleness > 0:
            queue_ops = self._get_queue_ops_stale(var_update_op, source_op,
                                                  self.is_chief, is_trainable)
        else:
            raise ValueError("staleness must be greater than or equal to 0.")

    # Only dense trainable variables are replicated locally
    if variable_replicator:
        mirror_variable_update_ops = variable_replicator.get_all_update_ops(
            queue_ops, worker_device=this_worker_cpu)
        with ops.device(this_worker_cpu):
            finish_op = control_flow_ops.group(*mirror_variable_update_ops)
    else:
        finish_op = control_flow_ops.group(*queue_ops)

    # Place computation ops of aggregated gradients on PS.
    # Note that even though this is doing a graph traversal, it is called in such a way that it
    # only traverses from a gradient aggregator op to a gradient application op (or vice versa) --
    # these corresponding ops should always be adjacent in the graph.
    self._place_post_grad_agg_ops(
        device_spec.DeviceSpecV2.from_string(self.target_device),
        self._var_op_to_agg_grad,
        {var_op: var_update_op} if is_trainable else {})

    # Replace the control input of train_op with finish_op.
    # Note(Hao): this cc is stale, i.e. cc \subset get_control_consumers(source_op).
    update_control_consumers(cc, source_op, finish_op)

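# A minimal standalone sketch of the kind of queue-based synchronization this
# method wires in: workers block on a shared token queue that the chief fills
# once its update is applied (in the spirit of tf.train.SyncReplicasOptimizer).
# This is a hypothetical simplification; the real _get_queue_ops is more involved.
def _sketch_token_sync(num_workers, is_chief):
    import tensorflow.compat.v1 as tf
    token_queue = tf.FIFOQueue(capacity=num_workers,
                               dtypes=[tf.bool],
                               shapes=[()],
                               shared_name="token_queue")
    if is_chief:
        # Chief enqueues one token per worker after applying the update.
        return token_queue.enqueue_many([tf.constant([True] * num_workers)])
    # Non-chief workers block here until the chief's update is visible.
    return token_queue.dequeue()
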
def test_pipelining(self):
    gradient_accumulation_count = 4
    local_batch_size = 2

    features = np.ones((1, 20), dtype=np.float32) * hvd.rank()
    labels = np.ones(1, dtype=np.int32) * hvd.rank()
    dataset = dataset_ops.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.repeat().batch(local_batch_size, drop_remainder=True)

    loss_vals = []

    strategy = IPUHorovodStrategy()
    with strategy.scope():
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "infeed")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("outfeed")

        def stage1(lr, images, labels):
            partial = keras.layers.Dense(32, activation="relu")(images)
            partial = keras.layers.Dense(16, activation="relu")(partial)
            return lr, partial, labels

        def stage2(lr, partial, labels):
            logits = keras.layers.Dense(10)(partial)
            per_example_loss = keras.losses.sparse_categorical_crossentropy(
                y_true=labels, y_pred=logits, from_logits=True)
            # In a custom training loop, the optimiser does an allreduce *sum*, not
            # average, of the gradients across the distributed workers. Therefore
            # we want to divide the loss here by the *global* batch size, which is
            # done by the `tf.nn.compute_average_loss()` function.
            loss = nn.compute_average_loss(per_example_loss)
            return lr, loss

        def optimizer_function(lr, loss):
            optimizer = GradientDescentOptimizer(lr)
            return pipelining_ops.OptimizerFunctionOutput(optimizer, loss)

        def model(lr):
            pipeline_op = pipelining_ops.pipeline(
                computational_stages=[stage1, stage2],
                device_mapping=[0, 0],
                gradient_accumulation_count=gradient_accumulation_count,
                inputs=[lr],
                infeed_queue=infeed_queue,
                repeat_count=2,
                outfeed_queue=outfeed_queue,
                optimizer_function=optimizer_function,
                name="Pipeline")
            return pipeline_op

        def compiled_model(lr):
            with ipu_scope("/device:IPU:0"):
                return ipu_compiler.compile(model, inputs=[lr])

        with ops.device("cpu"):
            lr = array_ops.placeholder(np.float32, [])

        train_op = strategy.experimental_run_v2(compiled_model, args=[lr])

        _, per_worker_losses = outfeed_queue.dequeue()

        # Mean across the local `gradient_accumulation_count` batches:
        per_worker_loss = math_ops.reduce_mean(per_worker_losses)

        # Global mean across the distributed workers (since it is already
        # divided by the global batch size above, we do a sum here):
        global_loss = strategy.reduce(ReduceOp.SUM, per_worker_loss)

        config = ipu_utils.create_ipu_config()
        config = ipu_utils.auto_select_ipus(config, num_ipus=1)
        ipu_utils.configure_ipu_system(config)
        ipu_utils.move_variable_initialization_to_cpu()

        with session.Session() as sess:
            sess.run(infeed_queue.initializer)
            sess.run(variables.global_variables_initializer())

            for _ in range(10):
                sess.run(train_op, {lr: 0.01})
                global_loss_val = sess.run(global_loss)

                if loss_vals:
                    # Check that the loss decreases monotonically.
                    self.assertLess(global_loss_val, loss_vals[-1])
                loss_vals.append(global_loss_val)

            sess.run(infeed_queue.deleter)
            sess.run(outfeed_queue.deleter)

            # Check all variables are equal across workers.
            for variable in variables.global_variables():
                self.assertAllRanksEqual(variable.eval(), variable.name)