def basic_pipelined_training_step(model, opts, learning_rate, infeed, outfeed,
                                  iterations_per_step=1):
    def first_stage(learning_rate, image, label, pipeline_stage=None):
        return learning_rate, pipeline_stage(image), label

    def final_stage(learning_rate, x, label, pipeline_stage=None):
        x = pipeline_stage(x)
        loss, cross_entropy, accuracy = calculate_loss(x, label, opts)
        return loss, cross_entropy, accuracy, learning_rate / opts["lr_scale"]

    # Wrap every model stage: all but the last pass activations straight
    # through; the last one also computes the loss.
    model_stages = model(opts)
    computational_stages = [
        partial(first_stage, pipeline_stage=model_stages[x])
        for x in range(len(model_stages) - 1)
    ]
    computational_stages.append(
        partial(final_stage, pipeline_stage=model_stages[-1]))

    def optimizer_function(loss, _, __, lr):
        optimizer = get_optimizer(opts)(lr)
        return pipelining_ops.OptimizerFunctionOutput(
            optimizer, loss * opts["loss_scaling"])

    options = None
    amps = opts['available_memory_proportion']
    if amps and len(amps) > 1:
        # Map pairs of values to the different pipeline stages.
        options = []
        for i in range(len(amps) // 2):
            options.append(
                pipelining_ops.PipelineStageOptions(
                    {"availableMemoryProportion": amps[2 * i]},
                    {"availableMemoryProportion": amps[2 * i + 1]}))

    return pipelining_ops.pipeline(
        computational_stages=computational_stages,
        pipeline_depth=int(opts['pipeline_depth']),
        repeat_count=iterations_per_step,
        inputs=[learning_rate],
        infeed_queue=infeed,
        outfeed_queue=outfeed,
        optimizer_function=optimizer_function,
        forward_propagation_stages_poplar_options=options,
        backward_propagation_stages_poplar_options=options,
        pipeline_schedule=next(
            p for p in list(pipelining_ops.PipelineSchedule)
            if opts["pipeline_schedule"] == str(p).split(".")[-1]),
        offload_weight_update_variables=not opts['disable_variable_offloading'],
        name="Pipeline")
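# Aside on the option format consumed above (an assumption made explicit for
# illustration): `available_memory_proportion` is read as a flat list holding
# one pair of values per pipeline stage, and the loop above packs consecutive
# pairs into one PipelineStageOptions per stage. A quick pure-Python check of
# that pairing logic:
amps = ["0.6", "0.6", "0.2", "0.2"]  # flat list: two stages, two values each
pairs = [(amps[2 * i], amps[2 * i + 1]) for i in range(len(amps) // 2)]
assert pairs == [("0.6", "0.6"), ("0.2", "0.2")]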
def infer(lr, infeed, outfeed, gradient_accumulation_count):
    # Inference-only pipeline: without an optimizer_function no backward
    # pass is built. `self` is captured from the enclosing class instance.
    pipeline_op = pipelining_ops.pipeline(
        self.computational_stages,
        gradient_accumulation_count=gradient_accumulation_count,
        gradient_accumulation_dtype=self.dtype,
        inputs=[lr],
        infeed_queue=infeed,
        outfeed_queue=outfeed,
        device_mapping=self.device_mapping)
    return pipeline_op
def model(lr):
    # Two-stage pipeline with both stages mapped to the same IPU
    # (device_mapping=[0, 0]), executed twice per call (repeat_count=2).
    pipeline_op = pipelining_ops.pipeline(
        computational_stages=[stage1, stage2],
        device_mapping=[0, 0],
        gradient_accumulation_count=gradient_accumulation_count,
        inputs=[lr],
        infeed_queue=infeed_queue,
        repeat_count=2,
        outfeed_queue=outfeed_queue,
        optimizer_function=optimizer_function,
        name="Pipeline")
    return pipeline_op
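# A minimal, self-contained sketch (not from the original source) of the
# scaffolding a pipeline op such as `model(lr)` above typically sits in:
# build infeed/outfeed queues from a dataset, compile the function with
# ipu_compiler, then run it in a session. The stage bodies, dataset, and
# hyperparameters here are illustrative assumptions, and the queue/config
# constructors vary between Poplar SDK releases (older ones require a
# `feed_name` argument and use ipu.utils.create_ipu_config instead of
# IPUConfig).
import tensorflow.compat.v1 as tf
from tensorflow.python import ipu
from tensorflow.python.ipu.ops import pipelining_ops

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform([64, 4]), tf.random.uniform([64, 1])))
dataset = dataset.repeat().batch(8, drop_remainder=True)
infeed_queue = ipu.ipu_infeed_queue.IPUInfeedQueue(dataset)
outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()

def stage1(lr, x, labels):
    # First stage: receives `inputs` (lr) followed by the infeed tensors.
    x = tf.layers.dense(x, 16, activation=tf.nn.relu)
    return lr, x, labels

def stage2(lr, x, labels):
    # Last stage: its outputs go to the outfeed and the optimizer_function.
    predictions = tf.layers.dense(x, 1)
    loss = tf.losses.mean_squared_error(labels, predictions)
    return lr, loss

def optimizer_function(lr, loss):
    return pipelining_ops.OptimizerFunctionOutput(
        tf.train.GradientDescentOptimizer(lr), loss)

def model(lr):
    return pipelining_ops.pipeline(
        computational_stages=[stage1, stage2],
        device_mapping=[0, 0],
        gradient_accumulation_count=8,
        inputs=[lr],
        infeed_queue=infeed_queue,
        repeat_count=2,
        outfeed_queue=outfeed_queue,
        optimizer_function=optimizer_function,
        # Both stages share one IPU, so schedule them sequentially.
        pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential,
        name="Pipeline")

lr_placeholder = tf.placeholder(tf.float32, shape=[])
with ipu.scopes.ipu_scope("/device:IPU:0"):
    compiled_model = ipu.ipu_compiler.compile(model, inputs=[lr_placeholder])
dequeued_results = outfeed_queue.dequeue()

cfg = ipu.config.IPUConfig()
cfg.auto_select_ipus = 1
cfg.configure_ipu_system()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(infeed_queue.initializer)
    sess.run(compiled_model, feed_dict={lr_placeholder: 0.01})
    print(sess.run(dequeued_results))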
def prediction_pipeline():
    spec = self._call_model_fn(model_fn_lib.ModeKeys.PREDICT)
    return pipelining_ops.pipeline(
        infeed_queue=self._infeed_queue,
        outfeed_queue=self._outfeed_queue,
        computational_stages=spec.computational_stages,
        gradient_accumulation_count=spec.gradient_accumulation_count,
        repeat_count=self._config.ipu_run_config.iterations_per_loop,
        inputs=spec.inputs,
        device_mapping=spec.device_mapping,
        pipeline_schedule=spec.pipeline_schedule,
        name="ipu_pipeline_estimator_predict")
def train(lr, infeed, outfeed, gradient_accumulation_count):
    # Training pipeline: the optimizer_function adds the backward pass and
    # weight update; offload_weight_update_variables=False keeps the
    # optimizer state in IPU memory rather than streaming it off-chip.
    pipeline_op = pipelining_ops.pipeline(
        self.computational_stages,
        gradient_accumulation_count=gradient_accumulation_count,
        gradient_accumulation_dtype=self.dtype,
        inputs=[lr],
        infeed_queue=infeed,
        outfeed_queue=outfeed,
        device_mapping=self.device_mapping,
        optimizer_function=self.optimizer_function,
        offload_weight_update_variables=False)
    return pipeline_op
def training_pipeline():
    spec = self._call_model_fn(model_fn_lib.ModeKeys.TRAIN)
    # outfeed_loss=True enqueues only the loss onto the outfeed.
    return pipelining_ops.pipeline(
        infeed_queue=self._infeed_queue,
        outfeed_queue=self._outfeed_queue,
        computational_stages=spec.computational_stages,
        gradient_accumulation_count=spec.gradient_accumulation_count,
        repeat_count=self._config.ipu_run_config.iterations_per_loop,
        inputs=spec.inputs,
        optimizer_function=spec.optimizer_function,
        device_mapping=spec.device_mapping,
        pipeline_schedule=spec.pipeline_schedule,
        outfeed_loss=True,
        offload_weight_update_variables=spec.offload_weight_update_variables,
        name="ipu_pipeline_estimator_train")
def evaluation_pipeline():
    spec = self._call_model_fn(model_fn_lib.ModeKeys.EVAL)
    # Capture the metrics function so the dequeued outfeed values can be
    # turned into metrics later; it must be provided exactly once.
    assert not self._captured_eval_metrics_fn
    assert spec.eval_metrics_fn
    self._captured_eval_metrics_fn = spec.eval_metrics_fn
    return pipelining_ops.pipeline(
        infeed_queue=self._infeed_queue,
        outfeed_queue=self._outfeed_queue,
        computational_stages=spec.computational_stages,
        gradient_accumulation_count=spec.gradient_accumulation_count,
        repeat_count=self._config.ipu_run_config.iterations_per_loop,
        inputs=spec.inputs,
        device_mapping=spec.device_mapping,
        pipeline_schedule=spec.pipeline_schedule,
        name="ipu_pipeline_estimator_eval")
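# A sketch of where the `spec` consumed by the three estimator pipelines
# above might come from: a user model_fn returning an
# IPUPipelineEstimatorSpec. Every attribute read above
# (computational_stages, gradient_accumulation_count, optimizer_function,
# eval_metrics_fn, ...) is a field of that spec. The model body, layer
# sizes, and metric names below are illustrative assumptions, not the
# original model_fn, and the exact spec constructor arguments may differ
# between SDK releases.
import tensorflow.compat.v1 as tf
from tensorflow.python import ipu
from tensorflow.python.ipu.ops import pipelining_ops

def model_fn(mode):
    def stage1(features, labels):
        hidden = tf.layers.dense(features, 32, activation=tf.nn.relu)
        return hidden, labels

    def stage2(hidden, labels):
        logits = tf.layers.dense(hidden, 10)
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels, logits=logits))
        return loss

    def optimizer_function(loss):
        return pipelining_ops.OptimizerFunctionOutput(
            tf.train.GradientDescentOptimizer(0.01), loss)

    def eval_metrics_fn(loss):
        # Assumed contract: the returned dict carries the dequeued loss.
        return {"loss": loss}

    kwargs = {}
    if mode == tf.estimator.ModeKeys.TRAIN:
        kwargs["optimizer_function"] = optimizer_function
    elif mode == tf.estimator.ModeKeys.EVAL:
        kwargs["eval_metrics_fn"] = eval_metrics_fn

    return ipu.ipu_pipeline_estimator.IPUPipelineEstimatorSpec(
        mode,
        computational_stages=[stage1, stage2],
        gradient_accumulation_count=4,
        **kwargs)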
def basic_pipelined_training_step(model, opts, learning_rate, infeed, outfeed,
                                  iterations_per_step=1):
    def first_stage(learning_rate, image, label, pipeline_stage=None):
        return learning_rate, pipeline_stage(image), label

    def final_stage(learning_rate, x, label, pipeline_stage=None):
        x = pipeline_stage(x)
        loss, cross_entropy, accuracy = calculate_loss(x, label, opts)
        return learning_rate, loss, cross_entropy, accuracy

    model_stages = model(opts)
    computational_stages = [
        partial(first_stage, pipeline_stage=model_stages[x])
        for x in range(len(model_stages) - 1)
    ]
    computational_stages.append(
        partial(final_stage, pipeline_stage=model_stages[-1]))

    # Older API variant: an explicit optimizer_stage that computes and
    # applies the gradients itself, in contrast to the optimizer_function
    # used in the other examples here.
    def optimizer_stage(lr, loss, cross_entropy, accuracy):
        grads_and_vars = calculate_gradients(
            loss, opts["weight_decay"] * opts['loss_scaling'], opts)
        optimizer = get_optimizer(opts)(lr)
        apply_grads = optimizer.apply_gradients(grads_and_vars=grads_and_vars)
        return (loss / opts["loss_scaling"], cross_entropy, accuracy,
                lr * opts['loss_scaling'], lr, apply_grads)

    return pipelining_ops.pipeline(
        computational_stages=computational_stages,
        pipeline_depth=int(opts['pipeline_depth']),
        repeat_count=iterations_per_step,
        inputs=[learning_rate / opts['loss_scaling']],
        infeed_queue=infeed,
        outfeed_queue=outfeed,
        optimizer_stage=optimizer_stage,
        pipeline_schedule=next(
            p for p in list(pipelining_ops.PipelineSchedule)
            if opts["pipeline_schedule"] == str(p).split(".")[-1]),
        name="Pipeline")
def basic_pipelined_training_step(model, opts, learning_rate, infeed, outfeed,
                                  iterations_per_step=1):
    def first_stage(learning_rate, image, label, pipeline_stage=None):
        return learning_rate, pipeline_stage(image), label

    def final_stage(learning_rate, x, label, pipeline_stage=None):
        x = pipeline_stage(x)
        loss, cross_entropy, accuracy = calculate_loss(x, label, opts)
        # The learning rate is scaled down to compensate for the scaled loss.
        return (loss, cross_entropy, accuracy,
                learning_rate / opts["loss_scaling"])

    model_stages = model(opts)
    computational_stages = [
        partial(first_stage, pipeline_stage=model_stages[x])
        for x in range(len(model_stages) - 1)
    ]
    computational_stages.append(
        partial(final_stage, pipeline_stage=model_stages[-1]))

    def optimizer_function(loss, _, __, lr):
        optimizer = get_optimizer(opts)(lr)
        return pipelining_ops.OptimizerFunctionOutput(
            optimizer, loss * opts["loss_scaling"])

    return pipelining_ops.pipeline(
        computational_stages=computational_stages,
        pipeline_depth=int(opts['pipeline_depth']),
        repeat_count=iterations_per_step,
        inputs=[learning_rate],
        infeed_queue=infeed,
        outfeed_queue=outfeed,
        optimizer_function=optimizer_function,
        pipeline_schedule=next(
            p for p in list(pipelining_ops.PipelineSchedule)
            if opts["pipeline_schedule"] == str(p).split(".")[-1]),
        name="Pipeline")
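# Aside on the schedule lookup used above: PipelineSchedule is a Python enum
# (members Grouped, Interleaved, Sequential), so on Python versions where
# str(member) renders as "PipelineSchedule.<Name>" the next(...) scan above
# resolves the option string to a member by name. A plain enum name lookup
# does the same and fails loudly on a misspelled option (a sketch, assuming
# the option string matches a member name exactly):
from tensorflow.python.ipu.ops import pipelining_ops

def schedule_from_name(name):
    try:
        return pipelining_ops.PipelineSchedule[name]
    except KeyError:
        valid = ", ".join(s.name for s in pipelining_ops.PipelineSchedule)
        raise ValueError(
            "Unknown pipeline schedule '{}'; expected one of: {}".format(
                name, valid))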
def _internal_run_loop(self, infeed_queue, outfeed_queue, repeat_count, mode):
    training = mode == ModeKeys.TRAIN
    # Dictionary mapping reference tensors to computed tensors.
    tensor_dict = OrderedDict()

    def get_inputs_and_targets(*args):
        args = nest.flatten(args)
        num_inputs = len(self.inputs)
        inputs = list(args[:num_inputs])
        targets = list(args[num_inputs:])
        assert len(inputs) == num_inputs
        # "Execute" the input layers.
        executed_inputs = []
        for op, layer, tensor in zip(self.inputs, self._input_layers, inputs):
            executed_inputs.append(layer(tensor))
            tensor_dict[str(id(op))] = executed_inputs[-1]
            if isinstance(op, ops.Tensor) and isinstance(tensor, ops.Tensor):
                try:
                    tensor.set_shape(tensor.shape.merge_with(op.shape))
                except ValueError:
                    logging.warning(
                        'Model was constructed with shape {} for input {}, '
                        'but it was re-called on a Tensor with incompatible '
                        'shape {}.'.format(op.shape, op, tensor.shape))
        return executed_inputs, targets

    def main_body(stage_id, *args):
        if stage_id == self.stages[0]:
            inputs, targets = get_inputs_and_targets(*args)
        else:
            inputs = list(args[:len(tensor_dict)])
            targets = list(args[len(inputs):])
            # Update the tensor dict with the inputs.
            for idx, k in enumerate(tensor_dict):
                tensor_dict[k] = inputs[idx]
        # Execute this stage's layer nodes in post order.
        for i in self._stage_node_ids[stage_id]:
            node = self._post_order_node_execution[len(self.inputs) + i]
            if node._pipeline_stage == stage_id:  # pylint: disable=protected-access
                self._execute_layer_node(node, training, tensor_dict)  # pylint: disable=protected-access
        if stage_id == self.stages[-1]:
            return self._get_output_tensors(tensor_dict)  # pylint: disable=protected-access
        return list(tensor_dict.values()) + targets

    def inference_body(stage_id, *args):
        return main_body(stage_id, *args)

    def training_body(stage_id, *args):
        x = main_body(stage_id, *args)
        if stage_id == self.stages[-1]:
            self._set_output_attrs(x)
            targets = args[-len(self.outputs)]
            return self._add_loss(targets)
        return x

    # Function for generating the optimizer config for pipelines.
    def optimizer_function(loss, *_):
        if not self.trainable_weights:
            raise ValueError(
                "Model must have at least one trainable parameter.")
        opt = self._get_optimizer()
        return pipelining_ops.OptimizerFunctionOutput(opt, loss)

    # The pipeline stages, a set of feed forward functions.
    if mode == ModeKeys.PREDICT:
        stage_fn = inference_body
    else:
        stage_fn = training_body

    stages = []
    for stage in self.stages:
        stages.append(partial(stage_fn, stage))

    opt = optimizer_function if training else None

    pipeline = pipelining_ops.pipeline(
        stages,
        gradient_accumulation_count=self.gradient_accumulation_count,
        repeat_count=repeat_count,
        inputs=[],
        infeed_queue=infeed_queue,
        outfeed_queue=outfeed_queue,
        optimizer_function=opt,
        device_mapping=self.device_mapping,
        pipeline_schedule=self.pipeline_schedule,
        forward_propagation_stages_poplar_options=(
            self.forward_propagation_stages_poplar_options),
        backward_propagation_stages_poplar_options=(
            self.backward_propagation_stages_poplar_options),
        weight_update_poplar_options=self.weight_update_poplar_options,
        replicated_optimizer_state_sharding=(
            self.replicated_optimizer_state_sharding),
        offload_activations=self.offload_activations,
        offload_gradient_accumulation_buffers=(
            self.offload_gradient_accumulation_buffers),
        replicated_weight_sharding=self.replicated_weight_sharding,
        offload_weights=self.offload_weights,
        name=self.name,
        **self.args)
    return pipeline.outputs
def _internal_run_loop(self, infeed_queue, outfeed_queue, repeat_count, mode):
    training = mode == ModeKeys.TRAIN

    # Plain functions to build a stage.
    def call_inference_stage(stage_id, inputs):
        # Record the inputs of the first stage.
        if stage_id == 0 and not self.inputs:
            self._set_input_attrs(inputs)
        x = inputs
        for l in self.stages[stage_id]:
            kwargs = {}
            argspec = tf_inspect.getfullargspec(l.call).args
            if 'training' in argspec:
                kwargs['training'] = training
            x = l(x, **kwargs)
        return x

    def call_training_stage(stage_id, inputs, targets):
        x = call_inference_stage(stage_id, inputs)
        # Recompile the model now that we know the inputs and outputs, and
        # then create the losses and metrics.
        if stage_id == len(self.stages) - 1:
            self._set_output_attrs(x)
            return self._add_loss(targets)
        return x, targets

    # Function for generating the optimizer config for pipelines.
    def optimizer_function(loss, *_):
        if not self.trainable_weights:
            raise ValueError(
                "Model must have at least one trainable parameter.")
        opt = self._get_optimizer()
        return pipelining_ops.OptimizerFunctionOutput(opt, loss)

    # The pipeline stages, a set of feed forward functions.
    if mode == ModeKeys.PREDICT:
        stage_fn = call_inference_stage
    else:
        stage_fn = call_training_stage

    stages = []
    for stage_id in range(len(self.stages)):
        stages.append(partial(stage_fn, stage_id))

    opt = optimizer_function if training else None

    pipeline = pipelining_ops.pipeline(
        stages,
        gradient_accumulation_count=self.gradient_accumulation_count,
        repeat_count=repeat_count,
        inputs=[],
        infeed_queue=infeed_queue,
        outfeed_queue=outfeed_queue,
        optimizer_function=opt,
        device_mapping=self.device_mapping,
        pipeline_schedule=self.pipeline_schedule,
        forward_propagation_stages_poplar_options=(
            self.forward_propagation_stages_poplar_options),
        backward_propagation_stages_poplar_options=(
            self.backward_propagation_stages_poplar_options),
        weight_update_poplar_options=self.weight_update_poplar_options,
        replicated_optimizer_state_sharding=(
            self.replicated_optimizer_state_sharding),
        offload_activations=self.offload_activations,
        offload_gradient_accumulation_buffers=(
            self.offload_gradient_accumulation_buffers),
        replicated_weight_sharding=self.replicated_weight_sharding,
        offload_weights=self.offload_weights,
        name=self.name,
        **self.args)
    return pipeline.outputs
def build_network(infeed, outfeed, iterations_per_step=1, bert_config=None,
                  opts=None, learning_rate=None, is_training=True):
    # Build the model.
    if opts["groupbert"]:
        logger.info(
            "************* Using GroupBERT model architecture *************")
        pipeline_model = bert_ipu.GroupBertModel(bert_config,
                                                 is_training=is_training)
    else:
        pipeline_model = bert_ipu.BertModel(bert_config,
                                            is_training=is_training)

    # Build the stages and the device mapping.
    computational_stages = build_squad_pipeline_stages(
        pipeline_model, bert_config, opts, is_training)
    device_mapping = opts['device_mapping']
    logger.info(
        f"************* computational stages: *************\n{computational_stages}")
    logger.info(f"************* device mapping: *************\n{device_mapping}")

    # Set the IPU-specific available memory proportion, one value per stage.
    if isinstance(opts['available_memory_proportion'], float):
        available_memory_proportion_list = [
            str(opts['available_memory_proportion'])
        ] * len(device_mapping)
    else:
        available_memory_proportion_list = [
            str(opts['available_memory_proportion'][device])
            for device in device_mapping
        ]

    if len(available_memory_proportion_list) != len(device_mapping):
        raise ValueError(
            "The available_memory_proportion list must be the same length as "
            "the number of stages in the pipeline.")

    # One PipelineStageOptions per stage, using the validated per-stage
    # memory proportion values.
    options = [
        ipu.pipelining_ops.PipelineStageOptions(
            matmul_options={
                "availableMemoryProportion": amp,
                "partialsType": opts["partials_type"]
            },
            convolution_options={"partialsType": opts["partials_type"]})
        for amp in available_memory_proportion_list
    ]

    if is_training:
        # Define the optimizer used for the weight update.
        def optimizer_function(learning_rate, total_loss, *args):
            optimizer = get_optimizer(learning_rate, opts['loss_scaling'],
                                      opts['replicas'], opts)
            if opts["replicas"] > 1:
                optimizer = ipu.optimizers.cross_replica_optimizer.CrossReplicaOptimizer(
                    optimizer)
            return pipelining_ops.OptimizerFunctionOutput(
                optimizer, total_loss * opts['loss_scaling'])

        return pipelining_ops.pipeline(
            computational_stages=computational_stages,
            gradient_accumulation_count=opts['gradient_accumulation_count'],
            repeat_count=iterations_per_step,
            inputs=[learning_rate],
            infeed_queue=infeed,
            outfeed_queue=outfeed,
            device_mapping=device_mapping,
            forward_propagation_stages_poplar_options=options,
            backward_propagation_stages_poplar_options=options,
            offload_weight_update_variables=opts['variable_offloading'],
            optimizer_function=optimizer_function,
            recomputation_mode=ipu.ops.pipelining_ops.RecomputationMode[
                opts['recomputation_mode']],
            name="Pipeline")
    else:
        return pipelining_ops.pipeline(
            computational_stages=computational_stages,
            gradient_accumulation_count=opts['gradient_accumulation_count'],
            repeat_count=iterations_per_step,
            inputs=[],
            infeed_queue=infeed,
            outfeed_queue=outfeed,
            device_mapping=device_mapping,
            forward_propagation_stages_poplar_options=options,
            backward_propagation_stages_poplar_options=options,
            offload_weight_update_variables=opts['variable_offloading'],
            name="Pipeline")
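# A sketch (not from the original source) of how the pipeline op returned by
# build_network is typically compiled and driven. The surrounding names
# (infeed, outfeed, opts, bert_config, iterations_per_step, num_steps) are
# assumed to exist as in the example above; the placeholder-fed learning
# rate mirrors the inputs=[learning_rate] argument.
import tensorflow.compat.v1 as tf
from tensorflow.python import ipu

learning_rate = tf.placeholder(tf.float32, shape=[])

def training_step(lr):
    return build_network(infeed, outfeed,
                         iterations_per_step=iterations_per_step,
                         bert_config=bert_config, opts=opts,
                         learning_rate=lr, is_training=True)

with ipu.scopes.ipu_scope("/device:IPU:0"):
    train_op = ipu.ipu_compiler.compile(training_step,
                                        inputs=[learning_rate])
outfeed_dequeue = outfeed.dequeue()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(infeed.initializer)
    for _ in range(num_steps):
        # Each run executes iterations_per_step pipeline repeats on-device.
        sess.run(train_op, feed_dict={learning_rate: 1e-4})
        results = sess.run(outfeed_dequeue)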