def bound_train_loop(lr):
    """Build the training pipeline op; `lr` is threaded through as a pipeline input.

    Closure state: `fc_layers`, `opts`, `opt_cls`, `opt_kws`,
    `disable_dense_grad`, `train_batches_per_step`, `png_queue` and the
    train infeed/outfeed queues all come from the enclosing scope.
    """
    stages, optimizer_fn = make_stages(
        fc_layers, opts.droprate, opt_cls, opt_kws,
        training=True,
        disable_dense_grad=disable_dense_grad,
        iterations_per_dense_grad=train_batches_per_step,
        png_queue=png_queue)

    # Resolve the PipelineSchedule enum member whose name matches the option.
    schedule = next(s for s in pipelining_ops.PipelineSchedule
                    if s.name == opts.pipeline_schedule)

    return pipelining_ops.pipeline(
        computational_stages=stages,
        gradient_accumulation_count=opts.gradient_accumulation_count,
        repeat_count=train_batches_per_step,
        inputs=[lr],
        device_mapping=[0, 0],
        infeed_queue=infeed_train_queue,
        outfeed_queue=outfeed_train_queue,
        optimizer_function=optimizer_fn,
        offload_weight_update_variables=False,
        outfeed_loss=False,
        pipeline_schedule=schedule,
        name="Pipeline_Train")
def model_func(self, model, opts, global_step_holder, infeed_queue, outfeed_queue):
    """Construct the pretraining pipeline op for this model.

    Args:
        model: model object handed to the stage builder.
        opts: options dict; `opts["train"]` supplies `device_mapping`
            and `pipeline_depth`.
        global_step_holder: tensor passed as the single pipeline input.
        infeed_queue / outfeed_queue: IPU infeed/outfeed queues.

    Returns:
        The `pipelining_ops.pipeline` op.
    """
    computational_stages = self.build_pretrain_pipeline_stages(model, opts)

    # One identical Poplar option set per stage: half-precision partials and
    # a 20% available-memory proportion for matmuls.
    per_stage_options = ipu.pipelining_ops.PipelineStageOptions(
        matmul_options={
            "availableMemoryProportion": str(0.2),
            "partialsType": "half"
        },
        convolution_options={"partialsType": "half"})
    options = [per_stage_options] * len(opts["train"]["device_mapping"])

    # self.optimize_func is a bound method; ipu_compiler.compile requires a
    # plain function, so wrap it to hide `self`.
    def optimizer_wrapper(giou_loss, conf_loss, prob_loss, lr):
        return self.optimize_func(giou_loss, conf_loss, prob_loss, lr)

    return pipelining_ops.pipeline(
        computational_stages=computational_stages,
        gradient_accumulation_count=opts["train"]["pipeline_depth"],
        repeat_count=self.repeat_count,
        optimizer_function=optimizer_wrapper,
        inputs=[global_step_holder],
        forward_propagation_stages_poplar_options=options,
        backward_propagation_stages_poplar_options=options,
        infeed_queue=infeed_queue,
        outfeed_queue=outfeed_queue,
        offload_activations=False,
        offload_gradient_accumulation_buffers=False,
        offload_weight_update_variables=False,
        device_mapping=opts["train"]["device_mapping"],
        name="Pipeline")
def model_pipeline(x, lr):
    """Three-stage training pipeline fed with `x` and the learning rate."""
    training_stages = [stage1, stage2, stage3]
    return pipelining_ops.pipeline(
        computational_stages=training_stages,
        gradient_accumulation_count=12,
        inputs=[x, lr],
        outfeed_queue=outfeed_queue,
        optimizer_function=optimizer_function)
def my_net(x):
    """Two-stage pipeline trained with the `Sequential` schedule."""
    return pipelining_ops.pipeline(
        computational_stages=[stage1, stage2],
        gradient_accumulation_count=10,
        inputs=[x],
        optimizer_function=optimizer_function,
        pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)
def model_pipeline(x, y):
    """Three-stage inference pipeline (no optimizer) using the `Sequential` schedule."""
    return pipelining_ops.pipeline(
        computational_stages=[stage1, stage2, stage3],
        gradient_accumulation_count=12,
        inputs=[x, y],
        outfeed_queue=outfeed_queue,
        pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)
def model_pipeline(x):
    """Two-stage pipeline using the `Grouped` schedule; results go to the outfeed."""
    return pipelining_ops.pipeline(
        computational_stages=[stage1, stage2],
        gradient_accumulation_count=10,
        inputs=[x],
        outfeed_queue=outfeed_queue,
        pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)
def my_net(c):
    """Three-stage `Sequential` pipeline reading from the infeed and writing to the outfeed."""
    return pipelining_ops.pipeline(
        computational_stages=[stage1, stage2, stage3],
        gradient_accumulation_count=12,
        inputs=[c],
        infeed_queue=infeed_queue,
        outfeed_queue=outfeed_queue,
        pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)
def my_net(c):
    """Three-stage `Grouped` pipeline with an explicit stage-to-IPU mapping."""
    return pipelining_ops.pipeline(
        computational_stages=[stage1, stage2, stage3],
        gradient_accumulation_count=12,
        inputs=[c],
        infeed_queue=infeed_queue,
        outfeed_queue=outfeed_queue,
        device_mapping=device_mapping,
        pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)
def my_net(*args):
    """Build the pipeline, forwarding every positional argument as a stage input.

    All configuration (`stages`, counts, queues, `schedule`, `device_mapping`)
    is taken from the enclosing scope.
    """
    return pipelining_ops.pipeline(
        computational_stages=stages,
        gradient_accumulation_count=gradient_accumulation_count,
        repeat_count=repeat_count,
        batch_serialization_iterations=batch_serialization_iterations,
        inputs=args,
        optimizer_function=optimizer_function,
        infeed_queue=infeed_queue,
        outfeed_queue=outfeed_queue,
        pipeline_schedule=schedule,
        device_mapping=device_mapping)
def bound_test_loop():
    """Build the validation pipeline op (no optimizer; a dummy LR input).

    Mirrors `bound_train_loop` but with `training=False`, the test queues,
    and `optimizer_function=None`.
    """
    stages, _ = make_stages(
        fc_layers, opts.droprate, opt_cls, opt_kws,
        training=False,
        disable_dense_grad=disable_dense_grad,
        iterations_per_dense_grad=test_batches_per_step,
        png_queue=png_queue)
    return pipelining_ops.pipeline(
        computational_stages=stages,
        gradient_accumulation_count=opts.gradient_accumulation_count,
        repeat_count=test_batches_per_step,
        # FIX: `inputs` takes a list of tensors. The bare Variable was
        # previously passed directly, inconsistent with the training loop's
        # `inputs=[lr]` and the list the IPU pipeline API expects.
        inputs=[tf.Variable(initial_value=0.0, name="dummy_lr")],
        device_mapping=[0, 0],
        infeed_queue=infeed_test_queue,
        outfeed_queue=outfeed_test_queue,
        optimizer_function=None,
        outfeed_loss=False,
        pipeline_schedule=next(p for p in pipelining_ops.PipelineSchedule
                               if opts.pipeline_schedule == p.name),
        name="Pipeline_Validation")
def forward_pass(opts, transformer, iterations_per_step, is_training, outfeed,
                 dense_queue, infeed):
    """Build the forward (and, when training, backward) graph for the transformer.

    Two construction paths:
      * non-pipelined (`not opts.pipeline`): builds the model inline, an
        optimizer when `is_training`, and enqueues metrics on `outfeed`;
      * pipelined: wraps the transformer's stage functions and returns a
        `pipelining_ops.pipeline` op.

    Args:
        opts: option namespace (dtype, pipeline flags, optimizer settings, ...).
        transformer: model object providing `language_model` /
            `language_model_stages` and `source_sequence_length`.
        iterations_per_step: device iterations per host IO step.
        is_training: whether to build the optimizer / training outputs.
        outfeed: outfeed queue for metrics.
        dense_queue: outfeed used by the sparse optimizer for prune-and-grow.
        infeed: infeed queue (pipelined) or input tensor (non-pipelined).

    Returns:
        The outfeed enqueue op (non-pipelined) or the pipeline op (pipelined).
    """
    def make_counter():
        # Maintain an on-device iteration counter and return a bool tensor
        # that is True on the last iteration of the current IO step.
        with tf.variable_scope("counter", reuse=tf.AUTO_REUSE, use_resource=True):
            itr_counter = tf.get_variable("iterations", [], tf.int32,
                                          trainable=False)
            increment_counter = tf.assign_add(itr_counter, 1)
            mod_itrs = tf.math.floormod(increment_counter, iterations_per_step)
            last_itr = tf.equal(mod_itrs, 0, name="last_update_itr")
            # Add accumulation counter if pipelined: only the final gradient
            # accumulation iteration of the final device iteration counts.
            if opts.pipeline:
                grad_counter = internal_ops.get_current_iteration_counter()
                last_grad_itr = tf.equal(grad_counter,
                                         opts.gradient_accumulation_count - 1,
                                         name="last_grad_itr")
                last_itr = tf.logical_and(last_itr, last_grad_itr,
                                          name="last_itr")
        return last_itr

    def make_src_mask(last_itr, source):
        # Build the additive attention mask enforcing autoregressive
        # (upper-triangular) masking; masked positions get -10000.
        with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE,
                               use_resource=True):
            # Side effect: tells the model whether to compute dense grads
            # on this iteration.
            transformer.compute_dense_grad = last_itr
            # NOTE(review): np.bool is removed in NumPy >= 1.24 — consider
            # plain `bool` when the pinned NumPy allows it.
            autoregressive_mask = tf.constant(
                np.triu(np.ones([S, S], dtype=np.bool), k=1))
            source_mask = autoregressive_mask
            source_mask = tf.cast(source_mask, opts.dtype) * -10000
        return source_mask

    def loss_and_metrics(logits, source):
        # Compute training loss, tracking loss and accuracy metrics.
        with tf.variable_scope("metrics", reuse=tf.AUTO_REUSE,
                               use_resource=True):
            # Implement autoregressive loss through teacher forcing
            # The first few tokens have no hope of being correct
            # so we exclude the first "offset" tokens from the loss
            offset = opts.autoregression_offset
            logits = tf.cast(logits[:, offset:-1],
                             tf.float32)  # logits always full precision
            target = source[:, offset + 1:]
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            # Accuracy
            acc, acc_op = tf.metrics.accuracy(target, predictions,
                                              name="token_accuracy")
            # Unweighted cross-entropy for tracking progress
            nll_loss = tf.losses.sparse_softmax_cross_entropy(labels=target,
                                                              logits=logits)
            nll_loss = tf.reduce_mean(nll_loss)
            perplexity = tf.exp(nll_loss)
            # Training loss (weighted cross-entropy)
            # the weight of the loss on each token is normalized by the number
            # of times that token appears in the sequence
            # For instance if there are 10 padding tokens, the loss from each
            # will have a weight of 1/10
            nll_weights = tf.expand_dims(target, -1)
            nll_weights = tf.equal(nll_weights,
                                   tf.transpose(nll_weights, perm=[0, 2, 1]))
            nll_weights = tf.cast(nll_weights, tf.float32)
            nll_weights = 1.0 / tf.reduce_sum(nll_weights, -1)
            training_loss = tf.losses.sparse_softmax_cross_entropy(
                labels=target, logits=logits, weights=nll_weights)
            training_loss = tf.reduce_mean(training_loss)
        return {
            "training_loss": training_loss,
            "token_accuracy": acc,
            "acc_op": acc_op,
            "nll_loss": nll_loss,
            "perplexity": perplexity,
            "predictions": predictions,
            "target": target
        }

    def make_lr_schedule(global_step):
        with tf.variable_scope("training", reuse=tf.AUTO_REUSE,
                               use_resource=True):
            # The learning rate schedule needs to be part of the graph so the
            # lr can change between different batches within the same io step
            schedule = tf_utils.BertSchedule(opts, opts.dtype)
            lr = schedule(global_step)
        return lr

    def make_optimizer(lr, last_itr):
        # Compose the optimizer from wrapper classes selected by the options:
        # sparse -> global-step update -> optional loss scaling -> optional
        # grad averaging -> optional grad clipping -> optional FP-format slots.
        with tf.variable_scope("training", reuse=tf.AUTO_REUSE,
                               use_resource=True):
            optimizer_class, optimizer_kwargs = build_optimizer(
                opts.optimizer, opts.optimizer_arg)
            optimizer_class = optimizers.SparseOptimizer(optimizer_class)
            optimizer_class = global_step_update_opt.GlobalStepUpdateOptimizer(
                optimizer_class)
            if opts.loss_scale != 1:
                optimizer_class = scaling_opt.LossScalingOptimizer(
                    optimizer_class)
                optimizer_kwargs['loss_scale'] = opts.loss_scale
                optimizer_kwargs[
                    'unscale_grad_pre_acc'] = opts.unscale_grad_pre_acc
            if opts.grad_acculation_mode == 'Avg':
                # Average (rather than sum) accumulated gradients.
                optimizer_class = scaling_opt.GradScalingOptimizer(
                    optimizer_class)
                optimizer_kwargs[
                    'grad_scale'] = 1 / opts.gradient_accumulation_count
                optimizer_kwargs[
                    'scale_grad_pre_acc'] = opts.scale_grad_pre_acc
            if opts.grad_norm_clip:
                optimizer_class = grad_clip_opt.GradientClippingOptimizer(
                    optimizer_class)
                optimizer_kwargs['norm_clip_threshold'] = opts.grad_norm_clip
            if opts.slots_fp_type is not None and tf.as_dtype(
                    opts.slots_fp_type) != opts.dtype:
                # Keep optimizer slots in a different FP format than the model.
                optimizer_class = fp_slot_opt.SelectableSlotFPFormatOptimizer(
                    optimizer_class)
                optimizer_kwargs['slots_dtype'] = opts.slots_fp_type
                optimizer_kwargs[
                    'force_fp32_weight_update'] = opts.force_fp32_weight_update
            optimizer = optimizer_class(
                learning_rate=lr,
                **optimizer_kwargs,
                sparse_layers=transformer.sparse_layers.values(),
                # Dense grads are only produced on the last iteration.
                dense_gradient_condition=enable_dense_grad and last_itr,
                prune_and_grow_outfeed=dense_queue)
        return optimizer

    def make_pipeline_opt(outputs):
        # Optimizer-function hook for pipelining_ops.pipeline: `outputs` is the
        # metrics dict returned by the last stage.
        optimizer = make_optimizer(outputs["learning_rate"],
                                   outputs["last_itr"])
        return pipelining_ops.OptimizerFunctionOutput(
            optimizer, outputs["training_loss"])

    def make_outfeed(lr, global_step, metrics, itr_counter):
        # Enqueue results on the outfeed after the accuracy update op has run.
        acc_op = metrics['acc_op']
        if is_training:
            with tf.control_dependencies([acc_op]):
                output_dict = {
                    **metrics,
                    "learning_rate": lr,
                    "global_step": tf.cast(global_step, tf.int32),
                    "iteration_counter": itr_counter
                }
                output = outfeed.enqueue(output_dict)
        else:
            # At inference time stream back the loss and accuracy
            with tf.control_dependencies([acc_op]):
                output = outfeed.enqueue(metrics)
        return output

    # Sequence length (used by make_src_mask).
    S = transformer.source_sequence_length
    # Dense gradients are only needed when pruning is enabled.
    enable_dense_grad = opts.prune_ratio is not None and opts.prune_ratio > 0

    if not opts.pipeline:
        # This autoregressive model is self-labeling; needs only 1 input.
        source = infeed
        last_itr = make_counter()
        source_mask = make_src_mask(last_itr, source)
        # Build the encoder
        logits = transformer.language_model(
            source=source,
            source_mask=source_mask,
            add_projection_layer=True,
            last_itr=last_itr,
            enable_dense_grad=enable_dense_grad,
            sparse_embeddings=opts.sparse_embeddings)
        metrics = loss_and_metrics(logits, source)
        if is_training:
            global_step = tf.cast(tf.train.get_or_create_global_step(),
                                  tf.int32)
            lr = make_lr_schedule(global_step)
            optimizer = make_optimizer(lr, last_itr)
            train_op = optimizer.minimize(metrics['training_loss'],
                                          global_step=global_step)
        else:
            lr, global_step = None, None
            train_op = tf.no_op()
        with tf.control_dependencies([train_op]):
            with tf.variable_scope("counter", reuse=tf.AUTO_REUSE,
                                   use_resource=True):
                itr_counter = tf.get_variable("iterations", [], tf.int32,
                                              trainable=False)
            output = make_outfeed(lr, global_step, metrics, itr_counter)
        return output
    else:
        def first_stage(global_step, source, input_stage_func):
            # Prepend counter/mask construction to the model's first stage.
            last_itr = make_counter()
            source_mask = make_src_mask(last_itr, source)
            return input_stage_func(source, source_mask, last_itr, global_step)

        def last_stage(encoder_out, source_mask, *args, **kwargs):
            # Append loss/metric computation to the model's last stage.
            last_itr = args[0]
            global_step = args[1]
            source = args[2]
            output_stage_func = kwargs['output_stage_func']
            logits, *_ = output_stage_func(encoder_out, source_mask, *args)
            metrics = loss_and_metrics(logits, source)
            if is_training:
                # Extra outputs consumed by make_pipeline_opt.
                metrics.update({
                    "learning_rate": make_lr_schedule(global_step),
                    "last_itr": last_itr,
                    "global_step": tf.convert_to_tensor(global_step)
                })
                return metrics
            else:
                metrics['last_itr'] = last_itr
                return metrics

        stages, device_mapping, stage_options = transformer.language_model_stages(
            enable_dense_grad=enable_dense_grad,
            sparse_embeddings=opts.sparse_embeddings)
        stages[0] = partial(first_stage, input_stage_func=stages[0])
        stages[-1] = partial(last_stage, output_stage_func=stages[-1])
        pipeline_op = pipelining_ops.pipeline(
            computational_stages=stages,
            gradient_accumulation_count=opts.gradient_accumulation_count,
            gradient_accumulation_dtype=opts.gradient_accumulation_dtype,
            repeat_count=iterations_per_step,
            inputs=[tf.cast(tf.train.get_or_create_global_step(), tf.int32)],
            infeed_queue=infeed,
            outfeed_queue=outfeed,
            optimizer_function=make_pipeline_opt if is_training else None,
            device_mapping=device_mapping,
            offload_activations=opts.offload_activations,
            offload_gradient_accumulation_buffers=opts.
            offload_gradient_accumulation_buffers,
            offload_weight_update_variables=opts.
            offload_weight_update_variables,
            forward_propagation_stages_poplar_options=stage_options,
            backward_propagation_stages_poplar_options=stage_options,
            name="Pipeline")
        return pipeline_op
def my_net(x):
    """Two-stage pipeline using the `Sequential` schedule (no outfeed, no optimizer)."""
    return pipelining_ops.pipeline(
        computational_stages=[stage1, stage2],
        gradient_accumulation_count=10,
        inputs=[x],
        pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)
def testPipelineInvalidDeviceMapping(self):
    # Verify that pipelining_ops.pipeline rejects malformed `device_mapping`
    # arguments: a non-list/tuple value, and mappings whose length does not
    # match the number of computational stages.
    dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
    dataset = dataset.batch(batch_size=2, drop_remainder=True)

    def dataset_parser(value):
        # Produce a dict so the stages receive named keyword inputs.
        a = value
        b = (value + 10.) / 2.0
        return {"a": a, "b": b}

    dataset = dataset.map(dataset_parser)
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed3")
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed3")

    def stage1(c, **kwargs):
        # Conv over the infeed input "a"; passes `c` through untouched.
        with variable_scope.variable_scope("vs", use_resource=True):
            y = layers.Conv2D(2,
                              1,
                              use_bias=True,
                              kernel_initializer=init_ops.ones_initializer(),
                              name='conv1')(kwargs["a"])
            return y + kwargs["b"], c

    def stage2(x, c):
        return math_ops.reduce_sum(x) + c

    def stage3(x):
        return x

    with ops.device('cpu'):
        c = array_ops.placeholder(np.float32, shape=[])

    # Wrong type:
    with self.assertRaisesRegex(
            TypeError, 'device_mapping argument needs to be a list or a tuple'):
        pipelining_ops.pipeline(
            [stage1, stage2, stage3],
            3,
            inputs=[c],
            infeed_queue=infeed_queue,
            outfeed_queue=outfeed_queue,
            device_mapping=1,
            pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

    # Too many values:
    with self.assertRaisesRegex(ValueError,
                                'Each stage must be mapped to an IPU'):
        pipelining_ops.pipeline(
            [stage1, stage2, stage3],
            3,
            inputs=[c],
            infeed_queue=infeed_queue,
            outfeed_queue=outfeed_queue,
            device_mapping=list(range(4)),
            pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

    # Not enough values:
    with self.assertRaisesRegex(ValueError,
                                'Each stage must be mapped to an IPU'):
        pipelining_ops.pipeline(
            [stage1, stage2, stage3],
            3,
            inputs=[c],
            infeed_queue=infeed_queue,
            outfeed_queue=outfeed_queue,
            device_mapping=tuple(range(1)),
            pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)
def testPipelineInvalidDeviceMapping(self):
    # Verify batch-serialization restrictions: all stages must map to one IPU,
    # and only the `Sequential` schedule is supported.
    dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
    dataset = dataset.batch(batch_size=2, drop_remainder=True)

    def dataset_parser(value):
        # Produce a dict so the stages receive named keyword inputs.
        a = value
        b = (value + 10.) / 2.0
        return {"a": a, "b": b}

    dataset = dataset.map(dataset_parser)
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed3")
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed3")

    def stage1(c, **kwargs):
        # Conv over the infeed input "a"; passes `c` through untouched.
        with variable_scope.variable_scope("vs", use_resource=True):
            y = layers.Conv2D(
                2,
                1,
                use_bias=True,
                kernel_initializer=init_ops.ones_initializer(),
                name='conv1')(kwargs["a"])
            return y + kwargs["b"], c

    def stage2(x, c):
        return math_ops.reduce_sum(x) + c

    def stage3(x):
        return x

    with ops.device('cpu'):
        c = array_ops.placeholder(np.float32, shape=[])

    # Stages mapped to more than one IPU while batch serialization is on:
    with self.assertRaisesRegex(
            NotImplementedError,
            'When using batch serialization, all the pipeline '
            'stages need to be mapped to a single IPU.'):
        pipelining_ops.pipeline(
            [stage1, stage2, stage3],
            3,
            inputs=[c],
            infeed_queue=infeed_queue,
            outfeed_queue=outfeed_queue,
            device_mapping=[0, 1, 0],
            pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential,
            batch_serialization_iterations=4)

    # Non-Sequential schedule while batch serialization is on:
    with self.assertRaisesRegex(
            NotImplementedError,
            'Batch serialization is only supported with the '
            '`Sequential` schedule'):
        pipelining_ops.pipeline(
            [stage1, stage2, stage3],
            3,
            inputs=[c],
            infeed_queue=infeed_queue,
            outfeed_queue=outfeed_queue,
            device_mapping=[0, 0, 0],
            pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped,
            batch_serialization_iterations=4)
def my_net(x):
    """Single-stage pipeline using the `Grouped` schedule."""
    return pipelining_ops.pipeline(
        computational_stages=[stage1],
        gradient_accumulation_count=10,
        inputs=[x],
        pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)