Code Example #1
    def bound_train_loop(lr):
        stages, optimizer_fn = make_stages(
            fc_layers,
            opts.droprate,
            opt_cls,
            opt_kws,
            training=True,
            disable_dense_grad=disable_dense_grad,
            iterations_per_dense_grad=train_batches_per_step,
            png_queue=png_queue)

        return pipelining_ops.pipeline(
            computational_stages=stages,
            gradient_accumulation_count=opts.gradient_accumulation_count,
            repeat_count=train_batches_per_step,
            inputs=[lr],
            device_mapping=[0, 0],
            infeed_queue=infeed_train_queue,
            outfeed_queue=outfeed_train_queue,
            optimizer_function=optimizer_fn,
            offload_weight_update_variables=False,
            outfeed_loss=False,
            pipeline_schedule=next(p for p in pipelining_ops.PipelineSchedule
                                   if opts.pipeline_schedule == p.name),
            name="Pipeline_Train")
Code Example #2
File: train.py  Project: graphcore/examples
    def model_func(self, model, opts, global_step_holder, infeed_queue,
                   outfeed_queue):
        computational_stages = self.build_pretrain_pipeline_stages(model, opts)

        options = [
            ipu.pipelining_ops.PipelineStageOptions(
                matmul_options={
                    "availableMemoryProportion": str(0.2),
                    "partialsType": "half"
                },
                convolution_options={"partialsType": "half"})
        ] * len(opts["train"]["device_mapping"])

        # We write this wrapper because self.optimize_func takes "self" as its
        # first parameter, which causes an error when calling ipu_compiler.compile.
        def optimizer_wrapper(giou_loss, conf_loss, prob_loss, lr):
            return self.optimize_func(giou_loss, conf_loss, prob_loss, lr)

        pipeline_op = pipelining_ops.pipeline(
            computational_stages=computational_stages,
            gradient_accumulation_count=opts["train"]["pipeline_depth"],
            repeat_count=self.repeat_count,
            optimizer_function=optimizer_wrapper,
            inputs=[global_step_holder],
            forward_propagation_stages_poplar_options=options,
            backward_propagation_stages_poplar_options=options,
            infeed_queue=infeed_queue,
            outfeed_queue=outfeed_queue,
            offload_activations=False,
            offload_gradient_accumulation_buffers=False,
            offload_weight_update_variables=False,
            device_mapping=opts["train"]["device_mapping"],
            name="Pipeline")
        return pipeline_op
Code Example #3
 def model_pipeline(x, lr):
     return pipelining_ops.pipeline(
         [stage1, stage2, stage3],
         12,
         inputs=[x, lr],
         outfeed_queue=outfeed_queue,
         optimizer_function=optimizer_function)
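
These snippets are the functions that get compiled for the IPU; the surrounding driver code is omitted. Below is a minimal sketch of that wrapper for Example #3, assuming the Graphcore TensorFlow 1.x wheel and a recent SDK (the `IPUConfig` configuration API shown here differs between SDK versions); `stage1`..`stage3`, `optimizer_function` and `outfeed_queue` are assumed to be defined as in the snippet above, and the placeholder shapes are arbitrary.

import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.python import ipu
from tensorflow.python.ipu import ipu_compiler, scopes

# With the default device_mapping the three stages run on IPUs 0, 1 and 2;
# IPUs are requested in powers of two, hence four (SDK-version-dependent API).
cfg = ipu.config.IPUConfig()
cfg.auto_select_ipus = 4
cfg.configure_ipu_system()

with tf.device('cpu'):
    x = tf.placeholder(tf.float32, shape=[16, 32])  # arbitrary example shape
    lr = tf.placeholder(tf.float32, shape=[])

# Compile the pipelined model for the IPU device.
with scopes.ipu_scope("/device:IPU:0"):
    compiled_pipeline = ipu_compiler.compile(model_pipeline, inputs=[x, lr])

dequeue_outfeed = outfeed_queue.dequeue()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Each run pushes gradient_accumulation_count (12) mini-batches through the pipeline.
    sess.run(compiled_pipeline,
             feed_dict={x: np.zeros([16, 32], np.float32), lr: 0.01})
    results = sess.run(dequeue_outfeed)
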
Code Example #4
 def my_net(x):
   return pipelining_ops.pipeline(
       [stage1, stage2],
       10,
       inputs=[x],
       optimizer_function=optimizer_function,
       pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)
Code Example #5
 def model_pipeline(x, y):
   return pipelining_ops.pipeline(
       [stage1, stage2, stage3],
       12,
       inputs=[x, y],
       outfeed_queue=outfeed_queue,
       pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)
Code Example #6
 def model_pipeline(x):
     return pipelining_ops.pipeline(
         [stage1, stage2],
         10,
         inputs=[x],
         outfeed_queue=outfeed_queue,
         pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)
Code Example #7
 def my_net(c):
   return pipelining_ops.pipeline(
       [stage1, stage2, stage3],
       12,
       inputs=[c],
       infeed_queue=infeed_queue,
       outfeed_queue=outfeed_queue,
       pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)
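
Example #7 (like Examples #8, #10 and #11) assumes that `infeed_queue` and `outfeed_queue` already exist. A minimal sketch of how they are typically created from a `tf.data.Dataset`, following the pattern of Example #13 (the dataset contents and the feed name "my_feed" are placeholders; newer SDK versions drop the explicit feed-name argument):

import tensorflow.compat.v1 as tf
from tensorflow.python.ipu import ipu_infeed_queue, ipu_outfeed_queue

# Toy dataset; each element becomes the per-iteration input of the first stage.
dataset = tf.data.Dataset.from_tensor_slices(tf.zeros([64, 4, 4, 2], tf.float32))
dataset = dataset.batch(batch_size=2, drop_remainder=True).repeat()

infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "my_feed")
outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("my_feed")

# Before running the compiled pipeline, initialize the infeed:
#     sess.run(infeed_queue.initializer)
# and read results back with:
#     results = sess.run(outfeed_queue.dequeue())
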
Code Example #8
 def my_net(c):
     return pipelining_ops.pipeline(
         [stage1, stage2, stage3],
         12,
         inputs=[c],
         infeed_queue=infeed_queue,
         outfeed_queue=outfeed_queue,
         device_mapping=device_mapping,
         pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)
Code Example #9
 def my_net(*args):
     return pipelining_ops.pipeline(
         stages,
         gradient_accumulation_count,
         repeat_count=repeat_count,
         batch_serialization_iterations=batch_serialization_iterations,
         inputs=args,
         optimizer_function=optimizer_function,
         infeed_queue=infeed_queue,
         outfeed_queue=outfeed_queue,
         pipeline_schedule=schedule,
         device_mapping=device_mapping)
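
Several of the examples (#1, #2, #9, #11) pass an `optimizer_function`; it receives the outputs of the final computational stage and returns a `pipelining_ops.OptimizerFunctionOutput` pairing an optimizer with the loss it should minimize, as in Example #11's `make_pipeline_opt`. A minimal sketch, assuming the final stage returns a single loss tensor and using a plain gradient-descent optimizer purely for illustration:

import tensorflow.compat.v1 as tf
from tensorflow.python.ipu import pipelining_ops

def optimizer_function(loss):
    # The parameter list mirrors whatever the last stage returns.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
    return pipelining_ops.OptimizerFunctionOutput(optimizer, loss)
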
Code Example #10
    def bound_test_loop():
        stages, _ = make_stages(
            fc_layers,
            opts.droprate,
            opt_cls,
            opt_kws,
            training=False,
            disable_dense_grad=disable_dense_grad,
            iterations_per_dense_grad=test_batches_per_step,
            png_queue=png_queue)

        return pipelining_ops.pipeline(
            computational_stages=stages,
            gradient_accumulation_count=opts.gradient_accumulation_count,
            repeat_count=test_batches_per_step,
            inputs=[tf.Variable(initial_value=0.0, name="dummy_lr")],
            device_mapping=[0, 0],
            infeed_queue=infeed_test_queue,
            outfeed_queue=outfeed_test_queue,
            optimizer_function=None,
            outfeed_loss=False,
            pipeline_schedule=next(p for p in pipelining_ops.PipelineSchedule
                                   if opts.pipeline_schedule == p.name),
            name="Pipeline_Validation")
Code Example #11
File: train_sparse.py  Project: WN1695173791/examples
def forward_pass(opts, transformer, iterations_per_step, is_training, outfeed,
                 dense_queue, infeed):
    def make_counter():
        with tf.variable_scope("counter",
                               reuse=tf.AUTO_REUSE,
                               use_resource=True):
            itr_counter = tf.get_variable("iterations", [],
                                          tf.int32,
                                          trainable=False)
            increment_counter = tf.assign_add(itr_counter, 1)
            mod_itrs = tf.math.floormod(increment_counter, iterations_per_step)
            last_itr = tf.equal(mod_itrs, 0, name="last_update_itr")

            # Add accumulation counter if pipelined
            if opts.pipeline:
                grad_counter = internal_ops.get_current_iteration_counter()
                last_grad_itr = tf.equal(grad_counter,
                                         opts.gradient_accumulation_count - 1,
                                         name="last_grad_itr")

                last_itr = tf.logical_and(last_itr,
                                          last_grad_itr,
                                          name="last_itr")

        return last_itr

    def make_src_mask(last_itr, source):
        with tf.variable_scope("transformer",
                               reuse=tf.AUTO_REUSE,
                               use_resource=True):
            transformer.compute_dense_grad = last_itr
            autoregressive_mask = tf.constant(
                np.triu(np.ones([S, S], dtype=np.bool), k=1))
            source_mask = autoregressive_mask
            source_mask = tf.cast(source_mask, opts.dtype) * -10000
        return source_mask

    def loss_and_metrics(logits, source):
        with tf.variable_scope("metrics",
                               reuse=tf.AUTO_REUSE,
                               use_resource=True):
            # Implement autoregressive loss through teacher forcing.
            # The first few tokens have no hope of being correct,
            # so we exclude the first "offset" tokens from the loss.
            offset = opts.autoregression_offset
            logits = tf.cast(logits[:, offset:-1],
                             tf.float32)  # logits always full precision
            target = source[:, offset + 1:]
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)

            # Accuracy
            acc, acc_op = tf.metrics.accuracy(target,
                                              predictions,
                                              name="token_accuracy")

            # Unweighted cross-entropy for tracking progress
            nll_loss = tf.losses.sparse_softmax_cross_entropy(labels=target,
                                                              logits=logits)
            nll_loss = tf.reduce_mean(nll_loss)
            perplexity = tf.exp(nll_loss)

            # Training loss (weighted cross-entropy).
            # The weight of the loss on each token is normalized by the number of
            # times that token appears in the sequence. For instance, if there are
            # 10 padding tokens, the loss from each will have a weight of 1/10.
            nll_weights = tf.expand_dims(target, -1)
            nll_weights = tf.equal(nll_weights,
                                   tf.transpose(nll_weights, perm=[0, 2, 1]))
            nll_weights = tf.cast(nll_weights, tf.float32)
            nll_weights = 1.0 / tf.reduce_sum(nll_weights, -1)
            training_loss = tf.losses.sparse_softmax_cross_entropy(
                labels=target, logits=logits, weights=nll_weights)
            training_loss = tf.reduce_mean(training_loss)
        return {
            "training_loss": training_loss,
            "token_accuracy": acc,
            "acc_op": acc_op,
            "nll_loss": nll_loss,
            "perplexity": perplexity,
            "predictions": predictions,
            "target": target
        }

    def make_lr_schedule(global_step):
        with tf.variable_scope("training",
                               reuse=tf.AUTO_REUSE,
                               use_resource=True):
            # The learning rate schedule needs to be part of the graph so the lr can
            # change between different batches within the same IO step.
            schedule = tf_utils.BertSchedule(opts, opts.dtype)
            lr = schedule(global_step)
        return lr

    def make_optimizer(lr, last_itr):
        with tf.variable_scope("training",
                               reuse=tf.AUTO_REUSE,
                               use_resource=True):
            optimizer_class, optimizer_kwargs = build_optimizer(
                opts.optimizer, opts.optimizer_arg)
            optimizer_class = optimizers.SparseOptimizer(optimizer_class)
            optimizer_class = global_step_update_opt.GlobalStepUpdateOptimizer(
                optimizer_class)
            if opts.loss_scale != 1:
                optimizer_class = scaling_opt.LossScalingOptimizer(
                    optimizer_class)
                optimizer_kwargs['loss_scale'] = opts.loss_scale
                optimizer_kwargs[
                    'unscale_grad_pre_acc'] = opts.unscale_grad_pre_acc
            if opts.grad_acculation_mode == 'Avg':
                optimizer_class = scaling_opt.GradScalingOptimizer(
                    optimizer_class)
                optimizer_kwargs[
                    'grad_scale'] = 1 / opts.gradient_accumulation_count
                optimizer_kwargs[
                    'scale_grad_pre_acc'] = opts.scale_grad_pre_acc
            if opts.grad_norm_clip:
                optimizer_class = grad_clip_opt.GradientClippingOptimizer(
                    optimizer_class)
                optimizer_kwargs['norm_clip_threshold'] = opts.grad_norm_clip
            if opts.slots_fp_type is not None and tf.as_dtype(
                    opts.slots_fp_type) != opts.dtype:
                optimizer_class = fp_slot_opt.SelectableSlotFPFormatOptimizer(
                    optimizer_class)
                optimizer_kwargs['slots_dtype'] = opts.slots_fp_type
                optimizer_kwargs[
                    'force_fp32_weight_update'] = opts.force_fp32_weight_update
            optimizer = optimizer_class(
                learning_rate=lr,
                **optimizer_kwargs,
                sparse_layers=transformer.sparse_layers.values(),
                dense_gradient_condition=enable_dense_grad and last_itr,
                prune_and_grow_outfeed=dense_queue)
        return optimizer

    def make_pipeline_opt(outputs):
        optimizer = make_optimizer(outputs["learning_rate"],
                                   outputs["last_itr"])
        return pipelining_ops.OptimizerFunctionOutput(optimizer,
                                                      outputs["training_loss"])

    def make_outfeed(lr, global_step, metrics, itr_counter):
        acc_op = metrics['acc_op']

        if is_training:
            with tf.control_dependencies([acc_op]):
                output_dict = {
                    **metrics, "learning_rate": lr,
                    "global_step": tf.cast(global_step, tf.int32),
                    "iteration_counter": itr_counter
                }
                output = outfeed.enqueue(output_dict)
        else:
            # At inference time stream back the loss and accuracy
            with tf.control_dependencies([acc_op]):
                output = outfeed.enqueue(metrics)
        return output

    # Sequence length
    S = transformer.source_sequence_length
    enable_dense_grad = opts.prune_ratio is not None and opts.prune_ratio > 0

    if not opts.pipeline:
        # This autoregressive model is self-labeling and needs only one input
        source = infeed
        last_itr = make_counter()
        source_mask = make_src_mask(last_itr, source)
        # Build the encoder
        logits = transformer.language_model(
            source=source,
            source_mask=source_mask,
            add_projection_layer=True,
            last_itr=last_itr,
            enable_dense_grad=enable_dense_grad,
            sparse_embeddings=opts.sparse_embeddings)
        metrics = loss_and_metrics(logits, source)
        if is_training:
            global_step = tf.cast(tf.train.get_or_create_global_step(),
                                  tf.int32)
            lr = make_lr_schedule(global_step)
            optimizer = make_optimizer(lr, last_itr)
            train_op = optimizer.minimize(metrics['training_loss'],
                                          global_step=global_step)
        else:
            lr, global_step = None, None
            train_op = tf.no_op()

        with tf.control_dependencies([train_op]):
            with tf.variable_scope("counter",
                                   reuse=tf.AUTO_REUSE,
                                   use_resource=True):
                itr_counter = tf.get_variable("iterations", [],
                                              tf.int32,
                                              trainable=False)
            output = make_outfeed(lr, global_step, metrics, itr_counter)
        return output
    else:

        def first_stage(global_step, source, input_stage_func):
            last_itr = make_counter()
            source_mask = make_src_mask(last_itr, source)
            return input_stage_func(source, source_mask, last_itr, global_step)

        def last_stage(encoder_out, source_mask, *args, **kwargs):
            last_itr = args[0]
            global_step = args[1]
            source = args[2]
            output_stage_func = kwargs['output_stage_func']
            logits, *_ = output_stage_func(encoder_out, source_mask, *args)
            metrics = loss_and_metrics(logits, source)
            if is_training:
                metrics.update({
                    "learning_rate": make_lr_schedule(global_step),
                    "last_itr": last_itr,
                    "global_step": tf.convert_to_tensor(global_step)
                })
                return metrics
            else:
                metrics['last_itr'] = last_itr
                return metrics

        stages, device_mapping, stage_options = transformer.language_model_stages(
            enable_dense_grad=enable_dense_grad,
            sparse_embeddings=opts.sparse_embeddings)
        stages[0] = partial(first_stage, input_stage_func=stages[0])
        stages[-1] = partial(last_stage, output_stage_func=stages[-1])

        pipeline_op = pipelining_ops.pipeline(
            computational_stages=stages,
            gradient_accumulation_count=opts.gradient_accumulation_count,
            gradient_accumulation_dtype=opts.gradient_accumulation_dtype,
            repeat_count=iterations_per_step,
            inputs=[tf.cast(tf.train.get_or_create_global_step(), tf.int32)],
            infeed_queue=infeed,
            outfeed_queue=outfeed,
            optimizer_function=make_pipeline_opt if is_training else None,
            device_mapping=device_mapping,
            offload_activations=opts.offload_activations,
            offload_gradient_accumulation_buffers=opts.offload_gradient_accumulation_buffers,
            offload_weight_update_variables=opts.offload_weight_update_variables,
            forward_propagation_stages_poplar_options=stage_options,
            backward_propagation_stages_poplar_options=stage_options,
            name="Pipeline")

        return pipeline_op
Code Example #12
 def my_net(x):
   return pipelining_ops.pipeline(
       [stage1, stage2],
       10,
       inputs=[x],
       pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)
Code Example #13
  def testPipelineInvalidDeviceMapping(self):
    dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
    dataset = dataset.batch(batch_size=2, drop_remainder=True)

    def dataset_parser(value):
      a = value
      b = (value + 10.) / 2.0
      return {"a": a, "b": b}

    dataset = dataset.map(dataset_parser)
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed3")
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed3")

    def stage1(c, **kwargs):
      with variable_scope.variable_scope("vs", use_resource=True):
        y = layers.Conv2D(2,
                          1,
                          use_bias=True,
                          kernel_initializer=init_ops.ones_initializer(),
                          name='conv1')(kwargs["a"])
        return y + kwargs["b"], c

    def stage2(x, c):
      return math_ops.reduce_sum(x) + c

    def stage3(x):
      return x

    with ops.device('cpu'):
      c = array_ops.placeholder(np.float32, shape=[])

    # Wrong type:
    with self.assertRaisesRegex(
        TypeError, 'device_mapping argument needs to be a list or a tuple'):
      pipelining_ops.pipeline(
          [stage1, stage2, stage3],
          3,
          inputs=[c],
          infeed_queue=infeed_queue,
          outfeed_queue=outfeed_queue,
          device_mapping=1,
          pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

    # Too many values:
    with self.assertRaisesRegex(ValueError,
                                'Each stage must be mapped to an IPU'):
      pipelining_ops.pipeline(
          [stage1, stage2, stage3],
          3,
          inputs=[c],
          infeed_queue=infeed_queue,
          outfeed_queue=outfeed_queue,
          device_mapping=list(range(4)),
          pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

    # Not enough values:
    with self.assertRaisesRegex(ValueError,
                                'Each stage must be mapped to an IPU'):
      pipelining_ops.pipeline(
          [stage1, stage2, stage3],
          3,
          inputs=[c],
          infeed_queue=infeed_queue,
          outfeed_queue=outfeed_queue,
          device_mapping=tuple(range(1)),
          pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)
Code Example #14
    def testPipelineInvalidDeviceMapping(self):
        dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
        dataset = dataset.batch(batch_size=2, drop_remainder=True)

        def dataset_parser(value):
            a = value
            b = (value + 10.) / 2.0
            return {"a": a, "b": b}

        dataset = dataset.map(dataset_parser)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed3")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed3")

        def stage1(c, **kwargs):
            with variable_scope.variable_scope("vs", use_resource=True):
                y = layers.Conv2D(
                    2,
                    1,
                    use_bias=True,
                    kernel_initializer=init_ops.ones_initializer(),
                    name='conv1')(kwargs["a"])
                return y + kwargs["b"], c

        def stage2(x, c):
            return math_ops.reduce_sum(x) + c

        def stage3(x):
            return x

        with ops.device('cpu'):
            c = array_ops.placeholder(np.float32, shape=[])

        # Multiple IPUs in the device mapping with batch serialization:
        with self.assertRaisesRegex(
                NotImplementedError,
                'When using batch serialization, all the pipeline '
                'stages need to be mapped to a single IPU.'):
            pipelining_ops.pipeline(
                [stage1, stage2, stage3],
                3,
                inputs=[c],
                infeed_queue=infeed_queue,
                outfeed_queue=outfeed_queue,
                device_mapping=[0, 1, 0],
                pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential,
                batch_serialization_iterations=4)

        # Non-Sequential schedule with batch serialization:
        with self.assertRaisesRegex(
                NotImplementedError,
                'Batch serialization is only supported with the '
                '`Sequential` schedule'):
            pipelining_ops.pipeline(
                [stage1, stage2, stage3],
                3,
                inputs=[c],
                infeed_queue=infeed_queue,
                outfeed_queue=outfeed_queue,
                device_mapping=[0, 0, 0],
                pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped,
                batch_serialization_iterations=4)
Code Example #15
 def my_net(x):
     return pipelining_ops.pipeline(
         [stage1],
         10,
         inputs=[x],
         pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)