Ejemplo n.º 1
0
def training_loop(n_steps=50, cutoff=0.05, output_dir="./model/"):
    """Build a training Loop for a ReformerLM language model.

    Args:
        n_steps: Steps between checkpoints.
        cutoff: Data-split cutoff forwarded to generate_data.
        output_dir: Directory where checkpoints/logs are written.

    Returns:
        trax.supervised.training.Loop ready to run.
    """
    # Data streams and vocabulary size come from the shared helper.
    train_stream, eval_stream, vocab_size = generate_data(cutoff)

    # Linear warmup followed by reciprocal-sqrt decay, capped at 0.01.
    schedule = trax.lr.warmup_and_rsqrt_decay(n_warmup_steps=1000,
                                              max_value=0.01)

    train_task = training.TrainTask(labeled_data=train_stream,
                                    loss_layer=tl.CrossEntropyLoss(),
                                    optimizer=trax.optimizers.Adam(0.01),
                                    lr_schedule=schedule,
                                    n_steps_per_checkpoint=n_steps)

    eval_task = training.EvalTask(labeled_data=eval_stream,
                                  metrics=[tl.CrossEntropyLoss(),
                                           tl.Accuracy()])

    return training.Loop(ReformerLM(vocab_size, 6, mode='train'),
                         train_task,
                         eval_tasks=[eval_task],
                         output_dir=output_dir)
Ejemplo n.º 2
0
    def test_train_mnist(self):
        """Train MNIST model (almost) fully, to compare to other implementations.

        Evals for cross-entropy loss and accuracy are run every 50 steps;
        their values are visible in the test log.
        """
        # Two-hidden-layer MLP classifier over flattened images.
        model = tl.Serial(
            tl.Flatten(),
            tl.Dense(512), tl.Relu(),
            tl.Dense(512), tl.Relu(),
            tl.Dense(10), tl.LogSoftmax(),
        )
        train_task = training.TrainTask(
            itertools.cycle(_mnist_dataset().train_stream(1)),
            tl.CrossEntropyLoss(),
            adafactor.Adafactor(.02))
        evaluation = training.EvalTask(
            itertools.cycle(_mnist_dataset().eval_stream(1)),
            [tl.CrossEntropyLoss(), tl.Accuracy()],
            n_eval_batches=10)

        loop = training.Loop(
            model, [train_task],
            eval_tasks=[evaluation],
            eval_at=lambda step: step % 50 == 0)

        # 1000 steps should leave the loop exactly at step 1000.
        loop.run(n_steps=1000)
        self.assertEqual(loop.step, 1000)
Ejemplo n.º 3
0
    def test_train_mnist(self):
        """Train MNIST model (almost) fully, to compare to other implementations.

        Evals for cross-entropy loss and accuracy are run every 50 steps;
        their values are visible in the test log.
        """
        # Configure batch sizes through gin before building the pipeline.
        gin.parse_config([
            'batch_fn.batch_size_per_device = 256',
            'batch_fn.eval_batch_size = 256',
        ])

        # Two-hidden-layer MLP classifier over flattened images.
        model = tl.Serial(
            tl.Flatten(),
            tl.Dense(512), tl.Relu(),
            tl.Dense(512), tl.Relu(),
            tl.Dense(10), tl.LogSoftmax(),
        )
        train_task = training.TrainTask(
            itertools.cycle(_mnist_dataset().train_stream(1)),
            tl.CrossEntropyLoss(),
            adafactor.Adafactor(.02))
        # NOTE: uses an older trax API (names=/eval_at=/eval_N on EvalTask).
        evaluation = training.EvalTask(
            itertools.cycle(_mnist_dataset().eval_stream(1)),
            [tl.CrossEntropyLoss(), tl.AccuracyScalar()],
            names=['CrossEntropyLoss', 'AccuracyScalar'],
            eval_at=lambda step: step % 50 == 0,
            eval_N=10)

        loop = training.Loop(model, train_task, eval_task=evaluation)
        loop.run(n_steps=1000)
        self.assertEqual(loop.current_step(), 1000)
Ejemplo n.º 4
0
def training_loop(TransformerLM, train_gen, eval_gen, output_dir="./model"):
    """Create a training Loop for a Transformer language model.

    Args:
        TransformerLM: Model constructor (trax layer factory).
        train_gen: Generator of labeled training batches.
        eval_gen: Generator of labeled evaluation batches.
        output_dir: Directory where checkpoints are written.

    Returns:
        trax.supervised.training.Loop for the model.
    """
    # Resolve '~' so trax receives an absolute checkpoint path.
    output_dir = os.path.expanduser(output_dir)

    # 1000 warmup steps, then reciprocal-sqrt decay peaking at 0.01.
    schedule = trax.lr.warmup_and_rsqrt_decay(n_warmup_steps=1000,
                                              max_value=0.01)

    # Cross-entropy loss fitted with Adam, checkpointing every 10 steps.
    train_task = training.TrainTask(labeled_data=train_gen,
                                    loss_layer=tl.CrossEntropyLoss(),
                                    optimizer=trax.optimizers.Adam(0.01),
                                    lr_schedule=schedule,
                                    n_steps_per_checkpoint=10)

    # Held-out data guards against overfitting.
    eval_task = training.EvalTask(labeled_data=eval_gen,
                                  metrics=[tl.CrossEntropyLoss(),
                                           tl.Accuracy()])

    model = TransformerLM(d_model=512,
                          d_ff=2048,
                          n_layers=6,
                          n_heads=8,
                          mode='train')
    return training.Loop(model,
                         train_task,
                         eval_tasks=[eval_task],
                         output_dir=output_dir)
Ejemplo n.º 5
0
def _mnist_tasks():
    """Return a (train_task, eval_task) pair for MNIST.

    Both tasks draw endlessly from the shared MNIST dataset streams;
    training uses cross-entropy with Adam, evaluation adds accuracy.
    """
    train_task = training.TrainTask(
        itertools.cycle(_mnist_dataset().train_stream(1)),
        tl.CrossEntropyLoss(),
        adam.Adam(0.001),
    )
    eval_task = training.EvalTask(
        itertools.cycle(_mnist_dataset().eval_stream(1)),
        (tl.CrossEntropyLoss(), tl.Accuracy()),
        n_eval_batches=10,
        metric_names=('CrossEntropy', 'Accuracy'),
    )
    return train_task, eval_task
Ejemplo n.º 6
0
 def test_mnist(self) -> None:
     """Smoke-train MNIST through TraxTrainer and verify the final step count."""
     trainer = TraxTrainer()
     trainer.load_data('mnist', tfds_dir=TestMnist.tfds_dir)
     trainer.load_model(get_model, False, num_classes=10)
     # Emit metrics every 50 steps; no callbacks, no extra save directory.
     session = trainer.train(
         epochs=self.epochs,
         model_dir=TestMnist.model_dir,
         metric_emit_freq=lambda step: step % 50 == 0,
         metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
         loss=tl.CrossEntropyLoss(),
         optimizer=adafactor.Adafactor(.02),
         callbacks=None,
         save_directory=None)
     self.assertEqual(session.current_step, self.epochs)
Ejemplo n.º 7
0
    def test_reset_twice(self, backend):
        """Resetting the Trainer to two different output dirs keeps eval working."""
        # The tf-numpy backend cannot drive more than one device.
        if xla_bridge.device_count() > 1 and backend == fastmath.Backend.TFNP:
            self.skipTest(
                "tf-numpy backend doesn't support multi-devices yet.")
        with fastmath.use_backend(backend):
            n_classes = 4
            mlp_fn = functools.partial(models.MLP,
                                       d_hidden=16,
                                       n_output_classes=n_classes)
            trainer = trainer_lib.Trainer(
                model=mlp_fn,
                loss_fn=tl.CrossEntropyLoss(),
                optimizer=trax_opt.SM3,
                lr_schedule=lr.multifactor(),
                inputs=_test_inputs(n_classes),
            )

            # Reset followed by evaluate must succeed in each directory.
            for dir_name in ('output_dir1', 'output_dir2'):
                out_dir = self.create_tempdir(name=dir_name).full_path
                trainer.reset(out_dir)
                trainer.evaluate(1)
Ejemplo n.º 8
0
  def test_no_int32_or_uint32_returned(self):
    """Tests that Trainer._jit_update_fn doesn't return int32 or uint32.

    TF pins int32/uint32 tensors to CPU, which will cause XLA-forced-compiled
    computation to copy int32/uint32 outputs to CPU. This test makes sure that
    won't happen.
    """
    # The tf-numpy backend used below cannot drive more than one device.
    if xla_bridge.device_count() > 1:
      self.skipTest("tf-numpy backend doesn't support multi-devices yet.")
    with fastmath.use_backend(fastmath.Backend.TFNP), \
          self.tmp_dir() as output_dir:
      n_classes = 1001
      model_fn = functools.partial(models.Resnet50,
                                   n_output_classes=n_classes)
      # ImageNet-shaped dummy inputs; see _test_inputs for details.
      inputs = _test_inputs(n_classes, input_shape=(224, 224, 3))
      trainer = trainer_lib.Trainer(
          model=model_fn,
          loss_fn=tl.CrossEntropyLoss(),
          optimizer=trax_opt.SM3,
          lr_schedule=lr.multifactor(),
          inputs=inputs,
      )
      trainer.reset(output_dir)
      # One single-step epoch is enough to exercise _jit_update_fn once.
      trainer.train_epoch(1, 0)
      # Those are the things returned by Trainer._jit_update_fn
      arrays = (trainer._opt_state.weights, trainer._opt_state.slots,
                trainer._model_state, trainer._rngs)
      arrays = tf.nest.flatten(arrays)
      for x in arrays:
        if isinstance(x, jnp.ndarray) and (x.dtype == jnp.int32 or
                                           x.dtype == jnp.uint32):
          raise ValueError('Found an array of int32 or uint32: %s' % x)
Ejemplo n.º 9
0
def _mnist_tasks(head=None):
    """Creates MNIST training and evaluation tasks.

  Args:
    head: Adaptor layer to put before loss and accuracy layers in the tasks.

  Returns:
    A pair (train_task, eval_task) consisting of the MNIST training task and the
    MNIST evaluation task using cross-entropy as loss and accuracy as metric.
  """
    # Optionally prefix both metrics with the adaptor layer.
    if head is None:
        loss, accuracy = tl.CrossEntropyLoss(), tl.Accuracy()
    else:
        loss = tl.Serial(head, tl.CrossEntropyLoss())
        accuracy = tl.Serial(head, tl.Accuracy())
    train_task = training.TrainTask(
        itertools.cycle(_mnist_dataset().train_stream(1)),
        loss,
        adam.Adam(0.001),
    )
    eval_task = training.EvalTask(
        itertools.cycle(_mnist_dataset().eval_stream(1)),
        [loss, accuracy],
        n_eval_batches=10,
        metric_names=['CrossEntropy', 'Accuracy'],
    )
    return (train_task, eval_task)
Ejemplo n.º 10
0
def train_model(model, data_generator, batch_size=32, max_length=64, lines=lines, eval_lines=eval_lines, n_steps=1, output_dir='model/'):
    """Function that trains the model

    Args:
        model (trax.layers.combinators.Serial): GRU model.
        data_generator (function): Data generator function.
        batch_size (int, optional): Number of lines per batch. Defaults to 32.
        max_length (int, optional): Maximum length allowed for a line to be processed. Defaults to 64.
        lines (list, optional): List of lines to use for training. Defaults to lines.
        eval_lines (list, optional): List of lines to use for evaluation. Defaults to eval_lines.
        n_steps (int, optional): Number of steps to train. Defaults to 1.
        output_dir (str, optional): Relative path of directory to save model. Defaults to "model/".

    Returns:
        trax.supervised.training.Loop: Training loop for the model.
    """
    # Wrap both finite generators so they restart forever.
    train_stream = itertools.cycle(
        data_generator(batch_size=batch_size, max_length=max_length,
                       data_lines=lines))
    eval_stream = itertools.cycle(
        data_generator(batch_size=batch_size, max_length=max_length,
                       data_lines=eval_lines))

    # Cross-entropy loss fitted with Adam at a small learning rate.
    train_task = training.TrainTask(
        labeled_data=train_stream,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(learning_rate=0.0005),
    )

    eval_task = training.EvalTask(
        labeled_data=eval_stream,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=3,  # average a few batches for a steadier estimate
    )

    training_loop = training.Loop(model,
                                  train_task,
                                  eval_task=eval_task,
                                  output_dir=output_dir)
    training_loop.run(n_steps=n_steps)

    # The loop keeps a handle to the model (weights etc.), so return it.
    return training_loop
Ejemplo n.º 11
0
def set_model(model, train_stream, eval_stream, output_dir):
    """Wire up train/eval tasks for `model` and return a training Loop."""
    # Adam with 1000 warmup steps then reciprocal-sqrt decay (peak 0.01).
    schedule = trax.lr.warmup_and_rsqrt_decay(1000, .01)
    train_task = training.TrainTask(
        labeled_data=train_stream,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(.01),
        lr_schedule=schedule,
        n_steps_per_checkpoint=10)

    # Same loss plus accuracy on the held-out stream.
    eval_task = training.EvalTask(
        labeled_data=eval_stream,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()])

    return training.Loop(model,
                         train_task,
                         eval_tasks=[eval_task],
                         output_dir=output_dir)
def training_loop(ReformerLM, train_gen, eval_gen, output_dir="./model/"):
    """
    Args:
        ReformerLM:  the Reformer language model you are building
        train_gen (generator): train data generator.
        eval_gen (generator): Validation generator. 
        output_dir (string): Path to save the model output. Defaults to './model/'.

    Returns:
        trax.supervised.training.Loop: Training loop for the model.
    """
    # Warmup then reciprocal-sqrt decay, peaking at 0.01.
    schedule = trax.lr.warmup_and_rsqrt_decay(n_warmup_steps=1000,
                                              max_value=0.01)

    # Cross-entropy loss fitted with Adam, checkpointing every 10 steps.
    train_task = training.TrainTask(labeled_data=train_gen,
                                    loss_layer=tl.CrossEntropyLoss(),
                                    optimizer=trax.optimizers.Adam(0.01),
                                    lr_schedule=schedule,
                                    n_steps_per_checkpoint=10)

    eval_task = training.EvalTask(labeled_data=eval_gen,
                                  metrics=[tl.CrossEntropyLoss(),
                                           tl.Accuracy()])

    return training.Loop(ReformerLM(mode='train'),
                         train_task,
                         eval_tasks=[eval_task],
                         output_dir=output_dir)
Ejemplo n.º 13
0
 def test_cross_entropy_loss(self):
   # TODO(jonni): Clarify desired semantics/naming, then test it.
   # All-ones activations/targets/weights; only the output shape is checked.
   layer = tl.CrossEntropyLoss()
   batch = [np.ones((9, 4, 4, 20)),
            np.ones((9, 4, 4)),
            np.ones((9, 4, 4))]
   loss = layer(batch)
   # The loss reduces everything to a scalar.
   self.assertEqual(loss.shape, ())
Ejemplo n.º 14
0
    def test_run_reversible_same_as_default_extended(self):
        """Runs the reversible trainer, check results are the same as default.

        Trains a mixed (standard + reversible) stack for 3 steps with the
        plain Trainer, re-initializes with the same rng, repeats the same 3
        steps with ReversibleSerialTrainer, and checks all weights match.
        Fixed per-step rngs make the dropout masks identical across runs.
        """
        inputs_batch = np.arange(8).reshape((2, 4))
        targets_batch = 2 * inputs_batch
        labeled_batch = (inputs_batch, targets_batch,
                         np.ones_like(targets_batch))
        # We want to test rng propagation too, so adding some dropout layers.
        first_layer = tl.Serial(tl.Embedding(9, 4), tl.Dropout(0.5), tl.Dup())
        rev_layers1 = [
            tl.ReversibleHalfResidual(tl.Dense(4), tl.Dropout(0.2)),
            tl.ReversibleSwap(),
            tl.ReversibleHalfResidual(tl.Dropout(0.5), tl.Dense(4)),
            tl.ReversibleSwap()
        ]
        mid_layer = tl.Serial(tl.Add(), tl.Dense(4), tl.Dup())
        rev_layers2 = [
            tl.ReversibleHalfResidual(tl.Dense(4), tl.Dropout(0.3)),
            tl.ReversibleSwap()
        ]
        loss_layer = tl.Serial(tl.Concatenate(), tl.Dense(19), tl.Dropout(0.3),
                               tl.LogSoftmax(), tl.CrossEntropyLoss())
        model = tl.Serial([first_layer] + rev_layers1 + [mid_layer] +
                          rev_layers2 + [loss_layer])
        rng_init = fastmath.random.get_prng(12)
        model.init(labeled_batch, rng=rng_init)
        optimizer_fn = optimizers.Adam  # to test slots

        # Make 3 steps with the original trainer.
        optimizer = optimizer_fn()
        optimizer.tree_init(model.weights)
        trainer = optimizers.Trainer(model, optimizer)
        rng_step1 = fastmath.random.get_prng(7)
        rng_step2 = fastmath.random.get_prng(8)
        rng_step3 = fastmath.random.get_prng(9)
        trainer.one_step(labeled_batch, rng_step1)
        trainer.one_step(labeled_batch, rng_step2, learning_rate=0.02)
        trainer.one_step(labeled_batch, rng_step3, learning_rate=0.03)
        # Snapshot weights after the baseline run, before re-initialization.
        first_layer_weights1 = first_layer.weights
        rev_layer12_weights1 = rev_layers1[2].weights
        mid_layer_weights1 = mid_layer.weights
        rev_layer20_weights1 = rev_layers2[0].weights
        loss_layer_weights1 = loss_layer.weights

        # Now make 3 steps with reversible trainer.
        # Re-init with the same rng so both runs start from identical weights.
        model.init(labeled_batch, rng=rng_init)
        trainer = optimizers.ReversibleSerialTrainer(
            [(first_layer.sublayers, rev_layers1),
             (mid_layer.sublayers, rev_layers2)], loss_layer, optimizer_fn)
        trainer.one_step(labeled_batch, rng_step1)
        trainer.one_step(labeled_batch, rng_step2, learning_rate=0.02)
        trainer.one_step(labeled_batch, rng_step3, learning_rate=0.03)

        # Check that weights end up the same.
        self._assert_all_equal(loss_layer_weights1, loss_layer.weights)
        self._assert_all_equal(rev_layer20_weights1, rev_layers2[0].weights)
        self._assert_all_equal(mid_layer_weights1, mid_layer.weights)
        self._assert_all_equal(rev_layer12_weights1, rev_layers1[2].weights)
        self._assert_all_equal(first_layer_weights1, first_layer.weights)
Ejemplo n.º 15
0
def training_loop(TransformerLM,
                  train_gen,
                  eval_gen,
                  output_dir="~/model",
                  d_model=512,
                  d_ff=2048,
                  n_layers=6,
                  n_heads=8):
    """Create a training Loop for a Transformer language model.

    Input:
        TransformerLM (trax.layers.combinators.Serial): The model you are building.
        train_gen (generator): Training stream of data.
        eval_gen (generator): Evaluation stream of data.
        output_dir (str): folder to save your file.

    Returns:
        trax.supervised.training.Loop: Training loop.
    """
    output_dir = os.path.expanduser(output_dir)  # resolve '~' to the home dir
    # Linear warmup then reciprocal-sqrt decay, peaking at 0.01.
    lr_schedule = trax.lr.warmup_and_rsqrt_decay(n_warmup_steps=1000,
                                                 max_value=0.01)

    train_task = training.TrainTask(
        labeled_data=train_gen,
        loss_layer=tl.CrossEntropyLoss(),  # Loss function
        optimizer=trax.optimizers.Adam(
            0.01),  # Optimizer (Don't forget to set LR to 0.01)
        lr_schedule=lr_schedule,
        n_steps_per_checkpoint=10)

    eval_task = training.EvalTask(
        labeled_data=eval_gen,  # The evaluation generator
        metrics=[tl.CrossEntropyLoss(),
                 tl.Accuracy()]  # CrossEntropyLoss and Accuracy
    )

    loop = training.Loop(TransformerLM(d_model=d_model,
                                       d_ff=d_ff,
                                       n_layers=n_layers,
                                       n_heads=n_heads,
                                       mode='train'),
                         train_task,
                         eval_tasks=[eval_task],
                         output_dir=output_dir)
    return loop
Ejemplo n.º 16
0
    def test_train_memory_efficient(self):
        """Trains a large network in a memory-efficient way."""
        # This test requires > 16GB RAM, only run on TPUs. It does pass on GPU
        # and CPU when you run it locally, but it's too big for unit-testing.
        ram_limited = True  # Set to False to run this test locally.
        if fastmath.device_count() == 1 and ram_limited:
            return

        # Create the model.
        n_layers = 16  # 16 layers each 16K x 16K = 256M weights ~= 1GB, 16GB ram
        model = tl.Serial(
            tl.Embedding(9, 16 * 1024),
            tl.Dup(),
            [[
                tl.ReversibleHalfResidual(tl.Dense(16 * 1024)),
                tl.ReversibleSwap()
            ] for _ in range(n_layers)],
            tl.Concatenate(),
            tl.Dense(9),
        )

        # Create inputs.
        inputs_batch = np.arange(8).reshape((2, 4))
        targets_batch = inputs_batch
        labeled_batch = (inputs_batch, targets_batch,
                         np.ones_like(targets_batch))

        def _data_gen():
            # Infinite stream repeating the same fixed batch.
            while True:
                yield labeled_batch

        # Run training.
        cross_entropy_loss = tl.Serial(tl.LogSoftmax(), tl.CrossEntropyLoss())
        task = training.TrainTask(_data_gen(), cross_entropy_loss,
                                  optimizers.Adafactor)
        eval_task = training.EvalTask(_data_gen(), [tl.CrossEntropyLoss()])
        loop = training.Loop(model, [task],
                             eval_tasks=[eval_task],
                             eval_at=lambda step_n: step_n == 2,
                             use_memory_efficient_trainer=True)
        # Two steps should advance the loop's counter from 0 to 2.
        self.assertEqual(0, loop.step)
        loop.run(n_steps=2)
        self.assertEqual(2, loop.step)
Ejemplo n.º 17
0
    def test_run_reversible_same_as_default_terraformer(self):
        """Runs the reversible trainer, check results are the same as default.

        Trains a small ConfigurableTerraformer for 3 steps with the plain
        Trainer, re-initializes with the same rng, repeats the steps with
        ReversibleSerialTrainer, and checks that the weights agree.
        """
        inputs_batch = np.arange(8).reshape((2, 4)) + 1
        targets_batch = 2 * inputs_batch
        labeled_batch = (inputs_batch, targets_batch,
                         np.ones_like(targets_batch))
        int_sig = shapes.ShapeDtype((2, 4), dtype=np.int32)
        input_sig = (int_sig, int_sig, int_sig)
        # We want to test rng propagation too, so adding some dropout layers.
        model = terraformer.ConfigurableTerraformer(20,
                                                    d_model=8,
                                                    d_ff=32,
                                                    n_heads=1,
                                                    dropout=0.0,
                                                    n_encoder_layers=2,
                                                    n_decoder_layers=2,
                                                    ff_sparsity=(4, 8, 0.0,
                                                                 1.0),
                                                    pos_type=None,
                                                    reversible_encoder=True)
        loss = tl.Serial(tl.LogSoftmax(), tl.CrossEntropyLoss())
        optimizer_fn = optimizers.Adafactor
        # Split (model, loss) into reversible blocks plus the final loss layer.
        blocks, loss_layer = optimizers.trainer.extract_reversible_blocks(
            [model, loss], loss_chunk_size=4)
        blocks_serial = [(tl.Serial(std), rev) for (std, rev) in blocks]
        model_with_loss = tl.Serial(model, loss)
        rng_init = fastmath.random.get_prng(12)
        model_with_loss.init(input_sig, rng=rng_init)

        # Make 3 steps with the original trainer.
        optimizer = optimizer_fn()
        optimizer.tree_init(model_with_loss.weights)
        trainer = optimizers.Trainer(model_with_loss, optimizer)
        rng_step1 = fastmath.random.get_prng(7)
        rng_step2 = fastmath.random.get_prng(8)
        rng_step3 = fastmath.random.get_prng(9)
        trainer.one_step(labeled_batch, rng_step1)
        trainer.one_step(labeled_batch, rng_step2, learning_rate=0.02)
        trainer.one_step(labeled_batch, rng_step3, learning_rate=0.03)
        # Snapshot weights after the baseline run.
        first_weights = blocks_serial[0][0].weights
        first_rev_weights = blocks[0][1][0].weights
        loss_weights = loss_layer.weights

        # Now make 3 steps with reversible trainer.
        # Re-init with the same rng so both runs start identically.
        model_with_loss.init(input_sig, rng=rng_init)
        trainer = optimizers.ReversibleSerialTrainer(blocks, loss_layer,
                                                     optimizer_fn)
        trainer.one_step(labeled_batch, rng_step1)
        trainer.one_step(labeled_batch, rng_step2, learning_rate=0.02)
        trainer.one_step(labeled_batch, rng_step3, learning_rate=0.03)

        # Check that weights end up the same.
        self._assert_all_equal(loss_weights, loss_layer.weights)
        self._assert_all_equal(first_rev_weights, blocks[0][1][0].weights)
        self._assert_all_equal(first_weights, blocks_serial[0][0].weights)
Ejemplo n.º 18
0
 def test_names(self):
     """str() of each metric layer includes its name and input arity."""
     expected_names = [
         (tl.L2Loss(), 'L2Loss_in3'),
         (tl.Accuracy(), 'Accuracy_in3'),
         (tl.SequenceAccuracy(), 'SequenceAccuracy_in3'),
         (tl.CrossEntropyLoss(), 'CrossEntropyLoss_in3'),
         (tl.CrossEntropySum(), 'CrossEntropySum_in3'),
     ]
     for layer, expected in expected_names:
         self.assertEqual(expected, str(layer))
Ejemplo n.º 19
0
 def test_run_reversible_slots(self):
   """Tests that slots can be read and assigned in reversible trainer."""
   # Minimal (standard, reversible) layer group plus a scalar loss layer.
   layers = [tl.Dense(4), tl.Dup()]
   rev_layers = [tl.ReversibleHalfResidual(tl.Dense(4)),
                 tl.ReversibleSwap()]
   loss_layer = tl.Serial(tl.Concatenate(), tl.Dense(4),
                          tl.LogSoftmax(), tl.CrossEntropyLoss())
   trainer = optimizers.ReversibleSerialTrainer(
       [(layers, rev_layers)], loss_layer, optimizers.Adam)
   # Round-trip: reading slots and assigning them back must be lossless.
   slots = trainer.slots
   trainer.slots = slots
   self.assertEqual(slots, trainer.slots)
Ejemplo n.º 20
0
def train_model(NER,
                train_generator,
                eval_generator,
                train_steps=1,
                output_dir='model'):
    '''
    Input: 
        NER - the model you are building
        train_generator - The data generator for training examples
        eval_generator - The data generator for validation examples,
        train_steps - number of training steps
        output_dir - folder to save your model
    Output:
        training_loop - a trax supervised training Loop
    '''
    # Training: cross-entropy loss optimized with Adam (lr=0.01).
    train_task = training.TrainTask(
        train_generator,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(0.01),
    )

    # Evaluation: same loss plus accuracy, averaged over 10 batches.
    eval_task = training.EvalTask(
        labeled_data=eval_generator,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=10,
    )

    training_loop = training.Loop(
        NER,
        train_task,
        eval_task=eval_task,
        output_dir=output_dir)

    # Run the requested number of steps before handing the loop back.
    training_loop.run(n_steps=train_steps)
    return training_loop
Ejemplo n.º 21
0
 def loss(id_to_mask=None, has_weights=False):
   """Cross-entropy loss as scalar compatible with Trax masking.

   Args:
     id_to_mask: Token id to mask out in the loss, or None (forwarded to the
       loss layers; semantics defined by the layers themselves).
     has_weights: Whether the input stream carries explicit example weights
       (forwarded to the loss layers).

   Returns:
     A Serial layer combining obs cross-entropy and reward L2 loss, then
     multiplying the sum by 0.0 (deliberately zeroed for this test).
   """
   return layers.Serial(
       # Swap from (pred-obs, pred-reward, target-obs, target-reward)
       # to (pred-obs, target-obs, pred-reward, target-reward).
       layers.Parallel([], layers.Swap()),
       # Cross-entropy loss for obs, L2 loss on reward.
       layers.Parallel(layers.CrossEntropyLoss(id_to_mask, has_weights),
                       layers.L2Loss(id_to_mask, has_weights)),
       # Add both losses.
       layers.Add(),
       # Zero out in this test.
       layers.Fn(lambda x: x * 0.0),
   )
Ejemplo n.º 22
0
    def test_run_reversible_large_weights(self):
        """Runs the reversible trainer with a lot of weights to test memory use."""
        # This test requires > 18GB RAM, only run on TPUs. It does pass on GPU
        # and CPU when you run it locally, but it's too big for unit-testing.
        ram_limited = True  # Set to False to run this test locally.
        if fastmath.global_device_count() == 1 and ram_limited:
            return

        # Create inputs and rngs.
        inputs_batch = np.arange(8).reshape((2, 4))
        targets_batch = inputs_batch
        labeled_batch = (inputs_batch, targets_batch,
                         np.ones_like(targets_batch))
        first_layer = tl.Serial(tl.Embedding(9, 16 * 1024), tl.Dup())
        rng_init = fastmath.random.get_prng(12)
        rng_step = fastmath.random.get_prng(13)

        # Initialize layers.
        first_layer.init(labeled_batch, rng=rng_init)
        n_layers = 18  # 18 layers each 16K x 16K = 256M weights ~= 1GB, 18GB ram
        rev_layers = []
        int_shape = shapes.ShapeDtype((2, 4), dtype=np.int32)
        shape = shapes.ShapeDtype((2, 4, 16 * 1024))
        sig = (shape, shape)
        for _ in range(n_layers):
            layer = tl.ReversibleHalfResidual(tl.Dense(16 * 1024))
            layer.init(sig, rng=rng_init)
            layer.weights = tl.on_cpu(
                layer.weights)  # store weights in cpu memory
            rev_layers.append(layer)
            rev_layers.append(tl.ReversibleSwap())
        loss_layer = tl.Serial(tl.Concatenate(), tl.Dense(9), tl.LogSoftmax(),
                               tl.CrossEntropyLoss())
        loss_layer.init((shape, shape, int_shape, int_shape))
        optimizer_fn = optimizers.Adafactor

        # Make a step with reversible trainer.
        trainer = optimizers.ReversibleSerialTrainer(
            [(first_layer, rev_layers)], loss_layer, optimizer_fn)
        loss, _ = trainer.one_step(labeled_batch, rng_step)
        self.assertLess(float(loss.sum()), 10000.0)  # Just to get the loss.
        # Set to true to run again, e.g., for profiling.
        run_twice = False
        if run_twice:
            t = time.time()
            loss, _ = trainer.one_step(labeled_batch, rng_step)
            self.assertLess(float(loss.sum()),
                            10000.0)  # Just to get the loss.
            print('Took %.3f seconds to run, loss %s' %
                  (time.time() - t, loss))
Ejemplo n.º 23
0
 def loss():
     """Cross-entropy loss as scalar compatible with Trax masking.

     Builds a Serial layer that adds all-ones weights for both targets,
     combines obs cross-entropy with reward L2 loss, and multiplies the
     sum by 0.0 (deliberately zeroed for this test).
     """
     ones = layers.Fn(lambda x: math.numpy.ones_like(x))  # pylint: disable=unnecessary-lambda
     return layers.Serial(
         # Swap from (pred-obs, pred-reward, target-obs, target-reward)
         # to (pred-obs, target-obs, pred-reward, target-reward).
         layers.Parallel([], layers.Swap()),
         # Duplicate target-obs and target-reward and make 1 to add weights.
         layers.Parallel([], layers.Branch([], ones)),
         layers.Parallel([], [], [], [], layers.Branch([], ones)),
         # Cross-entropy loss for obs, L2 loss on reward.
         layers.Parallel(layers.CrossEntropyLoss(), layers.L2Loss()),
         # Add both losses.
         layers.Add(),
         # Zero out in this test.
         layers.Fn(lambda x: x * 0.0),
     )
Ejemplo n.º 24
0
    def test_run_reversible_weights_trainsfer_xprof(self):
        """Runs the reversible trainer and profiles weight transfer stats."""
        # NOTE(review): "trainsfer" in the name looks like a typo for
        # "transfer"; kept as-is since renaming would change the test id.
        run_this_test = False  # We only run this test manually.
        if not run_this_test or fastmath.global_device_count(
        ) == 1:  # TPU only
            return

        # Create inputs and rngs.
        inputs_batch = np.ones((1024, 128), dtype=np.int32)
        targets_batch = inputs_batch
        labeled_batch = (inputs_batch, targets_batch,
                         np.ones_like(targets_batch))
        first_layer = tl.Serial(tl.Embedding(4, 1024), tl.Dup())
        rng_init = fastmath.random.get_prng(12)
        rng_step = fastmath.random.get_prng(13)

        # Initialize layers.
        first_layer.init(labeled_batch, rng=rng_init)
        n_layers = 6
        rev_layers = []
        int_shape = shapes.ShapeDtype((1024, 128), dtype=np.int32)
        shape = shapes.ShapeDtype((1024, 128, 1024))
        sig = (shape, shape)
        for _ in range(n_layers):
            layer = tl.ReversibleHalfResidual(tl.Dense(1024))
            layer.init(sig, rng=rng_init)
            layer.weights = tl.on_cpu(
                layer.weights)  # store weights in cpu memory
            rev_layers.append(layer)
            rev_layers.append(tl.ReversibleSwap())
        loss_layer = tl.Serial(tl.Concatenate(), tl.Dense(9), tl.LogSoftmax(),
                               tl.CrossEntropyLoss())
        loss_layer.init((shape, shape, int_shape, int_shape))
        optimizer_fn = optimizers.SGD

        # Make a step with reversible trainer.
        trainer = optimizers.ReversibleSerialTrainer(
            [(first_layer, rev_layers)], loss_layer, optimizer_fn)
        loss, _ = trainer.one_step(labeled_batch, rng_step)
        self.assertLess(float(loss.sum()), 10000.0)  # Just to get the loss.
        # We profile here.
        # The second step is timed: first step included compilation.
        t = time.time()
        loss, _ = trainer.one_step(labeled_batch, rng_step)
        self.assertLess(float(loss.sum()), 10000.0)  # Just to get the loss.
        print('Took %.3f seconds to run, loss %s' % (time.time() - t, loss))
Ejemplo n.º 25
0
 def test_call_and_grad(self):
     """Favor attention runs forward and supports gradient computation."""
     # Embedding + padding mask feed a 2-head Favor attention layer; the
     # LogSoftmax/CrossEntropyLoss tail reduces everything to a scalar.
     layer = tl.Serial(tl.Branch(tl.Embedding(3, 4), tl.PaddingMask()),
                       sparsity.Favor(d_feature=4, n_heads=2),
                       tl.Select([0], n_in=2), tl.LogSoftmax(),
                       tl.CrossEntropyLoss())
     tokens = np.ones((1, 2), dtype=np.int32)
     loss_weights = np.ones_like(tokens).astype(np.float32)
     tokens_sig = shapes.signature(tokens)
     layer.init((tokens_sig, tokens_sig, shapes.signature(loss_weights)))

     # Forward pass yields a scalar loss.
     loss = layer((tokens, tokens, loss_weights))
     self.assertEqual(loss.shape, ())

     # The gradient w.r.t. the embedding table must have its (3, 4) shape.
     state = layer.state
     rng = fastmath.random.get_prng(0)

     def fwd(weights, inp):
         return layer.pure_fn(inp, weights, state, rng=rng)[0]

     grads = fastmath.grad(fwd)(layer.weights, (tokens, tokens, loss_weights))
     self.assertEqual(grads[0][1][0].shape, (3, 4))
Ejemplo n.º 26
0
 def test_names(self):
     """str() of each metric/loss layer shows its name and input arity.

     Classifier layers take a single input; the loss/metric layers consume
     (prediction, target, weights), hence the '_in3' suffix.
     """
     cases = [
         (tl.L2Loss, 'L2Loss_in3'),
         (tl.BinaryClassifier, 'BinaryClassifier'),
         (tl.MulticlassClassifier, 'MulticlassClassifier'),
         (tl.Accuracy, 'Accuracy_in3'),
         (tl.SequenceAccuracy, 'SequenceAccuracy_in3'),
         (tl.BinaryCrossEntropyLoss, 'BinaryCrossEntropyLoss_in3'),
         (tl.CrossEntropyLoss, 'CrossEntropyLoss_in3'),
         (tl.BinaryCrossEntropySum, 'BinaryCrossEntropySum_in3'),
         (tl.CrossEntropySum, 'CrossEntropySum_in3'),
     ]
     for make_layer, expected in cases:
         self.assertEqual(expected, str(make_layer()))
Ejemplo n.º 27
0
    def test_run_sharded_terraformer(self):
        """Runs Terraformer with sharded weights (only on 2+-device systems)."""
        if fastmath.local_device_count() == 1:
            return
        # Shard weights across all local devices. Restored in `finally` so a
        # failing step cannot leak sharding state into subsequent tests.
        base.N_WEIGHTS_SHARDS = fastmath.local_device_count()
        try:
            inputs_batch = np.arange(8).reshape((2, 4)) + 1
            targets_batch = 2 * inputs_batch
            labeled_batch = (inputs_batch, targets_batch,
                             np.ones_like(targets_batch))
            int_sig = shapes.ShapeDtype((2, 4), dtype=np.int32)
            input_sig = (int_sig, int_sig, int_sig)
            # NOTE(review): dropout=0.0, so despite the original comment this
            # does not actually exercise dropout-rng propagation — confirm
            # whether a nonzero rate was intended.
            model = terraformer.ConfigurableTerraformer(
                20,
                d_model=8,
                d_ff=32,
                n_heads=1,
                dropout=0.0,
                n_encoder_layers=2,
                n_decoder_layers=2,
                ff_sparsity=(4, 8, 0.0, 1.0),
                encoder_attention_type=tl.Attention,
                encoder_decoder_attention_type=tl.CausalAttention,
                pos_type=None,
                reversible_encoder=True)
            loss = tl.Serial(tl.LogSoftmax(), tl.CrossEntropyLoss())
            model_with_loss = tl.Serial(model, loss)
            rng_init = fastmath.random.get_prng(12)
            model_with_loss.init(input_sig, rng=rng_init)

            # Make a step with the trainer. The optimizer is initialized from
            # one shard's slice of the weights.
            optimizer = optimizers.Adafactor(0.01)
            split_w = fastmath.nested_map(
                lambda x: x[0],
                tl.shard(model_with_loss.weights, base.N_WEIGHTS_SHARDS))
            optimizer.tree_init(split_w)
            trainer = optimizers.Trainer(model_with_loss, optimizer)
            rng_step1 = fastmath.random.get_prng(7)
            trainer.one_step(labeled_batch, rng_step1)
        finally:
            # Reset shards back to default even if the step above raised.
            base.N_WEIGHTS_SHARDS = 1
Ejemplo n.º 28
0
    def test_reset_twice(self, backend):
        """Trainer survives two reset() calls to fresh output directories."""
        with fastmath.use_backend(backend):
            n_classes = 4
            make_model = functools.partial(models.MLP,
                                           layer_widths=(16, 16, n_classes))

            trainer = trainer_lib.Trainer(
                model=make_model,
                loss_fn=tl.Serial(tl.LogSoftmax(), tl.CrossEntropyLoss()),
                optimizer=trax_opt.SM3,
                lr_schedule=lr.multifactor(),
                inputs=_test_inputs(n_classes),
            )

            # Reset + evaluate twice; the second reset must not be broken by
            # state left behind by the first.
            for dir_name in ('output_dir1', 'output_dir2'):
                out_dir = self.create_tempdir(name=dir_name).full_path
                trainer.reset(out_dir)
                trainer.evaluate(1)
Ejemplo n.º 29
0
  def test_reset_twice(self, backend_name):
    """Resetting the trainer to a second output dir keeps evaluation working."""
    if xla_bridge.device_count() > 1 and backend_name == 'tf':
      self.skipTest("tf-numpy backend doesn't support multi-devices yet.")
    with math.use_backend(backend_name), self.tmp_dir() as output_dir1, \
          self.tmp_dir() as output_dir2:
      n_classes = 4
      mlp_fn = functools.partial(
          models.MLP, d_hidden=16, n_output_classes=n_classes)

      trainer = trainer_lib.Trainer(
          model=mlp_fn,
          loss_fn=layers.CrossEntropyLoss(),
          optimizer=trax_opt.SM3,
          lr_schedule=lr.MultifactorSchedule,
          inputs=test_inputs(n_classes),
      )

      # First reset/evaluate, then reset to a fresh dir and evaluate again.
      for out_dir in (output_dir1, output_dir2):
        trainer.reset(out_dir)
        trainer.evaluate(1)
Ejemplo n.º 30
0
  def test_run_reversible_large_weights(self):
    """Runs the reversible trainer with a lot of weights to test memory use."""
    # This test requires > 20GB RAM, only run on TPUs. It does pass on GPU
    # and CPU when you run it locally, but it's too big for unit-testing.
    ram_limited = True  # Set to False to run this test locally.
    if fastmath.device_count() == 1 and ram_limited:
      return

    # Create inputs and rngs.
    inputs_batch = np.arange(8).reshape((2, 4))
    targets_batch = inputs_batch
    labeled_batch = (inputs_batch, targets_batch, np.ones_like(targets_batch))
    first_layer = tl.Serial(tl.Embedding(9, 16*1024), tl.Dup())
    rng_init = fastmath.random.get_prng(12)
    rng_step = fastmath.random.get_prng(13)

    # Initialize layers.
    first_layer.init(labeled_batch, rng=rng_init)
    n_layers = 20  # 20 layers each 16K x 16K = 256M weights ~= 1GB, 20GB ram
    rev_layers = []
    int_shape = shapes.ShapeDtype((2, 4), dtype=np.int32)
    shape = shapes.ShapeDtype((2, 4, 16*1024))
    sig = (shape, shape)
    for _ in range(n_layers):
      layer = tl.ReversibleHalfResidual(tl.Dense(16*1024))
      layer.init(sig, rng=rng_init)
      layer.weights = tl.on_cpu(layer.weights)  # store weights in cpu memory
      rev_layers.append(layer)
      rev_layers.append(tl.ReversibleSwap())
    loss_layer = tl.Serial(tl.Concatenate(), tl.Dense(9),
                           tl.LogSoftmax(), tl.CrossEntropyLoss())
    loss_layer.init((shape, shape, int_shape, int_shape))
    optimizer_fn = optimizers.Adafactor

    # Make a step with the reversible trainer. ReversibleSerialTrainer takes
    # a list of (standard_layer, reversible_layers) blocks as its first
    # argument; passing the pieces as four positional arguments is the old
    # signature and fails against the current API.
    trainer = optimizers.ReversibleSerialTrainer(
        [(first_layer, rev_layers)], loss_layer, optimizer_fn)
    trainer.one_step(labeled_batch, rng_step)