def training_loop(n_steps=50, cutoff=0.05, output_dir="./model/"):
    """Builds a training Loop for a ReformerLM on generated data.

    Args:
        n_steps: Number of steps between checkpoints.
        cutoff: Cutoff value passed to `generate_data`.
        output_dir: Directory where checkpoints are written.

    Returns:
        trax.supervised.training.Loop: Training loop for the model.
    """
    train_gen, eval_gen, vocab_size = generate_data(cutoff)
    # Warm up for 1000 steps, then decay proportionally to 1/sqrt(step).
    schedule = trax.lr.warmup_and_rsqrt_decay(n_warmup_steps=1000,
                                              max_value=0.01)

    train_task = training.TrainTask(
        labeled_data=train_gen,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(0.01),
        lr_schedule=schedule,
        n_steps_per_checkpoint=n_steps,
    )
    eval_task = training.EvalTask(
        labeled_data=eval_gen,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
    )

    return training.Loop(
        ReformerLM(vocab_size, 6, mode='train'),
        train_task,
        eval_tasks=[eval_task],
        output_dir=output_dir,
    )
def test_train_mnist(self):
    """Train MNIST model (almost) fully, to compare to other implementations.

    Evals for cross-entropy loss and accuracy are run every 50 steps; their
    values are visible in the test log.
    """
    model = tl.Serial(
        tl.Flatten(),
        tl.Dense(512),
        tl.Relu(),
        tl.Dense(512),
        tl.Relu(),
        tl.Dense(10),
        tl.LogSoftmax(),
    )
    train_task = training.TrainTask(
        itertools.cycle(_mnist_dataset().train_stream(1)),
        tl.CrossEntropyLoss(),
        adafactor.Adafactor(.02),
    )
    eval_task = training.EvalTask(
        itertools.cycle(_mnist_dataset().eval_stream(1)),
        [tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=10,
    )
    loop = training.Loop(
        model,
        [train_task],
        eval_tasks=[eval_task],
        eval_at=lambda step_n: step_n % 50 == 0,
    )
    loop.run(n_steps=1000)
    self.assertEqual(loop.step, 1000)
def test_train_mnist(self):
    """Train MNIST model (almost) fully, to compare to other implementations.

    Evals for cross-entropy loss and accuracy are run every 50 steps; their
    values are visible in the test log.
    """
    # Configure batch sizes through gin rather than function arguments.
    gin.parse_config([
        'batch_fn.batch_size_per_device = 256',
        'batch_fn.eval_batch_size = 256',
    ])
    model = tl.Serial(
        tl.Flatten(),
        tl.Dense(512),
        tl.Relu(),
        tl.Dense(512),
        tl.Relu(),
        tl.Dense(10),
        tl.LogSoftmax(),
    )
    train_task = training.TrainTask(
        itertools.cycle(_mnist_dataset().train_stream(1)),
        tl.CrossEntropyLoss(),
        adafactor.Adafactor(.02),
    )
    eval_task = training.EvalTask(
        itertools.cycle(_mnist_dataset().eval_stream(1)),
        [tl.CrossEntropyLoss(), tl.AccuracyScalar()],
        names=['CrossEntropyLoss', 'AccuracyScalar'],
        eval_at=lambda step_n: step_n % 50 == 0,
        eval_N=10,
    )
    session = training.Loop(model, train_task, eval_task=eval_task)
    session.run(n_steps=1000)
    self.assertEqual(session.current_step(), 1000)
def training_loop(TransformerLM, train_gen, eval_gen, output_dir="./model"):
    """Builds a training Loop for a TransformerLM language model.

    Args:
        TransformerLM: Model constructor to train.
        train_gen (generator): Training data stream.
        eval_gen (generator): Evaluation data stream.
        output_dir (str): Directory to save checkpoints in.

    Returns:
        trax.supervised.training.Loop: Training loop for the model.
    """
    output_dir = os.path.expanduser(output_dir)
    lr_schedule = trax.lr.warmup_and_rsqrt_decay(n_warmup_steps=1000,
                                                 max_value=0.01)

    # Loss function and Adam optimizer used to fit the data efficiently.
    train_task = training.TrainTask(
        labeled_data=train_gen,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(0.01),
        lr_schedule=lr_schedule,
        n_steps_per_checkpoint=10,
    )
    # Evaluate on a different dataset to ensure no overfitting.
    eval_task = training.EvalTask(
        labeled_data=eval_gen,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
    )

    model = TransformerLM(d_model=512, d_ff=2048, n_layers=6, n_heads=8,
                          mode='train')
    return training.Loop(model, train_task, eval_tasks=[eval_task],
                         output_dir=output_dir)
def _mnist_tasks():
    """Returns (train_task, eval_task) for MNIST using cross-entropy + Adam."""
    train_task = training.TrainTask(
        itertools.cycle(_mnist_dataset().train_stream(1)),
        tl.CrossEntropyLoss(),
        adam.Adam(0.001),
    )
    eval_task = training.EvalTask(
        itertools.cycle(_mnist_dataset().eval_stream(1)),
        (tl.CrossEntropyLoss(), tl.Accuracy()),
        n_eval_batches=10,
        metric_names=('CrossEntropy', 'Accuracy'),
    )
    return train_task, eval_task
def test_mnist(self) -> None:
    """End-to-end: load MNIST, train briefly, and verify the step counter."""
    trainer = TraxTrainer()
    trainer.load_data('mnist', tfds_dir=TestMnist.tfds_dir)
    trainer.load_model(get_model, False, num_classes=10)
    session = trainer.train(
        epochs=self.epochs,
        model_dir=TestMnist.model_dir,
        metric_emit_freq=lambda step_n: step_n % 50 == 0,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        loss=tl.CrossEntropyLoss(),
        optimizer=adafactor.Adafactor(.02),
        callbacks=None,
        save_directory=None,
    )
    self.assertEqual(session.current_step, self.epochs)
def test_reset_twice(self, backend):
    """A Trainer can be reset to a new output dir and evaluated again."""
    if xla_bridge.device_count() > 1 and backend == fastmath.Backend.TFNP:
        self.skipTest("tf-numpy backend doesn't support multi-devices yet.")
    with fastmath.use_backend(backend):
        n_classes = 4
        model_fn = functools.partial(
            models.MLP, d_hidden=16, n_output_classes=n_classes)
        trainer = trainer_lib.Trainer(
            model=model_fn,
            loss_fn=tl.CrossEntropyLoss(),
            optimizer=trax_opt.SM3,
            lr_schedule=lr.multifactor(),
            inputs=_test_inputs(n_classes),
        )
        # Reset into two fresh directories in turn; each must evaluate cleanly.
        for dir_name in ('output_dir1', 'output_dir2'):
            out_dir = self.create_tempdir(name=dir_name).full_path
            trainer.reset(out_dir)
            trainer.evaluate(1)
def test_no_int32_or_uint32_returned(self):
  """Tests that Trainer._jit_update_fn doesn't return int32 or uint32.

  TF pins int32/uint32 tensors to CPU, which will cause XLA-forced-compiled
  computation to copy int32/uint32 outputs to CPU. This test makes sure that
  won't happen.
  """
  if xla_bridge.device_count() > 1:
    self.skipTest("tf-numpy backend doesn't support multi-devices yet.")
  with fastmath.use_backend(fastmath.Backend.TFNP), \
       self.tmp_dir() as output_dir:
    n_classes = 1001
    model_fn = functools.partial(models.Resnet50,
                                 n_output_classes=n_classes)
    inputs = _test_inputs(n_classes, input_shape=(224, 224, 3))
    trainer = trainer_lib.Trainer(
        model=model_fn,
        loss_fn=tl.CrossEntropyLoss(),
        optimizer=trax_opt.SM3,
        lr_schedule=lr.multifactor(),
        inputs=inputs,
    )
    trainer.reset(output_dir)
    # One epoch of one step is enough to materialize the jitted update outputs.
    trainer.train_epoch(1, 0)
    # Those are the things returned by Trainer._jit_update_fn.
    arrays = (trainer._opt_state.weights, trainer._opt_state.slots,
              trainer._model_state, trainer._rngs)
    arrays = tf.nest.flatten(arrays)
    for x in arrays:
      if isinstance(x, jnp.ndarray) and (x.dtype == jnp.int32 or
                                         x.dtype == jnp.uint32):
        raise ValueError('Found an array of int32 or uint32: %s' % x)
def _mnist_tasks(head=None):
    """Creates MNIST training and evaluation tasks.

    Args:
        head: Adaptor layer to put before loss and accuracy layers in the
            tasks.

    Returns:
        A pair (train_task, eval_task) consisting of the MNIST training task
        and the MNIST evaluation task using cross-entropy as loss and accuracy
        as metric.
    """
    def _with_head(metric):
        # Prepend the adaptor layer when one was provided.
        return metric if head is None else tl.Serial(head, metric)

    loss = _with_head(tl.CrossEntropyLoss())
    accuracy = _with_head(tl.Accuracy())

    train_task = training.TrainTask(
        itertools.cycle(_mnist_dataset().train_stream(1)),
        loss,
        adam.Adam(0.001),
    )
    eval_task = training.EvalTask(
        itertools.cycle(_mnist_dataset().eval_stream(1)),
        [loss, accuracy],
        n_eval_batches=10,
        metric_names=['CrossEntropy', 'Accuracy'],
    )
    return (train_task, eval_task)
def train_model(model, data_generator, batch_size=32, max_length=64,
                lines=lines, eval_lines=eval_lines, n_steps=1,
                output_dir='model/'):
    """Trains the model.

    Args:
        model (trax.layers.combinators.Serial): GRU model.
        data_generator (function): Data generator function.
        batch_size (int, optional): Number of lines per batch. Defaults to 32.
        max_length (int, optional): Maximum length allowed for a line to be
            processed. Defaults to 64.
        lines (list, optional): List of lines to use for training.
        eval_lines (list, optional): List of lines to use for evaluation.
        n_steps (int, optional): Number of steps to train. Defaults to 1.
        output_dir (str, optional): Relative path of directory to save model.
            Defaults to "model/".

    Returns:
        trax.supervised.training.Loop: Training loop for the model.
    """
    # Wrap the finite generators so training can draw batches indefinitely.
    train_stream = itertools.cycle(
        data_generator(batch_size=batch_size, max_length=max_length,
                       data_lines=lines))
    eval_stream = itertools.cycle(
        data_generator(batch_size=batch_size, max_length=max_length,
                       data_lines=eval_lines))

    train_task = training.TrainTask(
        labeled_data=train_stream,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(learning_rate=0.0005),
    )
    eval_task = training.EvalTask(
        labeled_data=eval_stream,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=3,  # For better evaluation accuracy in reasonable time.
    )

    training_loop = training.Loop(model, train_task, eval_task=eval_task,
                                  output_dir=output_dir)
    training_loop.run(n_steps=n_steps)
    # The Loop keeps a handle to the model (weights etc.), so return it.
    return training_loop
def set_model(model, train_stream, eval_stream, output_dir):
    """Wires a model and its data streams into a trax training Loop.

    Args:
        model: The trax model to train.
        train_stream: Stream of labeled training batches.
        eval_stream: Stream of labeled evaluation batches.
        output_dir: Directory where checkpoints are written.

    Returns:
        trax.supervised.training.Loop: Training loop for the model.
    """
    train_task = training.TrainTask(
        labeled_data=train_stream,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(.01),
        lr_schedule=trax.lr.warmup_and_rsqrt_decay(1000, .01),
        n_steps_per_checkpoint=10,
    )
    eval_task = training.EvalTask(
        labeled_data=eval_stream,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
    )
    return training.Loop(model, train_task, eval_tasks=[eval_task],
                         output_dir=output_dir)
def training_loop(ReformerLM, train_gen, eval_gen, output_dir="./model/"):
    """Builds a training Loop for a ReformerLM.

    Args:
        ReformerLM: the Reformer language model you are building.
        train_gen (generator): train data generator.
        eval_gen (generator): Validation generator.
        output_dir (string): Path to save the model output.
            Defaults to './model/'.

    Returns:
        trax.supervised.training.Loop: Training loop for the model.
    """
    # Warm up for 1000 steps, then decay proportionally to 1/sqrt(step).
    lr_schedule = trax.lr.warmup_and_rsqrt_decay(n_warmup_steps=1000,
                                                 max_value=0.01)

    train_task = training.TrainTask(
        labeled_data=train_gen,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(0.01),
        lr_schedule=lr_schedule,
        n_steps_per_checkpoint=10,
    )
    eval_task = training.EvalTask(
        labeled_data=eval_gen,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
    )

    return training.Loop(ReformerLM(mode='train'), train_task,
                         eval_tasks=[eval_task], output_dir=output_dir)
def test_cross_entropy_loss(self):
    """CrossEntropyLoss over a (model_output, targets, weights) triple is scalar."""
    # TODO(jonni): Clarify desired semantics/naming, then test it.
    layer = tl.CrossEntropyLoss()
    activations = np.ones((9, 4, 4, 20))
    targets = np.ones((9, 4, 4))
    weights = np.ones((9, 4, 4))
    loss = layer([activations, targets, weights])
    self.assertEqual(loss.shape, ())
def test_run_reversible_same_as_default_extended(self):
  """Runs the reversible trainer, check results are the same as default."""
  inputs_batch = np.arange(8).reshape((2, 4))
  targets_batch = 2 * inputs_batch
  labeled_batch = (inputs_batch, targets_batch, np.ones_like(targets_batch))
  # We want to test rng propagation too, so adding some dropout layers.
  first_layer = tl.Serial(tl.Embedding(9, 4), tl.Dropout(0.5), tl.Dup())
  rev_layers1 = [
      tl.ReversibleHalfResidual(tl.Dense(4), tl.Dropout(0.2)),
      tl.ReversibleSwap(),
      tl.ReversibleHalfResidual(tl.Dropout(0.5), tl.Dense(4)),
      tl.ReversibleSwap()
  ]
  mid_layer = tl.Serial(tl.Add(), tl.Dense(4), tl.Dup())
  rev_layers2 = [
      tl.ReversibleHalfResidual(tl.Dense(4), tl.Dropout(0.3)),
      tl.ReversibleSwap()
  ]
  loss_layer = tl.Serial(tl.Concatenate(), tl.Dense(19), tl.Dropout(0.3),
                         tl.LogSoftmax(), tl.CrossEntropyLoss())
  model = tl.Serial([first_layer] + rev_layers1 + [mid_layer] + rev_layers2 +
                    [loss_layer])
  rng_init = fastmath.random.get_prng(12)
  model.init(labeled_batch, rng=rng_init)
  optimizer_fn = optimizers.Adam  # to test slots

  # Make 3 steps with the original trainer.
  optimizer = optimizer_fn()
  optimizer.tree_init(model.weights)
  trainer = optimizers.Trainer(model, optimizer)
  # Fixed step rngs so both trainers see identical randomness (dropout masks).
  rng_step1 = fastmath.random.get_prng(7)
  rng_step2 = fastmath.random.get_prng(8)
  rng_step3 = fastmath.random.get_prng(9)
  trainer.one_step(labeled_batch, rng_step1)
  trainer.one_step(labeled_batch, rng_step2, learning_rate=0.02)
  trainer.one_step(labeled_batch, rng_step3, learning_rate=0.03)
  # Snapshot weights after the default trainer's 3 steps for later comparison.
  first_layer_weights1 = first_layer.weights
  rev_layer12_weights1 = rev_layers1[2].weights
  mid_layer_weights1 = mid_layer.weights
  rev_layer20_weights1 = rev_layers2[0].weights
  loss_layer_weights1 = loss_layer.weights

  # Now make 3 steps with reversible trainer.
  # Re-init from the same rng so both runs start from identical weights.
  model.init(labeled_batch, rng=rng_init)
  trainer = optimizers.ReversibleSerialTrainer(
      [(first_layer.sublayers, rev_layers1),
       (mid_layer.sublayers, rev_layers2)],
      loss_layer, optimizer_fn)
  trainer.one_step(labeled_batch, rng_step1)
  trainer.one_step(labeled_batch, rng_step2, learning_rate=0.02)
  trainer.one_step(labeled_batch, rng_step3, learning_rate=0.03)

  # Check that weights end up the same.
  self._assert_all_equal(loss_layer_weights1, loss_layer.weights)
  self._assert_all_equal(rev_layer20_weights1, rev_layers2[0].weights)
  self._assert_all_equal(mid_layer_weights1, mid_layer.weights)
  self._assert_all_equal(rev_layer12_weights1, rev_layers1[2].weights)
  self._assert_all_equal(first_layer_weights1, first_layer.weights)
def training_loop(TransformerLM, train_gen, eval_gen, output_dir="~/model",
                  d_model=512, d_ff=2048, n_layers=6, n_heads=8):
    """Builds a training Loop for a TransformerLM.

    Args:
        TransformerLM (trax.layers.combinators.Serial): The model you are
            building.
        train_gen (generator): Training stream of data.
        eval_gen (generator): Evaluation stream of data.
        output_dir (str): folder to save your file.
        d_model (int): Model dimension.
        d_ff (int): Feed-forward dimension.
        n_layers (int): Number of layers.
        n_heads (int): Number of attention heads.

    Returns:
        trax.supervised.training.Loop: Training loop.
    """
    output_dir = os.path.expanduser(output_dir)
    lr_schedule = trax.lr.warmup_and_rsqrt_decay(n_warmup_steps=1000,
                                                 max_value=0.01)

    train_task = training.TrainTask(
        labeled_data=train_gen,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(0.01),
        lr_schedule=lr_schedule,
        n_steps_per_checkpoint=10,
    )
    eval_task = training.EvalTask(
        labeled_data=eval_gen,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
    )

    model = TransformerLM(d_model=d_model, d_ff=d_ff, n_layers=n_layers,
                          n_heads=n_heads, mode='train')
    return training.Loop(model, train_task, eval_tasks=[eval_task],
                         output_dir=output_dir)
def test_train_memory_efficient(self):
    """Trains a large network in a memory-efficient way."""
    # This test requires > 16GB RAM, only run on TPUs. It does pass on GPU
    # and CPU when you run it locally, but it's too big for unit-testing.
    ram_limited = True  # Set to False to run this test locally.
    if fastmath.device_count() == 1 and ram_limited:
        return

    # Model: embedding + 16 reversible blocks; each Dense is 16K x 16K
    # = 256M weights ~= 1GB, so ~16GB of weights in total.
    n_layers = 16
    model = tl.Serial(
        tl.Embedding(9, 16 * 1024),
        tl.Dup(),
        [[tl.ReversibleHalfResidual(tl.Dense(16 * 1024)), tl.ReversibleSwap()]
         for _ in range(n_layers)],
        tl.Concatenate(),
        tl.Dense(9),
    )

    # A tiny constant batch, streamed forever.
    inputs_batch = np.arange(8).reshape((2, 4))
    labeled_batch = (inputs_batch, inputs_batch, np.ones_like(inputs_batch))

    def _data_gen():
        while True:
            yield labeled_batch

    # Run training for 2 steps with a single eval at step 2.
    cross_entropy_loss = tl.Serial(tl.LogSoftmax(), tl.CrossEntropyLoss())
    task = training.TrainTask(_data_gen(), cross_entropy_loss,
                              optimizers.Adafactor)
    eval_task = training.EvalTask(_data_gen(), [tl.CrossEntropyLoss()])
    loop = training.Loop(model, [task], eval_tasks=[eval_task],
                         eval_at=lambda step_n: step_n == 2,
                         use_memory_efficient_trainer=True)
    self.assertEqual(0, loop.step)
    loop.run(n_steps=2)
    self.assertEqual(2, loop.step)
def test_run_reversible_same_as_default_terraformer(self):
  """Runs the reversible trainer, check results are the same as default."""
  inputs_batch = np.arange(8).reshape((2, 4)) + 1
  targets_batch = 2 * inputs_batch
  labeled_batch = (inputs_batch, targets_batch, np.ones_like(targets_batch))
  int_sig = shapes.ShapeDtype((2, 4), dtype=np.int32)
  input_sig = (int_sig, int_sig, int_sig)
  # We want to test rng propagation too, so adding some dropout layers.
  model = terraformer.ConfigurableTerraformer(
      20, d_model=8, d_ff=32, n_heads=1, dropout=0.0,
      n_encoder_layers=2, n_decoder_layers=2,
      ff_sparsity=(4, 8, 0.0, 1.0), pos_type=None,
      reversible_encoder=True)
  loss = tl.Serial(tl.LogSoftmax(), tl.CrossEntropyLoss())
  optimizer_fn = optimizers.Adafactor
  # Split [model, loss] into (standard, reversible) blocks for the
  # reversible trainer; chunk the loss to bound its memory.
  blocks, loss_layer = optimizers.trainer.extract_reversible_blocks(
      [model, loss], loss_chunk_size=4)
  blocks_serial = [(tl.Serial(std), rev) for (std, rev) in blocks]
  model_with_loss = tl.Serial(model, loss)
  rng_init = fastmath.random.get_prng(12)
  model_with_loss.init(input_sig, rng=rng_init)

  # Make 3 steps with the original trainer.
  optimizer = optimizer_fn()
  optimizer.tree_init(model_with_loss.weights)
  trainer = optimizers.Trainer(model_with_loss, optimizer)
  # Fixed step rngs so both trainers see identical randomness.
  rng_step1 = fastmath.random.get_prng(7)
  rng_step2 = fastmath.random.get_prng(8)
  rng_step3 = fastmath.random.get_prng(9)
  trainer.one_step(labeled_batch, rng_step1)
  trainer.one_step(labeled_batch, rng_step2, learning_rate=0.02)
  trainer.one_step(labeled_batch, rng_step3, learning_rate=0.03)
  # Snapshot weights after the default trainer's steps for later comparison.
  first_weights = blocks_serial[0][0].weights
  first_rev_weights = blocks[0][1][0].weights
  loss_weights = loss_layer.weights

  # Now make 3 steps with reversible trainer.
  # Re-init from the same rng so both runs start from identical weights.
  model_with_loss.init(input_sig, rng=rng_init)
  trainer = optimizers.ReversibleSerialTrainer(blocks, loss_layer,
                                               optimizer_fn)
  trainer.one_step(labeled_batch, rng_step1)
  trainer.one_step(labeled_batch, rng_step2, learning_rate=0.02)
  trainer.one_step(labeled_batch, rng_step3, learning_rate=0.03)

  # Check that weights end up the same.
  self._assert_all_equal(loss_weights, loss_layer.weights)
  self._assert_all_equal(first_rev_weights, blocks[0][1][0].weights)
  self._assert_all_equal(first_weights, blocks_serial[0][0].weights)
def test_names(self):
    """Each metric/loss layer stringifies to its expected display name."""
    cases = [
        (tl.L2Loss, 'L2Loss_in3'),
        (tl.Accuracy, 'Accuracy_in3'),
        (tl.SequenceAccuracy, 'SequenceAccuracy_in3'),
        (tl.CrossEntropyLoss, 'CrossEntropyLoss_in3'),
        (tl.CrossEntropySum, 'CrossEntropySum_in3'),
    ]
    for layer_fn, expected_name in cases:
        self.assertEqual(expected_name, str(layer_fn()))
def test_run_reversible_slots(self):
    """Tests that slots can be read and assigned in reversible trainer."""
    std_layers = [tl.Dense(4), tl.Dup()]
    rev_layers = [tl.ReversibleHalfResidual(tl.Dense(4)), tl.ReversibleSwap()]
    loss_layer = tl.Serial(tl.Concatenate(), tl.Dense(4), tl.LogSoftmax(),
                           tl.CrossEntropyLoss())
    trainer = optimizers.ReversibleSerialTrainer(
        [(std_layers, rev_layers)], loss_layer, optimizers.Adam)
    # Round-trip the optimizer slots through the property setter.
    saved_slots = trainer.slots
    trainer.slots = saved_slots
    self.assertEqual(saved_slots, trainer.slots)
def train_model(NER, train_generator, eval_generator, train_steps=1,
                output_dir='model'):
    '''
    Input:
        NER - the model you are building
        train_generator - The data generator for training examples
        eval_generator - The data generator for validation examples
        train_steps - number of training steps
        output_dir - folder to save your model
    Output:
        training_loop - a trax supervised training Loop
    '''
    # Train with cross-entropy loss and the Adam optimizer.
    train_task = training.TrainTask(
        train_generator,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(0.01),
    )
    # Evaluate with cross-entropy loss and accuracy over 10 batches.
    eval_task = training.EvalTask(
        labeled_data=eval_generator,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=10,
    )
    training_loop = training.Loop(NER, train_task, eval_task=eval_task,
                                  output_dir=output_dir)
    training_loop.run(n_steps=train_steps)
    return training_loop
def loss(id_to_mask=None, has_weights=False):
    """Cross-entropy loss as scalar compatible with Trax masking."""
    # Reorder (pred-obs, pred-reward, target-obs, target-reward)
    # into (pred-obs, target-obs, pred-reward, target-reward).
    pair_up = layers.Parallel([], layers.Swap())
    # Cross-entropy loss for observations, L2 loss on rewards.
    per_part_losses = layers.Parallel(
        layers.CrossEntropyLoss(id_to_mask, has_weights),
        layers.L2Loss(id_to_mask, has_weights))
    return layers.Serial(
        pair_up,
        per_part_losses,
        # Sum the two partial losses into one scalar.
        layers.Add(),
        # Zero out in this test.
        layers.Fn(lambda x: x * 0.0),
    )
def test_run_reversible_large_weights(self): """Runs the reversible trainer with a lot of weights to test memory use.""" # This test requires > 18GB RAM, only run on TPUs. It does pass on GPU # and CPU when you run it locally, but it's too big for unit-testing. ram_limited = True # Set to False to run this test locally. if fastmath.global_device_count() == 1 and ram_limited: return # Create inputs and rngs. inputs_batch = np.arange(8).reshape((2, 4)) targets_batch = inputs_batch labeled_batch = (inputs_batch, targets_batch, np.ones_like(targets_batch)) first_layer = tl.Serial(tl.Embedding(9, 16 * 1024), tl.Dup()) rng_init = fastmath.random.get_prng(12) rng_step = fastmath.random.get_prng(13) # Initialize layers. first_layer.init(labeled_batch, rng=rng_init) n_layers = 18 # 18 layers each 16K x 16K = 256M weights ~= 1GB, 18GB ram rev_layers = [] int_shape = shapes.ShapeDtype((2, 4), dtype=np.int32) shape = shapes.ShapeDtype((2, 4, 16 * 1024)) sig = (shape, shape) for _ in range(n_layers): layer = tl.ReversibleHalfResidual(tl.Dense(16 * 1024)) layer.init(sig, rng=rng_init) layer.weights = tl.on_cpu( layer.weights) # store weights in cpu memory rev_layers.append(layer) rev_layers.append(tl.ReversibleSwap()) loss_layer = tl.Serial(tl.Concatenate(), tl.Dense(9), tl.LogSoftmax(), tl.CrossEntropyLoss()) loss_layer.init((shape, shape, int_shape, int_shape)) optimizer_fn = optimizers.Adafactor # Make a step with reversible trainer. trainer = optimizers.ReversibleSerialTrainer( [(first_layer, rev_layers)], loss_layer, optimizer_fn) loss, _ = trainer.one_step(labeled_batch, rng_step) self.assertLess(float(loss.sum()), 10000.0) # Just to get the loss. # Set to true to run again, e.g., for profiling. run_twice = False if run_twice: t = time.time() loss, _ = trainer.one_step(labeled_batch, rng_step) self.assertLess(float(loss.sum()), 10000.0) # Just to get the loss. print('Took %.3f seconds to run, loss %s' % (time.time() - t, loss))
def loss():
    """Cross-entropy loss as scalar compatible with Trax masking."""
    ones = layers.Fn(lambda x: math.numpy.ones_like(x))  # pylint: disable=unnecessary-lambda
    # Reorder (pred-obs, pred-reward, target-obs, target-reward)
    # into (pred-obs, target-obs, pred-reward, target-reward).
    pair_up = layers.Parallel([], layers.Swap())
    # Duplicate target-obs / target-reward and append all-ones weights.
    add_obs_weights = layers.Parallel([], layers.Branch([], ones))
    add_reward_weights = layers.Parallel([], [], [], [],
                                         layers.Branch([], ones))
    # Cross-entropy loss for observations, L2 loss on rewards.
    per_part_losses = layers.Parallel(layers.CrossEntropyLoss(),
                                      layers.L2Loss())
    return layers.Serial(
        pair_up,
        add_obs_weights,
        add_reward_weights,
        per_part_losses,
        # Sum the two partial losses into one scalar.
        layers.Add(),
        # Zero out in this test.
        layers.Fn(lambda x: x * 0.0),
    )
def test_run_reversible_weights_trainsfer_xprof(self):
  """Runs the reversible trainer and profiles weight transfer stats."""
  run_this_test = False  # We only run this test manually.
  if not run_this_test or fastmath.global_device_count() == 1:  # TPU only
    return

  # Create inputs and rngs.
  inputs_batch = np.ones((1024, 128), dtype=np.int32)
  targets_batch = inputs_batch
  labeled_batch = (inputs_batch, targets_batch, np.ones_like(targets_batch))
  first_layer = tl.Serial(tl.Embedding(4, 1024), tl.Dup())
  rng_init = fastmath.random.get_prng(12)
  rng_step = fastmath.random.get_prng(13)

  # Initialize layers.
  first_layer.init(labeled_batch, rng=rng_init)
  n_layers = 6
  rev_layers = []
  int_shape = shapes.ShapeDtype((1024, 128), dtype=np.int32)
  shape = shapes.ShapeDtype((1024, 128, 1024))
  sig = (shape, shape)
  for _ in range(n_layers):
    layer = tl.ReversibleHalfResidual(tl.Dense(1024))
    layer.init(sig, rng=rng_init)
    layer.weights = tl.on_cpu(layer.weights)  # store weights in cpu memory
    rev_layers.append(layer)
    rev_layers.append(tl.ReversibleSwap())
  loss_layer = tl.Serial(tl.Concatenate(), tl.Dense(9), tl.LogSoftmax(),
                         tl.CrossEntropyLoss())
  loss_layer.init((shape, shape, int_shape, int_shape))
  optimizer_fn = optimizers.SGD

  # Make a step with reversible trainer.
  trainer = optimizers.ReversibleSerialTrainer(
      [(first_layer, rev_layers)], loss_layer, optimizer_fn)
  # First step compiles/warms up; the profiled measurement is the second step.
  loss, _ = trainer.one_step(labeled_batch, rng_step)
  self.assertLess(float(loss.sum()), 10000.0)  # Just to get the loss.
  # We profile here.
  t = time.time()
  loss, _ = trainer.one_step(labeled_batch, rng_step)
  self.assertLess(float(loss.sum()), 10000.0)  # Just to get the loss.
  print('Took %.3f seconds to run, loss %s' % (time.time() - t, loss))
def test_call_and_grad(self):
    """Favor attention in a tiny model: scalar loss and gradients flow."""
    model = tl.Serial(
        tl.Branch(tl.Embedding(3, 4), tl.PaddingMask()),
        sparsity.Favor(d_feature=4, n_heads=2),
        tl.Select([0], n_in=2),
        tl.LogSoftmax(),
        tl.CrossEntropyLoss(),
    )
    x = np.ones((1, 2), dtype=np.int32)
    w = np.ones_like(x).astype(np.float32)
    model.init((shapes.signature(x), shapes.signature(x),
                shapes.signature(w)))

    loss = model((x, x, w))
    self.assertEqual(loss.shape, ())

    state = model.state
    rng = fastmath.random.get_prng(0)

    def fwd(weights, inp):
        return model.pure_fn(inp, weights, state, rng=rng)[0]

    g = fastmath.grad(fwd)(model.weights, (x, x, w))
    # Gradient of the embedding table keeps its (vocab, d_feature) shape.
    self.assertEqual(g[0][1][0].shape, (3, 4))
def test_names(self):
    """Each metric/loss layer stringifies to its expected display name."""
    cases = [
        (tl.L2Loss, 'L2Loss_in3'),
        (tl.BinaryClassifier, 'BinaryClassifier'),
        (tl.MulticlassClassifier, 'MulticlassClassifier'),
        (tl.Accuracy, 'Accuracy_in3'),
        (tl.SequenceAccuracy, 'SequenceAccuracy_in3'),
        (tl.BinaryCrossEntropyLoss, 'BinaryCrossEntropyLoss_in3'),
        (tl.CrossEntropyLoss, 'CrossEntropyLoss_in3'),
        (tl.BinaryCrossEntropySum, 'BinaryCrossEntropySum_in3'),
        (tl.CrossEntropySum, 'CrossEntropySum_in3'),
    ]
    for layer_fn, expected_name in cases:
        self.assertEqual(expected_name, str(layer_fn()))
def test_run_sharded_terraformer(self):
  """Runs Terraformer with sharded weights (only on 2+-device systems)."""
  if fastmath.local_device_count() == 1:
    return
  # NOTE: mutates a module-level global; restored at the end of the test.
  base.N_WEIGHTS_SHARDS = fastmath.local_device_count()
  inputs_batch = np.arange(8).reshape((2, 4)) + 1
  targets_batch = 2 * inputs_batch
  labeled_batch = (inputs_batch, targets_batch, np.ones_like(targets_batch))
  int_sig = shapes.ShapeDtype((2, 4), dtype=np.int32)
  input_sig = (int_sig, int_sig, int_sig)
  # We want to test rng propagation too, so adding some dropout layers.
  model = terraformer.ConfigurableTerraformer(
      20, d_model=8, d_ff=32, n_heads=1, dropout=0.0,
      n_encoder_layers=2, n_decoder_layers=2,
      ff_sparsity=(4, 8, 0.0, 1.0),
      encoder_attention_type=tl.Attention,
      encoder_decoder_attention_type=tl.CausalAttention,
      pos_type=None, reversible_encoder=True)
  loss = tl.Serial(tl.LogSoftmax(), tl.CrossEntropyLoss())
  model_with_loss = tl.Serial(model, loss)
  rng_init = fastmath.random.get_prng(12)
  model_with_loss.init(input_sig, rng=rng_init)

  # Make a step with the trainer.
  optimizer = optimizers.Adafactor(0.01)
  # Initialize optimizer state from one shard of the sharded weights.
  split_w = fastmath.nested_map(
      lambda x: x[0],
      tl.shard(model_with_loss.weights, base.N_WEIGHTS_SHARDS))
  optimizer.tree_init(split_w)
  trainer = optimizers.Trainer(model_with_loss, optimizer)
  rng_step1 = fastmath.random.get_prng(7)
  trainer.one_step(labeled_batch, rng_step1)
  # Reset shards back to default.
  base.N_WEIGHTS_SHARDS = 1
def test_reset_twice(self, backend):
    """A Trainer can be reset to a new output dir and evaluated again."""
    with fastmath.use_backend(backend):
        n_classes = 4
        model_fn = functools.partial(
            models.MLP, layer_widths=(16, 16, n_classes))
        trainer = trainer_lib.Trainer(
            model=model_fn,
            loss_fn=tl.Serial(tl.LogSoftmax(), tl.CrossEntropyLoss()),
            optimizer=trax_opt.SM3,
            lr_schedule=lr.multifactor(),
            inputs=_test_inputs(n_classes),
        )
        # Reset into two fresh directories in turn; each must evaluate cleanly.
        for dir_name in ('output_dir1', 'output_dir2'):
            trainer.reset(self.create_tempdir(name=dir_name).full_path)
            trainer.evaluate(1)
def test_reset_twice(self, backend_name):
    """A Trainer can be reset to a new output dir and evaluated again."""
    if xla_bridge.device_count() > 1 and backend_name == 'tf':
        self.skipTest("tf-numpy backend doesn't support multi-devices yet.")
    with math.use_backend(backend_name), self.tmp_dir() as output_dir1, \
         self.tmp_dir() as output_dir2:
        n_classes = 4
        model_fn = functools.partial(
            models.MLP, d_hidden=16, n_output_classes=n_classes)
        trainer = trainer_lib.Trainer(
            model=model_fn,
            loss_fn=layers.CrossEntropyLoss(),
            optimizer=trax_opt.SM3,
            lr_schedule=lr.MultifactorSchedule,
            inputs=test_inputs(n_classes),
        )
        # Reset into each directory in turn; each must evaluate cleanly.
        for out_dir in (output_dir1, output_dir2):
            trainer.reset(out_dir)
            trainer.evaluate(1)
def test_run_reversible_large_weights(self): """Runs the reversible trainer with a lot of weights to test memory use.""" # This test requires > 20GB RAM, only run on TPUs. It does pass on GPU # and CPU when you run it locally, but it's too big for unit-testing. ram_limited = True # Set to False to run this test locally. if fastmath.device_count() == 1 and ram_limited: return # Create inputs and rngs. inputs_batch = np.arange(8).reshape((2, 4)) targets_batch = inputs_batch labeled_batch = (inputs_batch, targets_batch, np.ones_like(targets_batch)) first_layer = tl.Serial(tl.Embedding(9, 16*1024), tl.Dup()) rng_init = fastmath.random.get_prng(12) rng_step = fastmath.random.get_prng(13) # Initialize layers. first_layer.init(labeled_batch, rng=rng_init) n_layers = 20 # 20 layers each 16K x 16K = 256M weights ~= 1GB, 20GB ram rev_layers = [] int_shape = shapes.ShapeDtype((2, 4), dtype=np.int32) shape = shapes.ShapeDtype((2, 4, 16*1024)) sig = (shape, shape) for _ in range(n_layers): layer = tl.ReversibleHalfResidual(tl.Dense(16*1024)) layer.init(sig, rng=rng_init) layer.weights = tl.on_cpu(layer.weights) # store weights in cpu memory rev_layers.append(layer) rev_layers.append(tl.ReversibleSwap()) loss_layer = tl.Serial(tl.Concatenate(), tl.Dense(9), tl.LogSoftmax(), tl.CrossEntropyLoss()) loss_layer.init((shape, shape, int_shape, int_shape)) optimizer_fn = optimizers.Adafactor # Make a step with reversible trainer. trainer = optimizers.ReversibleSerialTrainer( first_layer, rev_layers, loss_layer, optimizer_fn) trainer.one_step(labeled_batch, rng_step)