def train(self): """Train a model. NOTE: modifies TrainState in place. - parameters of the Editor and Optimizer are updated - train_steps is updated - random number generator states are updated at every checkpoint """ # TODO(kelvin): do something to preserve random state upon reload? train_state = self.train_state examples = self._examples config = self.config workspace = self.workspace with random_state(self.train_state.random_state): editor = train_state.model train_batches = similar_size_batches(examples.train, config.optim.batch_size) editor.test_batch(train_batches[0]) best_exact_match_score = 0.0 while True: random.shuffle(train_batches) loss = 0 for batch in verboserate(train_batches, desc='Streaming training examples'): loss, _, _ = editor.loss(batch) finite_grads, grad_norm = self._take_grad_step( train_state, loss) if not finite_grads: train_state.save(workspace.nan_checkpoints) examples_path = join( workspace.nan_checkpoints, '{}.examples'.format(train_state.train_steps)) with open(examples_path, 'w') as f: pickle.dump(batch, f) print 'Gradient was NaN/inf on step {}.'.format( train_state.train_steps) step = train_state.train_steps # run periodic evaluation and saving if step != 0: if step % 10 == 0: self._update_metadata(train_state) if step % config.timing.eval_small == 0: self.evaluate(step, big_eval=False) self.tb_logger.log_value('grad_norm', grad_norm, step) if step % config.timing.eval_big == 0: train_stats, valid_stats = self.evaluate( step, big_eval=True) # train_stats, valid_stats = self.evaluate(step, big_eval=False) exact_match_score = valid_stats[('big', 'exact_match', 'valid')] self.checkpoints.save(train_state) if step >= config.optim.max_iters: return
def train(self): """Train a model. NOTE: modifies TrainState in place. - parameters of the Editor and Optimizer are updated - train_steps is updated - random number generator states are updated at every checkpoint """ with random_state(self.train_state.random_state): self.train_vae() lsh = self.setup_ret() self.lsh = lsh
def train(self):
    config = self.config
    train_state = self.train_state
    model, optimizer = train_state.model, train_state.optimizer

    # group into training batches
    train_batches = similar_size_batches(self.examples.train,
                                         batch_size=config.optim.batch_size,
                                         size=lambda x: len(x.output_words))

    def batch_generator():
        while True:
            # WARNING: random state of train state does not exactly restore state anymore, due to this shuffle
            random.shuffle(train_batches)
            for batch in verboserate(train_batches, desc='Streaming example batches'):
                yield batch

    with random_state(train_state.random_state):
        for batch in batch_generator():
            # take gradient step
            loss = model.loss(batch, config.optim.num_negatives)
            finite_grads = self._take_grad_step(train_state, loss)  # TODO: clip gradient?
            train_steps = train_state.train_steps
            if not finite_grads:
                print 'WARNING: grads not finite at step {}'.format(train_steps)

            self._update_metadata(train_state)

            # run periodic evaluation and saving
            if train_steps % config.eval.eval_steps == 0:
                self._evaluate(self.examples, big_eval=False)
            if train_steps % config.eval.big_eval_steps == 0:
                self._evaluate(self.examples, big_eval=True)
            if train_steps % config.eval.save_steps == 0:
                self.checkpoints.save(train_state)
            if train_steps >= config.optim.max_iters:
                return
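# Both of the batched train() methods above group examples with
# `similar_size_batches(examples, batch_size, size=...)`, whose implementation is not
# part of this excerpt. The sketch below shows the usual idea behind such a helper
# (sort by length so each batch pads to a similar size); the function name here is
# hypothetical and the real helper may order or pad batches differently.
def group_into_similar_size_batches(examples, batch_size, size=len):
    """Sort examples by `size`, then cut the sorted list into consecutive batches."""
    ordered = sorted(examples, key=size)
    return [ordered[i:i + batch_size] for i in range(0, len(ordered), batch_size)]

# Example usage (hypothetical data): similarly long sentences end up in the same batch.
# sentences = [['a'], ['a', 'b', 'c'], ['a', 'b'], ['x']]
# batches = group_into_similar_size_batches(sentences, batch_size=2)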