def _setup(self, config):
    """Builds the full training state for one Tune trial.

    Injects tuned hyperparameters into ``config``, constructs the
    experiment logger, model, dataloaders, optimizers and evaluator,
    and initialises the bookkeeping counters used by ``_train``.
    """
    # Resolve sampled hyperparameter placeholders; source and destination
    # are the same dict here.
    inject_tuned_hyperparameters(config, config)
    # Switch to this file's directory so relative paths (e.g. the
    # 'checkpoints' dir used later) resolve — presumably because Tune
    # workers start elsewhere; TODO confirm.
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    print('Trainable got the following config after injection', config)
    self.config = config
    self.device = self.config['device']
    self.exp, self.model, self.train_dataloader, self.eval_dataloader = setup_training(
        self.config)
    # Per-trial unique experiment name: user-chosen prefix + trial id.
    self.exp.set_name(config['experiment_name'] + self._experiment_id)
    self.exp_name = config['experiment_name'] + self._experiment_id
    # NOTE(review): a notification titled '... ended' is sent here at
    # setup time — looks like it is queued for trial end; confirm intent.
    self.exp.send_notification(title='Experiment ' +
                               str(self._experiment_id) + ' ended')
    self.train_data_iter = iter(self.train_dataloader)
    self.model = self.model.to(self.device)
    self.model.train()
    # Trainable-parameter count, logged as metadata only.
    n_params = sum(p.numel() for p in self.model.parameters()
                   if p.requires_grad)
    log_dict = flatten_dict(config)
    log_dict.update({'trainable_params': n_params})
    self.exp.log_parameters(log_dict)
    self.optimizers = get_optimizers(self.model, self.config)
    self.evaluator = Evaluation(self.eval_dataloader, self.config)
    # Counters consumed by get_batch/_train.
    self.num_examples = 0
    self.batch_idx = 0
    self.epoch = 1
    # Smoothed accuracy history, used by the plateau check in _train.
    self.ewma = EWMA(beta=0.75)
    self.last_accu = -1.0
    self.max_accu = -1.0
    # Gradient-accumulation interval: optimizer steps every N batches.
    self.back_prop_every_n_batches = config['training'][
        'back_prop_every_n_batches']
    self.checkpoint_best = config['training']['checkpoint_best']
def __init__(self, params, step_size=1, b1=0.9, b2=0.999, bp1=0, decay=0, power=1, biased_g1=False):
    """Initializes the optimizer.

    Args:
        params: parameter array; also the template for the running
            moment trackers created via ``EWMA.like``.
        step_size: base step size.
        b1: EWMA decay rate for the ``g1`` tracker.
        b2: EWMA decay rate for the ``g2`` tracker.
        bp1: EWMA decay rate for the ``p1`` tracker.
        decay: decay coefficient (stored; semantics defined by the
            update step, not visible here).
        power: decay exponent (stored; see above).
        biased_g1: when True, disables bias correction on ``g1``.
    """
    self.params = params
    self.step_size = step_size
    self.decay = decay
    self.power = power
    # Step counter starts at 1.
    self.i = 1
    self.xy = np.zeros(2, dtype=np.int32)
    # Running averages shaped like the parameters.
    self.g1 = EWMA.like(params, b1, correct_bias=not biased_g1)
    self.g2 = EWMA.like(params, b2)
    self.p1 = EWMA.like(params, bp1)
def test_like(self):
    """An EWMA built via ``like`` mirrors the template's shape and dtype."""
    template = np.float32(np.eye(4))
    tracker = EWMA.like(template)
    result = tracker.update(np.eye(4))
    self.assertEqual(result.shape, template.shape)
    self.assertEqual(result.dtype, template.dtype)
def test_get_est(self):
    """``get_est`` folds a hypothetical next value into the estimate."""
    tracker = EWMA(beta=0.5)
    tracker.update(0)
    estimate = tracker.get_est(1)
    self.assertEqual(estimate, 2 / 3)
def test_returns_array_nan(self):
    """An array-shaped EWMA with no updates reports all-NaN."""
    estimate = EWMA((4, 4)).get()
    self.assertTrue(np.isnan(estimate).all())
def test_returns_scalar_nan(self):
    """A scalar EWMA with no updates reports NaN (NaN != NaN)."""
    estimate = EWMA().get()
    self.assertNotEqual(estimate, estimate)
def test_raises_on_wrong_type(self):
    """A non-numeric update must raise TypeError."""
    tracker = EWMA((4, 4))
    with self.assertRaises(TypeError):
        tracker.update('abc')
def test_raises_on_wrong_shape(self):
    """An update whose shape mismatches the tracker must raise ValueError."""
    tracker = EWMA((4, 4))
    with self.assertRaises(ValueError):
        tracker.update(np.eye(3))
def test_returns_float(self):
    """A scalar EWMA hands back a plain Python float."""
    tracker = EWMA()
    tracker.update(1)
    self.assertEqual(type(tracker.get()), float)
def test_array_once(self):
    """After exactly one array update, the average equals that observation."""
    tracker = EWMA((4, 4))
    tracker.update(np.eye(4))
    self.assertTrue((tracker.get() == np.eye(4)).all())
def test_scalar_twice(self):
    """Updating with 0 then 1 at beta=0.5 should yield 2/3."""
    tracker = EWMA(beta=0.5)
    tracker.update(0)
    tracker.update(1)
    self.assertEqual(tracker.get(), 2 / 3)
def test_scalar_once(self):
    """After a single scalar update, the average equals that value."""
    tracker = EWMA()
    tracker.update(1)
    self.assertEqual(tracker.get(), 1)
class TuneTrainable(Trainable):
    """Ray Tune Trainable wrapping the project's training/evaluation loop.

    Each ``_train`` call runs batches until the next evaluation point and
    reports the best discriminating metric seen so far, plus a plateau
    flag Tune can use as a stopping signal.
    """

    def _setup(self, config):
        """Builds the full training state for one Tune trial."""
        # Resolve sampled hyperparameter placeholders; source and
        # destination are the same dict here.
        inject_tuned_hyperparameters(config, config)
        # Switch to this file's directory so relative paths (e.g. the
        # 'checkpoints' dir used by save_checkpoint) resolve — presumably
        # because Tune workers start elsewhere; TODO confirm.
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        print('Trainable got the following config after injection', config)
        self.config = config
        self.device = self.config['device']
        self.exp, self.model, self.train_dataloader, self.eval_dataloader = setup_training(
            self.config)
        # Per-trial unique experiment name: user-chosen prefix + trial id.
        self.exp.set_name(config['experiment_name'] + self._experiment_id)
        self.exp_name = config['experiment_name'] + self._experiment_id
        # NOTE(review): notification titled '... ended' is sent at setup
        # time — looks like it is queued for trial end; confirm intent.
        self.exp.send_notification(title='Experiment ' +
                                   str(self._experiment_id) + ' ended')
        self.train_data_iter = iter(self.train_dataloader)
        self.model = self.model.to(self.device)
        self.model.train()
        # Trainable-parameter count, logged as metadata only.
        n_params = sum(p.numel() for p in self.model.parameters()
                       if p.requires_grad)
        log_dict = flatten_dict(config)
        log_dict.update({'trainable_params': n_params})
        self.exp.log_parameters(log_dict)
        self.optimizers = get_optimizers(self.model, self.config)
        self.evaluator = Evaluation(self.eval_dataloader, self.config)
        # Counters consumed by get_batch/_train.
        self.num_examples = 0
        self.batch_idx = 0
        self.epoch = 1
        # Smoothed accuracy history, used by the plateau check in _train.
        self.ewma = EWMA(beta=0.75)
        self.last_accu = -1.0
        self.max_accu = -1.0
        # Gradient-accumulation interval: optimizer steps every N batches.
        self.back_prop_every_n_batches = config['training'][
            'back_prop_every_n_batches']
        self.checkpoint_best = config['training']['checkpoint_best']

    def get_batch(self):
        """Returns the next training batch, restarting the loader at epoch end.

        On StopIteration the iterator is rebuilt, ``batch_idx`` resets to 0
        and ``epoch`` is incremented.
        """
        try:
            batch = next(self.train_data_iter)
            return batch
        except StopIteration:
            self.train_data_iter = iter(self.train_dataloader)
            batch = next(self.train_data_iter)
            self.batch_idx = 0
            self.epoch += 1
            return batch

    def _train(self):
        """Trains until the next evaluation, then returns Tune metrics.

        Returns a dict with the best discriminating-metric value so far,
        the running example count, and ``no_change_in_accu`` (1 when the
        metric has plateaued against both its EWMA and its previous value).
        """
        # Accumulators for the current logging window.
        total_log_step_loss = 0
        total_log_step_train_accu = 0
        total_log_step_n = 0
        # Clear any stale gradients before accumulation starts.
        [opt.zero_grad() for opt in self.optimizers]
        while True:
            batch = self.get_batch()
            self.batch_idx += 1
            self.num_examples += len(batch[0])
            batch = (batch[0].to(self.device), batch[1].to(self.device))
            # Optimizer step only every back_prop_every_n_batches batches
            # (gradient accumulation); other calls just add gradients.
            loss, train_accu = training_step(
                batch,
                self.model,
                self.optimizers,
                step=(self.batch_idx % self.back_prop_every_n_batches == 0))
            total_log_step_loss += loss.cpu().detach().numpy()
            total_log_step_train_accu += train_accu
            total_log_step_n += 1
            # --- periodic console + experiment logging ---
            if self.batch_idx % self.config['training'][
                    'log_every_n_batches'] == 0:
                avg_loss = total_log_step_loss / total_log_step_n
                avg_accu = total_log_step_train_accu / total_log_step_n
                total_log_step_n = 0
                print(f'{Fore.YELLOW}Total number of seen examples:',
                      self.num_examples,
                      'Average loss of current log step:', avg_loss,
                      'Average train accuracy of current log step:',
                      avg_accu, f"{Style.RESET_ALL}")
                self.exp.log_metric('train_loss', avg_loss,
                                    step=self.num_examples,
                                    epoch=self.epoch)
                self.exp.log_metric('train_accuracy', avg_accu,
                                    step=self.num_examples,
                                    epoch=self.epoch)
                total_log_step_loss = 0
                total_log_step_train_accu = 0
            # --- periodic evaluation; ends this _train iteration ---
            # NOTE(review): uses (batch_idx + 1) while logging above uses
            # batch_idx — off-by-one between the two schedules; confirm.
            if (self.batch_idx +
                    1) % self.config['training']['eval_every_n_batches'] == 0:
                results, assets, image_fns = self.evaluator.eval_model(
                    self.model)
                print(self.config['tune']['discriminating_metric'],
                      results[self.config['tune']['discriminating_metric']])
                self.exp.log_metrics(results, step=self.num_examples,
                                     epoch=self.epoch)
                [
                    self.exp.log_asset_data(asset, step=self.num_examples)
                    for asset in assets
                ]
                [
                    self.exp.log_image(fn, step=self.num_examples)
                    for fn in image_fns
                ]
                # Plateau detection: distance from the smoothed history
                # and from the immediately previous evaluation.
                accu_diff_avg = abs(
                    results[self.config['tune']['discriminating_metric']] -
                    self.ewma.get())
                accu_diff_cons = abs(
                    results[self.config['tune']['discriminating_metric']] -
                    self.last_accu)
                # Thresholds 0.0005/0.002 and the 70000-example warm-up
                # are hard-coded tuning choices.
                no_change_in_accu = 1 if accu_diff_avg < 0.0005 and accu_diff_cons < 0.002 and self.num_examples > 70000 else 0
                self.ewma.update(
                    results[self.config['tune']['discriminating_metric']])
                self.last_accu = results[self.config['tune']
                                         ['discriminating_metric']]
                if self.max_accu < results[self.config['tune']
                                           ['discriminating_metric']]:
                    self.max_accu = results[self.config['tune']
                                            ['discriminating_metric']]
                    # Optionally checkpoint whenever a new best is reached.
                    if self.checkpoint_best:
                        self.save_checkpoint('checkpoints',
                                             self.exp_name + '.pt')
                        print(
                            f'{Fore.GREEN}New best model saved.{Style.RESET_ALL}'
                        )
                self.exp.log_metric('max_accuracy', self.max_accu,
                                    step=self.num_examples,
                                    epoch=self.epoch)
                # Report the best-so-far metric (not the current one).
                training_results = {
                    self.config['tune']['discriminating_metric']:
                    self.max_accu,
                    'num_examples': self.num_examples,
                    'no_change_in_accu': no_change_in_accu
                }
                return training_results

    def _save(self, checkpoint_dir):
        """Tune checkpoint hook; delegates to save_checkpoint."""
        return self.save_checkpoint(checkpoint_dir, 'checkpoint_file.pt')

    def save_checkpoint(self, checkpoint_dir, fname='checkpoint_file.pt'):
        """Saves model and all optimizer states; returns the checkpoint path."""
        print(f'{Fore.CYAN}Saving model ...{Style.RESET_ALL}')
        save_dict = {'model_state_dict': self.model.state_dict()}
        # One state-dict entry per optimizer, keyed op_<i>_state_dict.
        for i, optimizer in enumerate(self.optimizers):
            save_dict['op_' + str(i) +
                      '_state_dict'] = optimizer.state_dict()
        torch.save(save_dict, os.path.join(checkpoint_dir, fname))
        return os.path.join(checkpoint_dir, fname)

    def _restore(self, checkpoint_path):
        """Tune restore hook; reloads model and optimizer states."""
        checkpoint = torch.load(checkpoint_path)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        for i, optimizer in enumerate(self.optimizers):
            optimizer.load_state_dict(checkpoint['op_' + str(i) +
                                                 '_state_dict'])

    def stop(self):
        """Runs a final full evaluation and flushes its results before stopping."""
        results, assets, image_fns = self.evaluator.eval_model(
            self.model, finished_training=True)
        self.exp.log_metrics(results, step=self.num_examples,
                             epoch=self.epoch)
        [
            self.exp.log_asset_data(asset, step=self.num_examples)
            for asset in assets
        ]
        [self.exp.log_image(fn, step=self.num_examples) for fn in image_fns]
        return super().stop()