def test_trainer_maxq(self):
    """Train a max-Q DiscreteActionTrainer on constant-reward MDPs.

    Builds an Env with seeded randomness, trains for ``self.epochs``
    epochs, and asserts the average Q value converges into (99, 101).

    Fixes vs. original: removed dead commented-out code and switched
    logging to lazy %-style arguments so formatting is skipped when
    INFO is disabled.
    """
    env = Env(self.state_dims, self.action_dims)
    env.seed(42)  # deterministic sample generation for a stable assertion
    maxq_parameters = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(
            gamma=0.99,
            target_update_rate=1.0,
            reward_burnin=100,
            maxq_learning=True,
        ),
        training=TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=self.minibatch_size,
            learning_rate=1.0,
            optimizer="ADAM",
        ),
    )
    maxq_trainer = DiscreteActionTrainer(maxq_parameters, env.normalization)

    logger.info("Generating constant_reward MDPs..")
    (
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
    ) = env.generate_samples_discrete(self.num_samples)

    logger.info("Preprocessing constant_reward MDPs..")
    tdps = env.preprocess_samples_discrete(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        self.minibatch_size,
    )

    for epoch in range(self.epochs):
        logger.info("Training.. %s", epoch)
        for tdp in tdps:
            maxq_trainer.train_numpy(tdp, None)
        # Lazy %-args: values are only formatted if INFO logging is enabled.
        logger.info(
            "Training epoch %s average q values %s td_loss %s",
            epoch,
            np.mean(workspace.FetchBlob(maxq_trainer.q_score_output)),
            workspace.FetchBlob(maxq_trainer.loss_blob),
        )

    # Q value should converge to very close to 100
    avg_q_value_after_training = np.mean(
        workspace.FetchBlob(maxq_trainer.q_score_output)
    )
    self.assertLess(avg_q_value_after_training, 101)
    self.assertGreater(avg_q_value_after_training, 99)
def test_pure_q_learning_all_cheat(self):
    """Pure Q-learning on the cheat environment.

    Trains a linear Q-network with epsilon-greedy exploration (eps=0.1)
    for 50 outer iterations, then rolls out a greedy (eps=0) policy and
    checks that most chosen actions are the cheat action 'C'.
    """
    env = self._env
    params = DiscreteActionModelParameters(
        actions=env.ACTIONS,
        rl=self._rl_parameters_all_cheat_maxq,
        training=TrainingParameters(
            layers=[env.width * env.height, 1],
            activations=['linear'],
            minibatch_size=self.minibatch_size,
            learning_rate=0.05,
            optimizer='SGD',
            lr_policy='fixed',
        )
    )
    trainer = DiscreteActionTrainer(params, env.normalization)
    predictor = trainer.predictor()

    # Bootstrap with a fully-random policy (epsilon = 1).
    samples = _collect_samples(
        env, _build_policy(env, predictor, 1), 20000, env.reset()
    )

    for _ in range(50):
        batches = env.preprocess_samples(
            samples.states,
            samples.actions,
            samples.rewards,
            samples.next_states,
            samples.next_actions,
            samples.is_terminals,
            samples.possible_next_actions,
            None,
            self.minibatch_size,
        )
        for batch in batches:
            trainer.train_numpy(batch, None)
        # Re-collect with a mostly-greedy policy (epsilon = 0.1).
        samples = _collect_samples(
            env, _build_policy(env, predictor, 0.1), 20000, env.reset()
        )

    # Final rollout: fully greedy policy (epsilon = 0).
    samples = _collect_samples(
        env, _build_policy(env, predictor, 0), 1000, env.reset()
    )
    # 100% should be cheat. Will fix in the future.
    self.assertGreater(
        np.sum(np.array(samples.actions) == 'C'), 800
    )
def test_trainer_maxq(self):
    """Max-Q training on Gridworld with evaluator-based checks.

    Verifies that Monte-Carlo loss drops below 0.1 after 5 epochs and
    that the doubly-robust reward estimate improves over its previous
    evaluation.
    """
    gridworld = Gridworld()
    rl_config = RLParameters(
        gamma=DISCOUNT,
        target_update_rate=0.5,
        reward_burnin=10,
        maxq_learning=True,
    )
    training_config = TrainingParameters(
        layers=[-1, 1],
        activations=["linear"],
        minibatch_size=self.minibatch_size,
        learning_rate=0.01,
        optimizer="ADAM",
    )
    # construct the new trainer that using maxq
    trainer = DiscreteActionTrainer(
        DiscreteActionModelParameters(
            actions=gridworld.ACTIONS,
            rl=rl_config,
            training=training_config,
        ),
        gridworld.normalization,
    )

    samples = gridworld.generate_samples(100000, 1.0)
    predictor = trainer.predictor()
    tdps = gridworld.preprocess_samples(samples, self.minibatch_size)
    evaluator = GridworldEvaluator(gridworld, True)

    # Baseline evaluation before any training.
    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertGreater(evaluator.mc_loss[-1], 0.3)

    for _ in range(5):
        for batch in tdps:
            trainer.train_numpy(batch, None)

    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)
    self.assertGreater(
        evaluator.reward_doubly_robust[-1], evaluator.reward_doubly_robust[-2]
    )
def test_trainer_maxq(self):
    """Max-Q training on Gridworld (evaluate() returns a loss score).

    Asserts the evaluation score falls from above 0.3 to below 0.1
    after two epochs of training.

    Fixes vs. original: ``evaluator.evaluate(predictor)`` was invoked
    redundantly (twice pre-training, three times post-training); each
    call runs a full evaluation pass, so the score is now computed once
    per phase and reused.
    """
    environment = Gridworld()
    maxq_sarsa_parameters = DiscreteActionModelParameters(
        actions=environment.ACTIONS,
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=TrainingParameters(
            layers=[-1, 1],
            activations=['linear'],
            minibatch_size=self.minibatch_size,
            learning_rate=0.01,
            optimizer='ADAM',
        ))
    # construct the new trainer that using maxq
    maxq_trainer = DiscreteActionTrainer(
        maxq_sarsa_parameters,
        environment.normalization,
    )
    (
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
    ) = environment.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    evaluator = GridworldEvaluator(environment, True)

    # Evaluate once and reuse the score for both the print and the assert.
    pre_training_loss = evaluator.evaluate(predictor)
    print("Pre-Training eval", pre_training_loss)
    self.assertGreater(pre_training_loss, 0.3)

    for _ in range(2):
        for tdp in tdps:
            maxq_trainer.train_numpy(tdp, None)

    post_training_loss = evaluator.evaluate(predictor)
    print("Post-Training eval", post_training_loss)
    self.assertLess(post_training_loss, 0.1)