Example #1
    def test_trainer_maxq(self):
        env = Env(self.state_dims, self.action_dims)
        env.seed(42)
        maxq_parameters = DiscreteActionModelParameters(
            actions=env.actions,
            rl=RLParameters(
                gamma=0.99,
                target_update_rate=1.0,
                reward_burnin=100,
                maxq_learning=True,
            ),
            training=TrainingParameters(
                layers=self.layers,
                activations=self.activations,
                minibatch_size=self.minibatch_size,
                learning_rate=1.0,
                optimizer="ADAM",
            ),
        )
        maxq_trainer = DiscreteActionTrainer(maxq_parameters,
                                             env.normalization)
        # predictor = maxq_trainer.predictor()

        logger.info("Generating constant_reward MDPs..")

        (states, actions, rewards, next_states, next_actions, is_terminal,
         possible_next_actions) = env.generate_samples_discrete(self.num_samples)

        logger.info("Preprocessing constant_reward MDPs..")

        tdps = env.preprocess_samples_discrete(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            self.minibatch_size,
        )

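        # Run self.epochs passes over the preprocessed minibatches; after each
        # epoch, log the mean Q score and the TD loss fetched from the Caffe2
        # workspace.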
        for epoch in range(self.epochs):
            logger.info("Training.. " + str(epoch))
            for tdp in tdps:
                maxq_trainer.train_numpy(tdp, None)
            logger.info(" ".join([
                "Training epoch",
                str(epoch),
                "average q values",
                str(np.mean(workspace.FetchBlob(maxq_trainer.q_score_output))),
                "td_loss",
                str(workspace.FetchBlob(maxq_trainer.loss_blob)),
            ]))

        # The average Q value should converge to roughly 100
        avg_q_value_after_training = np.mean(
            workspace.FetchBlob(maxq_trainer.q_score_output))

        self.assertLess(avg_q_value_after_training, 101)
        self.assertGreater(avg_q_value_after_training, 99)
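
For reference, the bounds of 99 and 101 follow from the discount factor if the environment pays a constant reward of 1.0 per step (an assumption implied by the assertions, not shown in this listing): the discounted return is then the geometric series 1 / (1 - gamma). A minimal sketch:

    # Sketch only: assumes a constant per-step reward of 1.0, which is not
    # shown in the test but is implied by the 99 < Q < 101 assertions.
    gamma = 0.99
    reward_per_step = 1.0  # assumed constant per-step reward
    expected_q = reward_per_step / (1.0 - gamma)
    print(expected_q)      # approximately 100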
Example #2
    def test_pure_q_learning_all_cheat(self):
        q_learning_parameters = DiscreteActionModelParameters(
            actions=self._env.ACTIONS,
            rl=self._rl_parameters_all_cheat_maxq,
            training=TrainingParameters(
                layers=[self._env.width * self._env.height, 1],
                activations=['linear'],
                minibatch_size=self.minibatch_size,
                learning_rate=0.05,
                optimizer='SGD',
                lr_policy='fixed',
            )
        )

        trainer = DiscreteActionTrainer(
            q_learning_parameters,
            self._env.normalization,
        )

        predictor = trainer.predictor()

        policy = _build_policy(self._env, predictor, 1)
        initial_state = self._env.reset()
        iteration_result = _collect_samples(
            self._env, policy, 20000, initial_state
        )
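        # Alternate between training on the preprocessed samples and
        # re-collecting 20000 samples with a less exploratory policy: the
        # third _build_policy argument (presumably an exploration rate)
        # drops from 1 to 0.1 during training, then to 0 for the final
        # 1000-sample evaluation rollout.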
        num_iterations = 50
        for _ in range(num_iterations):
            tdps = self._env.preprocess_samples(
                iteration_result.states,
                iteration_result.actions,
                iteration_result.rewards,
                iteration_result.next_states,
                iteration_result.next_actions,
                iteration_result.is_terminals,
                iteration_result.possible_next_actions,
                None,
                self.minibatch_size,
            )
            for tdp in tdps:
                trainer.train_numpy(tdp, None)
            initial_state = self._env.reset()
            policy = _build_policy(self._env, predictor, 0.1)
            iteration_result = _collect_samples(
                self._env, policy, 20000, initial_state
            )
        policy = _build_policy(self._env, predictor, 0)
        initial_state = self._env.reset()
        iteration_result = _collect_samples(
            self._env, policy, 1000, initial_state
        )
        # Ideally 100% of the actions would be the cheat action; will fix in the future.
        self.assertGreater(
            np.sum(np.array(iteration_result.actions) == 'C'), 800
        )
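
_build_policy and _collect_samples are helpers of the test module and are not shown in this listing. Assuming the numeric argument they receive (1, then 0.1, then 0) is an epsilon-greedy exploration probability, a minimal sketch of such a policy is:

    import numpy as np

    def epsilon_greedy_action(q_values, epsilon, rng=np.random):
        # With probability epsilon pick a uniformly random action,
        # otherwise pick the greedy (argmax-Q) action.
        if rng.rand() < epsilon:
            return rng.randint(len(q_values))
        return int(np.argmax(q_values))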
Example #3
    def test_trainer_maxq(self):
        environment = Gridworld()
        maxq_sarsa_parameters = DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=RLParameters(
                gamma=DISCOUNT,
                target_update_rate=0.5,
                reward_burnin=10,
                maxq_learning=True,
            ),
            training=TrainingParameters(
                layers=[-1, 1],
                activations=["linear"],
                minibatch_size=self.minibatch_size,
                learning_rate=0.01,
                optimizer="ADAM",
            ),
        )
        # construct the trainer that uses max-Q learning
        maxq_trainer = DiscreteActionTrainer(
            maxq_sarsa_parameters, environment.normalization
        )

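        # Generate 100000 transitions from the Gridworld; the second argument
        # is presumably an exploration rate, so the samples come from a fully
        # random policy.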
        samples = environment.generate_samples(100000, 1.0)
        predictor = maxq_trainer.predictor()
        tdps = environment.preprocess_samples(samples, self.minibatch_size)
        evaluator = GridworldEvaluator(environment, True)

        evaluator.evaluate(predictor)
        print(
            "Pre-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.reward_doubly_robust[-1],
        )
        self.assertGreater(evaluator.mc_loss[-1], 0.3)

        for _ in range(5):
            for tdp in tdps:
                maxq_trainer.train_numpy(tdp, None)

        evaluator.evaluate(predictor)
        print(
            "Post-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.reward_doubly_robust[-1],
        )
        self.assertLess(evaluator.mc_loss[-1], 0.1)

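        # The doubly robust reward estimate from the post-training evaluation
        # should exceed the pre-training one.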
        self.assertGreater(
            evaluator.reward_doubly_robust[-1], evaluator.reward_doubly_robust[-2]
        )

    def test_trainer_maxq(self):
        environment = Gridworld()
        maxq_sarsa_parameters = DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=RLParameters(gamma=DISCOUNT,
                            target_update_rate=0.5,
                            reward_burnin=10,
                            maxq_learning=True),
            training=TrainingParameters(
                layers=[-1, 1],
                activations=['linear'],
                minibatch_size=self.minibatch_size,
                learning_rate=0.01,
                optimizer='ADAM',
            ))
        # construct the trainer that uses max-Q learning
        maxq_trainer = DiscreteActionTrainer(
            maxq_sarsa_parameters,
            environment.normalization,
        )
        (
            states, actions, rewards, next_states, next_actions,
            is_terminal, possible_next_actions, reward_timelines,
        ) = environment.generate_samples(100000, 1.0)
        predictor = maxq_trainer.predictor()
        tdps = environment.preprocess_samples(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
            self.minibatch_size,
        )
        evaluator = GridworldEvaluator(environment, True)
        print("Pre-Training eval", evaluator.evaluate(predictor))
        self.assertGreater(evaluator.evaluate(predictor), 0.3)

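        # Two passes over the preprocessed minibatches, evaluating the
        # predictor after each pass.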
        for _ in range(2):
            for tdp in tdps:
                maxq_trainer.train_numpy(tdp, None)
            evaluator.evaluate(predictor)

        print("Post-Training eval", evaluator.evaluate(predictor))
        self.assertLess(evaluator.evaluate(predictor), 0.1)