Example #1
    def test_dqn_1(self):
        def on_step_end(agent, reward, observation, done, action):
            """Callback to print stuff to console"""
            if agent.total_step % 1000 == 0:
                print('test_dqn_1', agent.total_step)
            if done:
                print('episode terminated at', agent.total_step)

        env = gym.make('MountainCar-v0').env  # .env to remove 200 step limit
        env.seed(self.seed)

        #
        #   Keras DQN model
        #
        q_model = tf.keras.models.Sequential()
        q_model.add(
            tf.keras.layers.Dense(units=256, activation='relu', input_dim=2))
        q_model.add(tf.keras.layers.Dense(units=256, activation='relu'))
        q_model.add(tf.keras.layers.Dense(units=3, activation='linear'))
        q_model.compile(loss='mse',
                        optimizer=tf.keras.optimizers.RMSprop(lr=0.00025))

        # Configure agent
        agent = rl.AgentDQN(state_space=env.observation_space,
                            action_space=env.action_space,
                            discount=0.99,
                            start_learning_at=22000,
                            memory=rl.MemoryDQN(max_len=10000,
                                                batch_size=64,
                                                enable_pmr=False,
                                                initial_pmr_error=1000.0),
                            q_fun_approx=rl.QFunctKeras(model=q_model),
                            policy=rl.PolicyEpsGreedy(expl_start=False,
                                                      nb_rand_steps=22000,
                                                      e_rand_start=1.0,
                                                      e_rand_target=0.1,
                                                      e_rand_decay=1 / 10000))

        agent.register_callback('on_step_end', on_step_end)

        # Main train loop
        rl.train_agent(env=env, agent=agent, total_steps=23000)

        # This is used to test for any numerical discrepancy between runs
        fp, ws, st, act, rew, done = agent.get_fingerprint()
        print('FINGERPRINT:', fp)
        print('  weight sum:', ws)
        print('  st, act, rew, done:', st, act, rew, done)

        self.assertEqual(fp, -8093.633372464128)
        self.assertEqual(ws, 3562.30959045887)
        self.assertEqual(st, -11822.942962922998)
        self.assertEqual(act, 23165)
        self.assertEqual(rew, -22999.0)
        self.assertEqual(done, 1)
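The fingerprint assertions above pin the run down to exact floating-point values, which only holds if seeding and execution order are fully deterministic. How `get_fingerprint` combines its components is internal to the library; below is a minimal sketch of the idea, with hypothetical names that are not the library's API:

import numpy as np

def compute_fingerprint(weights, states, actions, rewards, dones):
    """Hypothetical determinism check: collapse a whole run into a few scalars
    so two runs can be compared for numerical drift."""
    ws = float(sum(np.sum(w) for w in weights))   # sum of all approximator weights
    st = float(np.sum(states))                    # sum of every state component seen
    act = int(np.sum(actions))                    # sum of all action indices taken
    rew = float(np.sum(rewards))                  # sum of all rewards received
    done = int(np.sum(dones))                     # number of terminal transitions
    fp = ws + st + act + rew + done               # single aggregate value
    return fp, ws, st, act, rew, done

Any drift in weight initialisation, replay sampling, or environment dynamics would then show up as a mismatch in one of these sums.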
Example #2
    def test_30_run_aggregate_1(self):
        def on_step_end(agent, reward, observation, done, action):
            if agent.total_step % 1000 == 0:
                print('test_30_run_aggregate_1', agent.total_step)
            if done:
                print('episode terminated at', agent.total_step)

        env = gym.make('MountainCar-v0').env
        env.seed(self.seed)

        agent = rl.Agent(state_space=env.observation_space,
                         action_space=env.action_space,
                         discount=0.99,
                         start_learning_at=0,
                         memory=None,
                         q_fun_approx=rl.AggregateApproximator(step_size=0.3,
                                                               bins=[64, 64],
                                                               init_val=0),
                         policy=rl.QMaxPolicy(expl_start=False,
                                              nb_rand_steps=0,
                                              e_rand_start=0.1,
                                              e_rand_target=0.1,
                                              e_rand_decay=1 / 10000))

        agent.register_callback('on_step_end', on_step_end)

        rl.train_agent(env=env, agent=agent, total_steps=30000)

        # This is used to test for any numerical discrepancy between runs
        fp, ws, st, act, rew, done = agent.get_fingerprint()
        print('FINGERPRINT:', fp)
        print('  weight sum:', ws)
        print('  st, act, rew, done:', st, act, rew, done)

        self.assertEqual(fp, -24059.666698709698)
        self.assertEqual(ws, -8850.374069905585)
        self.assertEqual(st, -15178.292628804113)
        self.assertEqual(act, 29967)
        self.assertEqual(rew, -29999.0)
        self.assertEqual(done, 1)
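Here `rl.AggregateApproximator(step_size=0.3, bins=[64, 64], init_val=0)` stands in for the neural network: the 2-D MountainCar state (position, velocity) is discretised into a 64x64 grid with one Q-value per (cell, action). A self-contained sketch of that state-aggregation idea, not the library's implementation:

import numpy as np

class SimpleAggregateQ:
    """State aggregation: one Q-value per (grid cell, action) pair."""

    def __init__(self, low, high, bins, n_actions, step_size=0.3, init_val=0.0):
        self.low = np.asarray(low, dtype=float)
        self.high = np.asarray(high, dtype=float)
        self.bins = np.asarray(bins)
        self.step_size = step_size
        self.table = np.full(tuple(bins) + (n_actions,), float(init_val))

    def _cell(self, state):
        # Map a continuous state to integer grid coordinates.
        frac = (np.asarray(state) - self.low) / (self.high - self.low)
        idx = np.clip((frac * self.bins).astype(int), 0, self.bins - 1)
        return tuple(idx)

    def estimate(self, state, action):
        return self.table[self._cell(state)][action]

    def train(self, state, action, target):
        # Move the single cell's estimate toward the TD target.
        cell = self._cell(state)
        self.table[cell][action] += self.step_size * (target - self.table[cell][action])

For MountainCar this would be constructed roughly as `SimpleAggregateQ(env.observation_space.low, env.observation_space.high, bins=[64, 64], n_actions=3)`.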
Example #3
    def test_20_run_tile_1(self):
        def on_step_end(agent, reward, observation, done, action):
            if agent.total_step % 1000 == 0:
                print('test_20_run_tile_1', agent.total_step)
            if done:
                print('episode terminated at', agent.total_step)

        env = gym.make('MountainCar-v0').env
        env.seed(self.seed)

        agent = rl.Agent(state_space=env.observation_space,
                         action_space=env.action_space,
                         discount=0.99,
                         start_learning_at=0,
                         memory=None,
                         q_fun_approx=rl.TilesApproximator(step_size=0.3,
                                                           num_tillings=8,
                                                           init_val=0),
                         policy=rl.QMaxPolicy(expl_start=False,
                                              nb_rand_steps=0,
                                              e_rand_start=1.0,
                                              e_rand_target=0.1,
                                              e_rand_decay=1 / 10000))

        agent.register_callback('on_step_end', on_step_end)

        rl.train_agent(env=env, agent=agent, total_steps=5000)

        # This is used to test for any numerical discrepancy between runs
        fp, ws, st, act, rew, done = agent.get_fingerprint()
        print('FINGERPRINT:', fp)
        print('  weight sum:', ws)
        print('  st, act, rew, done:', st, act, rew, done)

        self.assertEqual(fp, -3667.665666738285)
        self.assertEqual(ws, -1297.1708778794816)
        self.assertEqual(st, -2430.494788858803)
        self.assertEqual(act, 5058)
        self.assertEqual(rew, -4999.0)
        self.assertEqual(done, 1)
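`rl.TilesApproximator(step_size=0.3, num_tillings=8, init_val=0)` uses tile coding: several coarse grids, each offset by a fraction of a tile width, so nearby states share most of their active tiles and learning generalises more smoothly than with a single grid. A rough sketch of the technique under those assumptions (hypothetical class, not the library code):

import numpy as np

class SimpleTileQ:
    """Tile coding: num_tilings offset grids over a 2-D continuous state space."""

    def __init__(self, low, high, n_actions, num_tilings=8, tiles_per_dim=8,
                 step_size=0.3):
        self.low = np.asarray(low, dtype=float)
        self.high = np.asarray(high, dtype=float)
        self.num_tilings = num_tilings
        self.tiles_per_dim = tiles_per_dim
        # Step size is conventionally divided by the number of tilings.
        self.alpha = step_size / num_tilings
        self.weights = np.zeros((num_tilings, tiles_per_dim, tiles_per_dim, n_actions))
        # Each tiling is shifted by a different fraction of one tile width.
        self.offsets = np.linspace(0.0, 1.0, num_tilings, endpoint=False)

    def _active_tiles(self, state):
        scaled = (np.asarray(state) - self.low) / (self.high - self.low) * self.tiles_per_dim
        for t in range(self.num_tilings):
            idx = np.clip(np.floor(scaled + self.offsets[t]).astype(int),
                          0, self.tiles_per_dim - 1)
            yield t, idx[0], idx[1]

    def estimate(self, state, action):
        return sum(self.weights[t, i, j, action]
                   for t, i, j in self._active_tiles(state))

    def train(self, state, action, target):
        # Spread the TD error evenly over the one active tile in each tiling.
        error = target - self.estimate(state, action)
        for t, i, j in self._active_tiles(state):
            self.weights[t, i, j, action] += self.alpha * error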
Example #4
    def main(self):

        args = rl.util.parse_common_args()
        rl.util.try_freeze_random_seeds(args.seed, args.reproducible)

        #
        #   Environment
        #
        # Environment outputs a 3-tuple: cos(ang), sin(ang), angular velocity.
        # We translate that to a 2-tuple: angle [-pi, pi], ang-vel [-8.0, 8.0]
        # so we can plot the 2d state space nicely.
        #
        # Environment expects a continuous 1-tuple action representing torque
        # in range [-2.0, 2.0], but our agent outputs a categorical action 0-4,
        # so we need to translate that to torque. This is because continuous
        # actions are not implemented yet.
        def obs_trans(obs):
            """Translate from 3d obs space to 2d (for easier plotting)"""
            theta = np.arctan2(obs[1], obs[0])
            vel = obs[2]
            return np.array([theta, vel])

        def act_trans(act):
            """Translate from categorical actions to continous"""
            torques = [-2.0, -0.5, 0.0, 0.5, 2.0]
            return np.array([torques[act]])

        self.env = rl.util.EnvTranslator(env=gym.make('Pendulum-v0'),
                                         observation_space=gym.spaces.Box(
                                             low=np.array([-np.pi, -8.0]),
                                             high=np.array([np.pi, 8.0])),
                                         observation_translator=obs_trans,
                                         action_space=gym.spaces.Discrete(5),
                                         action_translator=act_trans,
                                         reward_translator=None)

        self.env.seed(args.seed)

        #
        #   Agent
        #
        agent = rl.AgentQ(state_space=self.env.observation_space,
                          action_space=self.env.action_space,
                          discount=0.99,
                          q_fun_approx=rl.QFunctTiles(step_size=0.3,
                                                      num_tillings=16,
                                                      init_val=0),
                          policy=rl.PolicyEpsGreedy(expl_start=False,
                                                    nb_rand_steps=0,
                                                    e_rand_start=0.0,
                                                    e_rand_target=0.0,
                                                    e_rand_decay=1 / 10000))

        #
        #   Plotting
        #
        # Need to re-think how plotting works
        if args.plot:
            fig1 = plt.figure()
            self.plotter = rl.util.Plotter(
                realtime_plotting=True,
                plot_every=1000,
                disp_len=1000,
                nb_actions=self.env.action_space.n,
                figures=(fig1, ),
                ax_qmax_wf=fig1.add_subplot(2, 4, 1, projection='3d'),
                ax_qmax_im=fig1.add_subplot(2, 4, 2),
                ax_policy=fig1.add_subplot(2, 4, 3),
                ax_trajectory=fig1.add_subplot(2, 4, 4),
                ax_stats=None,
                ax_memory=None,
                ax_q_series=None,
                ax_reward=fig1.add_subplot(2, 1, 2),
            )
            self.plotter.set_state_action_spaces(
                self.env.observation_space.low,
                self.env.observation_space.high,
                h_line=0.0,
                v_line=0.0)

        #
        #   Logging
        #
        if args.logfile is not None or args.plot:
            self.logger = rl.util.Logger()

            self.logger.agent = rl.util.Log('Agent')
            self.logger.q_val = rl.util.Log('Q_Val')
            self.logger.env = rl.util.Log('Environment')
            self.logger.hist = rl.util.Log('History', 'All states visited')
            self.logger.memory = rl.util.Log('Memory', 'Full memory dump')
            self.logger.approx = rl.util.Log('Approx', 'Approximator')
            self.logger.epsumm = rl.util.Log('Episodes')

            agent.log_episodes = self.logger.epsumm
            agent.log_hist = self.logger.hist
            agent.Q.install_logger(self.logger.q_val,
                                   log_every=1000,
                                   samples=(64, 64))

        #
        #   Callback
        #
        agent.register_callback('on_step_end', self.on_step_end)

        #
        #   Runner
        #
        try:
            rl.train_agent(env=self.env,
                           agent=agent,
                           total_steps=1000000,
                           target_avg_reward=-200)
        finally:
            if args.logfile is not None:
                self.logger.save(args.logfile)
                print('Log saved')

        if self.plotter is not None:
            plt.show()
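`rl.util.EnvTranslator` wraps a stock gym environment and remaps observations and actions on the fly. The same Pendulum remapping could also be written as a plain `gym.Wrapper`; a minimal sketch assuming the classic 4-tuple gym step API (an illustration of the pattern, not the library's `EnvTranslator`):

import gym
import numpy as np

class PendulumTranslator(gym.Wrapper):
    """Expose Pendulum-v0 as (angle, angular velocity) with 5 discrete torques."""

    TORQUES = [-2.0, -0.5, 0.0, 0.5, 2.0]

    def __init__(self, env):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(low=np.array([-np.pi, -8.0]),
                                                high=np.array([np.pi, 8.0]))
        self.action_space = gym.spaces.Discrete(len(self.TORQUES))

    def _translate(self, obs):
        # (cos, sin, vel) -> (angle in [-pi, pi], vel)
        return np.array([np.arctan2(obs[1], obs[0]), obs[2]])

    def reset(self, **kwargs):
        return self._translate(self.env.reset(**kwargs))

    def step(self, action):
        obs, reward, done, info = self.env.step(np.array([self.TORQUES[action]]))
        return self._translate(obs), reward, done, info

With such a wrapper the agent only ever sees a 2-D Box observation and a Discrete(5) action space, which is exactly what `rl.AgentQ` is configured for above.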
Example #5
    def main(self):

        args = rl.util.parse_common_args()
        rl.util.try_freeze_random_seeds(args.seed, args.reproducible)

        #
        #   Environment
        #
        # .env at the end removes time limit, see:
        # https://stackoverflow.com/questions/42787924/
        # why-is-episode-done-after-200-time-steps-gym-environment-mountaincar
        self.env = gym.make('MountainCar-v0').env

        self.env.seed(args.seed)

        test_dqn = False
        if test_dqn:

            #
            #   Model
            #
            q_model = tf.keras.models.Sequential()
            q_model.add(tf.keras.layers.Dense(256, 'relu', input_dim=2))
            q_model.add(tf.keras.layers.Dense(256, 'relu'))
            q_model.add(tf.keras.layers.Dense(3, 'linear'))
            q_model.compile(loss='mse',
                            optimizer=tf.keras.optimizers.RMSprop(lr=0.00025))

            #
            #   Agent - DQN with memory
            #
            agent = rl.AgentDQN(state_space=self.env.observation_space,
                                action_space=self.env.action_space,
                                discount=0.99,
                                start_learning_at=100000,
                                memory=rl.MemoryDQN(max_len=100000,
                                                    batch_size=1024,
                                                    enable_pmr=False,
                                                    initial_pmr_error=1000.0),
                                q_fun_approx=rl.QFunctKeras(q_model),
                                policy=rl.PolicyEpsGreedy(expl_start=False,
                                                          nb_rand_steps=100000,
                                                          e_rand_start=1.0,
                                                          e_rand_target=0.1,
                                                          e_rand_decay=1 /
                                                          10000))

        else:

            #
            #   Agent - tiles or aggregate
            #
            agent = rl.AgentQ(
                state_space=self.env.observation_space,
                action_space=self.env.action_space,
                discount=0.99,
                q_fun_approx=rl.QFunctTiles(step_size=0.3,
                                            num_tillings=8,
                                            init_val=0),
                # q_fun_approx=rl.AggregateApproximator(
                #     step_size=0.3,
                #     bins=[64, 64],
                #     init_val=0),
                policy=rl.PolicyEpsGreedy(expl_start=False,
                                          nb_rand_steps=0,
                                          e_rand_start=1.0,
                                          e_rand_target=0.1,
                                          e_rand_decay=1 / 10000))

        #
        #   Plotting
        #
        # Need to re-think how plotting works
        if args.plot:
            fig1 = plt.figure()
            #fig2 = plt.figure()
            self.plotter = rl.util.Plotter(
                realtime_plotting=True,
                plot_every=1000,
                disp_len=1000,
                nb_actions=self.env.action_space.n,
                figures=(fig1, ),
                ax_qmax_wf=fig1.add_subplot(2, 4, 1, projection='3d'),
                ax_qmax_im=fig1.add_subplot(2, 4, 2),
                ax_policy=fig1.add_subplot(2, 4, 3),
                ax_trajectory=fig1.add_subplot(2, 4, 4),
                ax_stats=None,
                ax_memory=None,  #fig2.add_subplot(1,1,1),
                ax_q_series=None,
                ax_reward=fig1.add_subplot(2, 1, 2),
            )
            self.plotter.set_state_action_spaces(
                self.env.observation_space.low,
                self.env.observation_space.high,
                h_line=0.0,
                v_line=-0.5)

        #
        #   Logging
        #
        if args.logfile is not None or args.plot:
            self.logger = rl.util.Logger()

            self.logger.agent = rl.util.Log('Agent')
            self.logger.q_val = rl.util.Log('Q_Val')
            self.logger.env = rl.util.Log('Environment')
            self.logger.hist = rl.util.Log('History', 'All states visited')
            self.logger.memory = rl.util.Log('Memory', 'Full memory dump')
            self.logger.approx = rl.util.Log('Approx', 'Approximator')
            self.logger.epsumm = rl.util.Log('Episodes')

            agent.log_episodes = self.logger.epsumm
            agent.log_hist = self.logger.hist
            if isinstance(agent, rl.AgentDQN):
                agent.memory.install_logger(self.logger.memory, log_every=1000)
            agent.Q.install_logger(self.logger.q_val,
                                   log_every=1000,
                                   samples=(64, 64))

            agent.register_callback('on_step_end', self.on_step_end)

        #
        #   Runner
        #
        try:
            rl.train_agent(env=self.env,
                           agent=agent,
                           total_steps=1000000,
                           target_avg_reward=-200)
        finally:
            if args.logfile is not None:
                self.logger.save(args.logfile)
                print('Log saved')

        if self.plotter is not None:
            plt.show()
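All of the examples configure `rl.PolicyEpsGreedy` (or `rl.QMaxPolicy`) with `e_rand_start`, `e_rand_target` and `e_rand_decay`, which suggests a linearly decaying epsilon-greedy schedule after an initial fully random warm-up of `nb_rand_steps`. A small sketch of that assumed behaviour, not the library's code:

import numpy as np

def epsilon_at(step, nb_rand_steps=0, e_rand_start=1.0, e_rand_target=0.1,
               e_rand_decay=1 / 10000):
    """Assumed schedule: fully random for nb_rand_steps, then linear decay
    from e_rand_start toward e_rand_target at e_rand_decay per step."""
    if step < nb_rand_steps:
        return 1.0
    eps = e_rand_start - e_rand_decay * (step - nb_rand_steps)
    return max(eps, e_rand_target)

def eps_greedy_action(q_values, epsilon, rng=np.random):
    # Explore with probability epsilon, otherwise pick the greedy action.
    if rng.rand() < epsilon:
        return rng.randint(len(q_values))
    return int(np.argmax(q_values))

Under this reading, `nb_rand_steps=22000` with `start_learning_at=22000` in Example #1 means the DQN replay buffer is filled with purely random experience before any gradient updates or greedy action selection begin.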