Example #1
    def test_10_run_keras_1(self):
        def on_step_end(agent, reward, observation, done, action):
            """Callback to print stuff to console"""
            if agent.total_step % 1000 == 0:
                print('test_10_run_keras_1', agent.total_step)
            if done:
                print('episode terminated at', agent.total_step)

        env = gym.make('MountainCar-v0').env  # .env to remove 200 step limit
        env.seed(self.seed)

        #
        #   Keras DQN model
        #
        q_model = tf.keras.models.Sequential()
        q_model.add(
            tf.keras.layers.Dense(units=256, activation='relu', input_dim=2))
        q_model.add(tf.keras.layers.Dense(units=256, activation='relu'))
        q_model.add(tf.keras.layers.Dense(units=3, activation='linear'))
        q_model.compile(loss='mse',
                        optimizer=tf.keras.optimizers.RMSprop(lr=0.00025))

        # Configure agent
        agent = rl.Agent(state_space=env.observation_space,
                         action_space=env.action_space,
                         discount=0.99,
                         start_learning_at=22000,
                         memory=rl.Memory(max_len=10000,
                                          batch_size=64,
                                          enable_pmr=False,
                                          initial_pmr_error=1000.0),
                         q_fun_approx=rl.KerasApproximator(model=q_model),
                         policy=rl.QMaxPolicy(expl_start=False,
                                              nb_rand_steps=22000,
                                              e_rand_start=1.0,
                                              e_rand_target=0.1,
                                              e_rand_decay=1 / 10000))

        agent.register_callback('on_step_end', on_step_end)

        # Main train loop
        rl.train_agent(env=env, agent=agent, total_steps=23000)

        # This is used to test for any numerical discrepancy between runs
        fp, ws, st, act, rew, done = agent.get_fingerprint()
        print('FINGERPRINT:', fp)
        print('  weight sum:', ws)
        print('  st, act, rew, done:', st, act, rew, done)

        self.assertEqual(fp, -8093.627516248174)
        self.assertEqual(ws, 3562.3154466748238)
        self.assertEqual(st, -11822.942962922998)
        self.assertEqual(act, 23165)
        self.assertEqual(rew, -22999.0)
        self.assertEqual(done, 1)
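The asserted values suggest that the fingerprint is simply the sum of the other five quantities: 3562.3154... + (-11822.9429...) + 23165 + (-22999.0) + 1 = -8093.6275... A minimal sketch of such a reproducibility check, using hypothetical names since get_fingerprint's implementation is not shown here:

import numpy as np

def fingerprint(weights, states, actions, rewards, dones):
    """Reduce a training run to a few scalars for run-to-run comparison.

    Any numerical divergence between two runs (seeding, non-deterministic
    ops, changed update order) shows up as a changed fingerprint, which is
    what the assertEqual checks above rely on.
    """
    ws = float(sum(np.sum(w) for w in weights))   # sum of all approximator weights
    st = float(np.sum(states))                    # sum of all visited states
    act = int(np.sum(actions))                    # sum of all taken actions
    rew = float(np.sum(rewards))                  # sum of all received rewards
    done = int(np.sum(dones))                     # number of finished episodes
    return ws + st + act + rew + done, ws, st, act, rew, done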
Example #2
def with_agent():
    t = ArtificialTime()
    soil = Soil(t)
    policy = rl_agent.Policy(0.1, 0.01, [0, 10, 20])
    agent = rl_agent.Agent(soil, t, policy, 0.7, 0.8, [0, 1, 2])
    while t.month < 2:
        agent.Q_learning_iteration()
        '''
        if agent.learning_iteration % 100 == 0:
            print(soil)
            print(policy)
            print(policy.epsilon)
            input()
        '''
        t.increase_time()
    soil.visualizer('day')
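Q_learning_iteration is not defined in this snippet. For reference, the standard tabular Q-learning update it presumably performs looks like the sketch below; the learning-rate/discount arguments (0.7, 0.8 above) and the soil-specific state and reward handling of the real rl_agent.Agent may differ.

import random
from collections import defaultdict

def q_learning_step(Q, state, actions, epsilon, alpha, gamma, step_fn):
    """One epsilon-greedy Q-learning iteration over a dict-of-dicts Q table.

    step_fn(state, action) must apply the action and return
    (reward, next_state); Q maps state -> {action: value}.
    """
    if random.random() < epsilon:                         # explore
        action = random.choice(actions)
    else:                                                 # exploit
        action = max(actions, key=lambda a: Q[state][a])
    reward, next_state = step_fn(state, action)
    best_next = max(Q[next_state][a] for a in actions)
    # Temporal-difference update toward the bootstrapped target
    Q[state][action] += alpha * (reward + gamma * best_next - Q[state][action])
    return next_state

Q = defaultdict(lambda: defaultdict(float))               # Q[s][a] defaults to 0.0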
Example #3
    def test_20_run_tile_1(self):
        def on_step_end(agent, reward, observation, done, action):
            if agent.total_step % 1000 == 0:
                print('test_20_run_tile_1', agent.total_step)
            if done:
                print('episode terminated at', agent.total_step)

        env = gym.make('MountainCar-v0').env
        env.seed(self.seed)

        agent = rl.Agent(state_space=env.observation_space,
                         action_space=env.action_space,
                         discount=0.99,
                         start_learning_at=0,
                         memory=None,
                         q_fun_approx=rl.TilesApproximator(step_size=0.3,
                                                           num_tillings=8,
                                                           init_val=0),
                         policy=rl.QMaxPolicy(expl_start=False,
                                              nb_rand_steps=0,
                                              e_rand_start=1.0,
                                              e_rand_target=0.1,
                                              e_rand_decay=1 / 10000))

        agent.register_callback('on_step_end', on_step_end)

        rl.train_agent(env=env, agent=agent, total_steps=5000)

        # This is used to test for any numerical discrepancy between runs
        fp, ws, st, act, rew, done = agent.get_fingerprint()
        print('FINGERPRINT:', fp)
        print('  weight sum:', ws)
        print('  st, act, rew, done:', st, act, rew, done)

        self.assertEqual(fp, -3667.665666738285)
        self.assertEqual(ws, -1297.1708778794816)
        self.assertEqual(st, -2430.494788858803)
        self.assertEqual(act, 5058)
        self.assertEqual(rew, -4999.0)
        self.assertEqual(done, 1)
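TilesApproximator's internals are not shown here; a generic sketch of the tile-coding scheme it is based on (several offset grids laid over the 2D MountainCar state, with the step size shared across tilings) might look like this:

import numpy as np

def active_tiles(state, low, high, n_tilings=8, n_bins=8):
    """Return one (tiling, x_bin, y_bin) coordinate per tiling.

    Each tiling is the same coarse grid shifted by a fraction of a tile
    width, so nearby states share some, but not all, active tiles; that
    overlap is what gives tile coding its generalisation.
    """
    scaled = (np.asarray(state, dtype=float) - low) / (high - low)  # -> [0, 1]
    tiles = []
    for t in range(n_tilings):
        offset = t / float(n_tilings * n_bins)        # per-tiling shift
        coords = np.floor((scaled + offset) * n_bins).astype(int)
        coords = np.clip(coords, 0, n_bins)           # guard the upper edge
        tiles.append((t, coords[0], coords[1]))
    return tiles

# Q(s, a) is then the sum of the weights of the active tiles for action a,
# and a TD update of size step_size is typically split across the tilings.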
Example #4
    def test_30_run_aggregate_1(self):
        def on_step_end(agent, reward, observation, done, action):
            if agent.total_step % 1000 == 0:
                print('test_30_run_aggregate_1', agent.total_step)
            if done:
                print('episode terminated at', agent.total_step)

        env = gym.make('MountainCar-v0').env
        env.seed(self.seed)

        agent = rl.Agent(state_space=env.observation_space,
                         action_space=env.action_space,
                         discount=0.99,
                         start_learning_at=0,
                         memory=None,
                         q_fun_approx=rl.AggregateApproximator(step_size=0.3,
                                                               bins=[64, 64],
                                                               init_val=0),
                         policy=rl.QMaxPolicy(expl_start=False,
                                              nb_rand_steps=0,
                                              e_rand_start=0.1,
                                              e_rand_target=0.1,
                                              e_rand_decay=1 / 10000))

        agent.register_callback('on_step_end', on_step_end)

        rl.train_agent(env=env, agent=agent, total_steps=30000)

        # This is used to test for any numerical discrepancy between runs
        fp, ws, st, act, rew, done = agent.get_fingerprint()
        print('FINGERPRINT:', fp)
        print('  weight sum:', ws)
        print('  st, act, rew, done:', st, act, rew, done)

        self.assertEqual(fp, -24059.666698709698)
        self.assertEqual(ws, -8850.374069905585)
        self.assertEqual(st, -15178.292628804113)
        self.assertEqual(act, 29967)
        self.assertEqual(rew, -29999.0)
        self.assertEqual(done, 1)
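AggregateApproximator with bins=[64, 64] is plain state aggregation: the continuous 2D state is snapped to one cell of a 64x64 grid and each (cell, action) pair holds its own value. A minimal sketch of that idea, assuming a simple constant-step update (the library's actual rule may differ):

import numpy as np

class AggregateQ:
    """Q-values over a discretised 2D state space (state aggregation)."""

    def __init__(self, low, high, bins=(64, 64), n_actions=3, step_size=0.3):
        self.low, self.high = np.asarray(low), np.asarray(high)
        self.bins = np.asarray(bins)
        self.step_size = step_size
        self.table = np.zeros((*bins, n_actions))     # init_val=0

    def _cell(self, state):
        scaled = (np.asarray(state) - self.low) / (self.high - self.low)
        idx = np.floor(scaled * self.bins).astype(int)
        return tuple(np.clip(idx, 0, self.bins - 1))  # stay inside the grid

    def estimate(self, state, action):
        return self.table[self._cell(state)][action]

    def update(self, state, action, target):
        cell = self._cell(state)
        # Move the stored value a fraction of the way toward the TD target
        self.table[cell][action] += self.step_size * (target - self.table[cell][action])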
Example #5
def interaction():
    t = ArtificialTime()
    soil = Soil(t)
    policy = rl_agent.Policy(0.1, 0.05, [0, 20])
    agent = rl_agent.Agent(soil, t, policy, 0.1, 0.2, [0, 1, 2])
    last_raw_command = ""
    while t.month < 2:
        raw_command = input()
        if raw_command == "*":
            raw_command = last_raw_command
        command = raw_command.split()
        if not command:
            continue
        if command[0] == "state":
            m = float(command[1])
            for s in policy.state_action_values.keys():
                if s.moisture == m:
                    print(str(s) + " :")
                    for action, value in policy.state_action_values[s].items():
                        print("\tintensity " + str(action) + " :" + str(value))
        elif command[0] == "proceed":
            counter = 0
            if len(command) == 3:
                while counter < int(command[2]):
                    if command[1] == "verbose":
                        print("state: " + str(agent.state))
                    agent.Q_learning_iteration()
                    if command[1] == "verbose":
                        print("action: " + str(agent.action_to_take) +
                              " , reward: " + str(agent.reward))
                        print()
                    t.increase_time()
                    counter += 1
                    if t.month >= 2:
                        break
            else:
                print("Invalid command!")
        elif command[0] == "soil":
            print(soil)
        elif command[0] == "epsilon":
            print(policy.epsilon)
        elif command[0] == "iteration":
            if command[1] == "explore":
                print(policy.exploration_iteration)
            if command[1] == "learn":
                print(agent.learning_iteration)
        elif command[0] == "history":
            if command[1] == "explore":
                print(policy.explore_delta_reward_EMA)
            elif command[1] == "exploit":
                print(policy.exploit_delta_reward_EMA)
            elif command[1] == "reward":
                print(policy.reward_EMA)
            else:
                print("Invalid command!")
        elif command[0] == "visualize":
            if len(command) > 1:
                soil.visualizer(command[1])
            else:
                print("Invalid command!")
        elif command[0] == "loss":
            print(soil.LAYERS_WATER_LOSS)
        elif command[0] == "input":
            print(soil.input_water)
        else:
            print("Invalid Command!")
        last_raw_command = raw_command
    soil.visualizer('day')
Example #6
                or (self.team_disabled[self.teams[0]]
                    and self.get_winning_team() == self.teams[1])
                or (self.team_disabled[self.teams[1]]
                    and self.get_winning_team() == self.teams[0])
                or self.over):
            self.over = True
        else:
            # switch active team
            for t in self.teams:
                if t != self.active_team and not self.team_disabled[t]:
                    self.active_team = t
                    break

        return team, action

    def print_state(self):
        print(self.scores)
        print(self.world)


if __name__ == '__main__':
    agent1 = ai1.Agent(2)
    agent2 = ai1.Agent(2)
    game = Game(5, 2, agent1, agent2)
    while not game.over:
        game.print_state()
        input()
        print(game.turn())
    agent1.shutdown()
    agent2.shutdown()
Example #7
    def main(self):

        args = rl.util.parse_common_args()
        rl.util.try_freeze_random_seeds(args.seed, args.reproducible)

        #
        #   Environment
        #
        # The environment outputs a 3-tuple: cos(ang), sin(ang), angular-velocity.
        # We translate that to a 2-tuple: angle [-pi, pi], ang-vel [-8.0, 8.0],
        # so we can plot the 2D state space nicely.
        #
        # The environment expects a continuous 1-tuple action representing torque
        # in the range [-2.0, 2.0], but our agent outputs a categorical action 0-4,
        # so we need to translate that to torque.
        # This is because continuous actions are not implemented yet.
        def obs_trans(obs):
            """Translate from 3d obs space to 2d (for easier plotting)"""
            theta = np.arctan2(obs[1], obs[0])
            vel = obs[2]
            return np.array([theta, vel])

        def act_trans(act):
            """Translate from categorical actions to continous"""
            torques = [-2.0, -0.5, 0.0, 0.5, 2.0]
            return np.array([torques[act]])
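
        # For example, obs_trans(np.array([1.0, 0.0, 0.3])) gives array([0.0, 0.3])
        # (angle 0, angular velocity 0.3), and act_trans(4) gives array([2.0]),
        # i.e. the maximum positive torque.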

        self.env = rl.util.EnvTranslator(env=gym.make('Pendulum-v0'),
                                         observation_space=gym.spaces.Box(
                                             low=np.array([-np.pi, -8.0]),
                                             high=np.array([np.pi, 8.0])),
                                         observation_translator=obs_trans,
                                         action_space=gym.spaces.Discrete(5),
                                         action_translator=act_trans,
                                         reward_translator=None)

        self.env.seed(args.seed)

        #
        #   Agent
        #
        agent = rl.Agent(state_space=self.env.observation_space,
                         action_space=self.env.action_space,
                         discount=0.99,
                         start_learning_at=0,
                         memory=None,
                         q_fun_approx=rl.TilesApproximator(step_size=0.3,
                                                           num_tillings=16,
                                                           init_val=0),
                         policy=rl.QMaxPolicy(expl_start=False,
                                              nb_rand_steps=0,
                                              e_rand_start=0.0,
                                              e_rand_target=0.0,
                                              e_rand_decay=1 / 10000))

        #
        #   Plotting
        #
        # Need to re-think how plotting works
        if args.plot:
            fig1 = plt.figure()
            self.plotter = rl.util.Plotter(
                realtime_plotting=True,
                plot_every=1000,
                disp_len=1000,
                nb_actions=self.env.action_space.n,
                figures=(fig1, ),
                ax_qmax_wf=fig1.add_subplot(2, 4, 1, projection='3d'),
                ax_qmax_im=fig1.add_subplot(2, 4, 2),
                ax_policy=fig1.add_subplot(2, 4, 3),
                ax_trajectory=fig1.add_subplot(2, 4, 4),
                ax_stats=None,
                ax_memory=None,
                ax_q_series=None,
                ax_reward=fig1.add_subplot(2, 1, 2),
            )
            self.plotter.set_state_action_spaces(
                self.env.observation_space.low,
                self.env.observation_space.high,
                h_line=0.0,
                v_line=0.0)

        #
        #   Logging
        #
        if args.logfile is not None or args.plot:
            self.logger = rl.util.Logger()

            self.logger.agent = rl.util.Log('Agent')
            self.logger.q_val = rl.util.Log('Q_Val')
            self.logger.env = rl.util.Log('Environment')
            self.logger.hist = rl.util.Log('History', 'All states visited')
            self.logger.memory = rl.util.Log('Memory', 'Full memory dump')
            self.logger.approx = rl.util.Log('Approx', 'Approximator')
            self.logger.epsumm = rl.util.Log('Episodes')

            agent.log_episodes = self.logger.epsumm
            agent.log_hist = self.logger.hist
            agent.Q.install_logger(self.logger.q_val,
                                   log_every=1000,
                                   samples=(64, 64))

        #
        #   Callback
        #
        agent.register_callback('on_step_end', self.on_step_end)

        #
        #   Runner
        #
        try:
            rl.train_agent(env=self.env,
                           agent=agent,
                           total_steps=1000000,
                           target_avg_reward=-200)
        finally:
            if args.logfile is not None:
                self.logger.save(args.logfile)
                print('Log saved')

        if self.plotter is not None:
            plt.show()
Example #8
    def main(self):

        args = rl.util.parse_common_args()
        rl.util.try_freeze_random_seeds(args.seed, args.reproducible)

        #
        #   Environment
        #
        # .env at the end removes time limit, see:
        # https://stackoverflow.com/questions/42787924/
        # why-is-episode-done-after-200-time-steps-gym-environment-mountaincar
        self.env = gym.make('MountainCar-v0').env

        self.env.seed(args.seed)

        test_dqn = False
        if test_dqn:

            #
            #   Model
            #
            q_model = tf.keras.models.Sequential()
            q_model.add(tf.keras.layers.Dense(256, 'relu', input_dim=2))
            q_model.add(tf.keras.layers.Dense(256, 'relu'))
            q_model.add(tf.keras.layers.Dense(3, 'linear'))
            q_model.compile(loss='mse',
                            optimizer=tf.keras.optimizers.RMSprop(lr=0.00025))

            #
            #   Agent - DQN with memory
            #
            agent = rl.Agent(state_space=self.env.observation_space,
                             action_space=self.env.action_space,
                             discount=0.99,
                             start_learning_at=100000,
                             memory=rl.Memory(max_len=100000,
                                              batch_size=1024,
                                              enable_pmr=False,
                                              initial_pmr_error=1000.0),
                             q_fun_approx=rl.KerasApproximator(q_model),
                             policy=rl.QMaxPolicy(expl_start=False,
                                                  nb_rand_steps=100000,
                                                  e_rand_start=1.0,
                                                  e_rand_target=0.1,
                                                  e_rand_decay=1 / 10000))

        else:

            #
            #   Agent - tiles or aggregate
            #
            agent = rl.Agent(
                state_space=self.env.observation_space,
                action_space=self.env.action_space,
                discount=0.99,
                start_learning_at=0,
                memory=None,
                q_fun_approx=rl.TilesApproximator(step_size=0.3,
                                                  num_tillings=8,
                                                  init_val=0),
                # q_fun_approx=rl.AggregateApproximator(
                #     step_size=0.3,
                #     bins=[64, 64],
                #     init_val=0),
                policy=rl.QMaxPolicy(expl_start=False,
                                     nb_rand_steps=0,
                                     e_rand_start=1.0,
                                     e_rand_target=0.1,
                                     e_rand_decay=1 / 10000))

        #
        #   Plotting
        #
        # Need to re-think how plotting works
        if args.plot:
            fig1 = plt.figure()
            #fig2 = plt.figure()
            self.plotter = rl.util.Plotter(
                realtime_plotting=True,
                plot_every=1000,
                disp_len=1000,
                nb_actions=self.env.action_space.n,
                figures=(fig1, ),
                ax_qmax_wf=fig1.add_subplot(2, 4, 1, projection='3d'),
                ax_qmax_im=fig1.add_subplot(2, 4, 2),
                ax_policy=fig1.add_subplot(2, 4, 3),
                ax_trajectory=fig1.add_subplot(2, 4, 4),
                ax_stats=None,
                ax_memory=None,  #fig2.add_subplot(1,1,1),
                ax_q_series=None,
                ax_reward=fig1.add_subplot(2, 1, 2),
            )
            self.plotter.set_state_action_spaces(
                self.env.observation_space.low,
                self.env.observation_space.high,
                h_line=0.0,
                v_line=-0.5)

        #
        #   Logging
        #
        if args.logfile is not None or args.plot:
            self.logger = rl.util.Logger()

            self.logger.agent = rl.util.Log('Agent')
            self.logger.q_val = rl.util.Log('Q_Val')
            self.logger.env = rl.util.Log('Environment')
            self.logger.hist = rl.util.Log('History', 'All states visited')
            self.logger.memory = rl.util.Log('Memory', 'Full memory dump')
            self.logger.approx = rl.util.Log('Approx', 'Approximator')
            self.logger.epsumm = rl.util.Log('Episodes')

            agent.log_episodes = self.logger.epsumm
            agent.log_hist = self.logger.hist
            if agent.memory is not None:
                agent.memory.install_logger(self.logger.memory, log_every=1000)
            agent.Q.install_logger(self.logger.q_val,
                                   log_every=1000,
                                   samples=(64, 64))

            agent.register_callback('on_step_end', self.on_step_end)

        #
        #   Runner
        #
        try:
            rl.train_agent(env=self.env,
                           agent=agent,
                           total_steps=1000000,
                           target_avg_reward=-200)
        finally:
            if args.logfile is not None:
                self.logger.save(args.logfile)
                print('Log saved')

        if self.plotter is not None:
            plt.show()
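Both scripts register self.on_step_end, which is not shown in this snippet. Given the callback signature used in the test examples above, a minimal sketch of such a method could look like the following; the real one presumably also feeds the plotter and logger, but that wiring is not visible here.

    def on_step_end(self, agent, reward, observation, done, action):
        """Per-step hook, mirroring the callbacks used in the tests above."""
        if agent.total_step % 1000 == 0:
            print('step', agent.total_step)
        if done:
            print('episode terminated at', agent.total_step)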