    def test_10_run_keras_1(self):

        def on_step_end(agent, reward, observation, done, action):
            """Callback to print stuff to console"""
            if agent.total_step % 1000 == 0:
                print('test_10_run_keras_1', agent.total_step)
            if done:
                print('episode terminated at', agent.total_step)

        env = gym.make('MountainCar-v0').env  # .env to remove 200 step limit
        env.seed(self.seed)

        #
        #   Keras DQN model
        #
        q_model = tf.keras.models.Sequential()
        q_model.add(
            tf.keras.layers.Dense(units=256, activation='relu', input_dim=2))
        q_model.add(tf.keras.layers.Dense(units=256, activation='relu'))
        q_model.add(tf.keras.layers.Dense(units=3, activation='linear'))
        q_model.compile(loss='mse',
                        optimizer=tf.keras.optimizers.RMSprop(lr=0.00025))

        #   Configure agent
        agent = rl.Agent(state_space=env.observation_space,
                         action_space=env.action_space,
                         discount=0.99,
                         start_learning_at=22000,
                         memory=rl.Memory(max_len=10000,
                                          batch_size=64,
                                          enable_pmr=False,
                                          initial_pmr_error=1000.0),
                         q_fun_approx=rl.KerasApproximator(model=q_model),
                         policy=rl.QMaxPolicy(expl_start=False,
                                              nb_rand_steps=22000,
                                              e_rand_start=1.0,
                                              e_rand_target=0.1,
                                              e_rand_decay=1/10000))

        agent.register_callback('on_step_end', on_step_end)

        #   Main train loop
        rl.train_agent(env=env, agent=agent, total_steps=23000)

        # This is used to test for any numerical discrepancy between runs
        fp, ws, st, act, rew, done = agent.get_fingerprint()
        print('FINGERPRINT:', fp)
        print('  weight sum:', ws)
        print('  st, act, rew, done:', st, act, rew, done)

        self.assertEqual(fp, -8093.627516248174)
        self.assertEqual(ws, 3562.3154466748238)
        self.assertEqual(st, -11822.942962922998)
        self.assertEqual(act, 23165)
        self.assertEqual(rew, -22999.0)
        self.assertEqual(done, 1)
def with_agent():
    t = ArtificialTime()
    soil = Soil(t)
    policy = rl_agent.Policy(0.1, 0.01, [0, 10, 20])
    agent = rl_agent.Agent(soil, t, policy, 0.7, 0.8, [0, 1, 2])
    while t.month < 2:
        agent.Q_learning_iteration()
        '''
        if agent.learning_iteration % 100 == 0:
            print(soil)
            print(policy)
            print(policy.epsilon)
            input()
        '''
        t.increase_time()
    soil.visualizer('day')
    def test_20_run_tile_1(self):

        def on_step_end(agent, reward, observation, done, action):
            if agent.total_step % 1000 == 0:
                print('test_20_run_tile_1', agent.total_step)
            if done:
                print('episode terminated at', agent.total_step)

        env = gym.make('MountainCar-v0').env
        env.seed(self.seed)

        agent = rl.Agent(state_space=env.observation_space,
                         action_space=env.action_space,
                         discount=0.99,
                         start_learning_at=0,
                         memory=None,
                         q_fun_approx=rl.TilesApproximator(step_size=0.3,
                                                           num_tillings=8,
                                                           init_val=0),
                         policy=rl.QMaxPolicy(expl_start=False,
                                              nb_rand_steps=0,
                                              e_rand_start=1.0,
                                              e_rand_target=0.1,
                                              e_rand_decay=1/10000))

        agent.register_callback('on_step_end', on_step_end)

        rl.train_agent(env=env, agent=agent, total_steps=5000)

        # This is used to test for any numerical discrepancy between runs
        fp, ws, st, act, rew, done = agent.get_fingerprint()
        print('FINGERPRINT:', fp)
        print('  weight sum:', ws)
        print('  st, act, rew, done:', st, act, rew, done)

        self.assertEqual(fp, -3667.665666738285)
        self.assertEqual(ws, -1297.1708778794816)
        self.assertEqual(st, -2430.494788858803)
        self.assertEqual(act, 5058)
        self.assertEqual(rew, -4999.0)
        self.assertEqual(done, 1)
    def test_30_run_aggregate_1(self):

        def on_step_end(agent, reward, observation, done, action):
            if agent.total_step % 1000 == 0:
                print('test_30_run_aggregate_1', agent.total_step)
            if done:
                print('episode terminated at', agent.total_step)

        env = gym.make('MountainCar-v0').env
        env.seed(self.seed)

        agent = rl.Agent(state_space=env.observation_space,
                         action_space=env.action_space,
                         discount=0.99,
                         start_learning_at=0,
                         memory=None,
                         q_fun_approx=rl.AggregateApproximator(step_size=0.3,
                                                               bins=[64, 64],
                                                               init_val=0),
                         policy=rl.QMaxPolicy(expl_start=False,
                                              nb_rand_steps=0,
                                              e_rand_start=0.1,
                                              e_rand_target=0.1,
                                              e_rand_decay=1/10000))

        agent.register_callback('on_step_end', on_step_end)

        rl.train_agent(env=env, agent=agent, total_steps=30000)

        # This is used to test for any numerical discrepancy between runs
        fp, ws, st, act, rew, done = agent.get_fingerprint()
        print('FINGERPRINT:', fp)
        print('  weight sum:', ws)
        print('  st, act, rew, done:', st, act, rew, done)

        self.assertEqual(fp, -24059.666698709698)
        self.assertEqual(ws, -8850.374069905585)
        self.assertEqual(st, -15178.292628804113)
        self.assertEqual(act, 29967)
        self.assertEqual(rew, -29999.0)
        self.assertEqual(done, 1)
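# Illustrative sketch only: how a 64x64 state aggregation over the MountainCar
# observation space *could* be computed. This is not rl.AggregateApproximator's
# actual code, just the discretization idea the bins=[64, 64] test exercises.
import numpy as np

low = np.array([-1.2, -0.07])     # MountainCar position / velocity bounds
high = np.array([0.6, 0.07])
bins = np.array([64, 64])

def bin_index(state):
    """Map a continuous state to its (row, col) bin in a 64x64 grid."""
    frac = (np.asarray(state, dtype=float) - low) / (high - low)
    idx = np.floor(frac * bins).astype(int)
    return tuple(np.clip(idx, 0, bins - 1))

print(bin_index([-0.5, 0.0]))     # -> (24, 32)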
def interaction():
    t = ArtificialTime()
    soil = Soil(t)
    policy = rl_agent.Policy(0.1, 0.05, [0, 20])
    agent = rl_agent.Agent(soil, t, policy, 0.1, 0.2, [0, 1, 2])

    last_raw_command = ""
    while t.month < 2:
        raw_command = input()
        if raw_command == "*":
            raw_command = last_raw_command
        command = raw_command.split()

        if command[0] == "state":
            m = float(command[1])
            for s in policy.state_action_values.keys():
                if s.moisture == m:
                    print(str(s) + " :")
                    for action, value in policy.state_action_values[s].items():
                        print("\tintensity " + str(action) + " :" + str(value))
        elif command[0] == "proceed":
            counter = 0
            if len(command) == 3:
                while counter < int(command[2]):
                    if command[1] == "verbose":
                        print("state: " + str(agent.state))
                    agent.Q_learning_iteration()
                    if command[1] == "verbose":
                        print("action: " + str(agent.action_to_take) +
                              " , reward: " + str(agent.reward))
                        print()
                    t.increase_time()
                    counter += 1
                    if t.month >= 2:
                        break
            else:
                print("Invalid command!")
        elif command[0] == "soil":
            print(soil)
        elif command[0] == "epsilon":
            print(policy.epsilon)
        elif command[0] == "iteration":
            if command[1] == "explore":
                print(policy.exploration_iteration)
            if command[1] == "learn":
                print(agent.learning_iteration)
        elif command[0] == "history":
            if command[1] == "explore":
                print(policy.explore_delta_reward_EMA)
            elif command[1] == "exploit":
                print(policy.exploit_delta_reward_EMA)
            elif command[1] == "reward":
                print(policy.reward_EMA)
            else:
                print("Invalid command!")
        elif command[0] == "visualize":
            if len(command) > 1:
                soil.visualizer(command[1])
            else:
                print("Invalid command!")
        elif command[0] == "loss":
            print(soil.LAYERS_WATER_LOSS)
        elif command[0] == "input":
            print(soil.input_water)
        else:
            print("Invalid Command!")

        last_raw_command = raw_command

    soil.visualizer('day')
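# Hypothetical entry point (not part of the original file): pick either the
# scripted run or the interactive console defined above, following the
# __main__ convention used elsewhere in this repo.
if __name__ == '__main__':
    interaction()   # or with_agent() for the non-interactive training run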
                or (self.team_disabled[self.teams[0]]
                    and self.get_winning_team() == self.teams[1])
                or (self.team_disabled[self.teams[1]]
                    and self.get_winning_team() == self.teams[0])
                or self.over):
            self.over = True
        else:
            # switch active team
            for t in self.teams:
                if t != self.active_team and not self.team_disabled[t]:
                    self.active_team = t
                    break

        return team, action

    def print_state(self):
        print(self.scores)
        print(self.world)


if __name__ == '__main__':
    agent1 = ai1.Agent(2)
    agent2 = ai1.Agent(2)
    game = Game(5, 2, agent1, agent2)
    while not game.over:
        game.print_state()
        input()
        print(game.turn())
    agent1.shutdown()
    agent2.shutdown()
    def main(self):
        args = rl.util.parse_common_args()
        rl.util.try_freeze_random_seeds(args.seed, args.reproducible)

        #
        #   Environment
        #
        #   The environment outputs a 3-tuple: cos(ang), sin(ang), angular-velocity.
        #   We translate that to a 2-tuple: angle [-pi, pi], ang-vel [-8.0, 8.0],
        #   so we can plot the 2D state space nicely.
        #
        #   The environment expects a continuous 1-tuple action representing torque
        #   in the range [-2.0, 2.0], but our agent outputs a categorical action 0-4,
        #   so we need to translate that to torque. This is because continuous
        #   actions are not implemented yet.
        def obs_trans(obs):
            """Translate from 3d obs space to 2d (for easier plotting)"""
            theta = np.arctan2(obs[1], obs[0])
            vel = obs[2]
            return np.array([theta, vel])

        def act_trans(act):
            """Translate from categorical actions to continuous"""
            torques = [-2.0, -0.5, 0.0, 0.5, 2.0]
            return np.array([torques[act]])

        self.env = rl.util.EnvTranslator(env=gym.make('Pendulum-v0'),
                                         observation_space=gym.spaces.Box(
                                             low=np.array([-np.pi, -8.0]),
                                             high=np.array([np.pi, 8.0])),
                                         observation_translator=obs_trans,
                                         action_space=gym.spaces.Discrete(5),
                                         action_translator=act_trans,
                                         reward_translator=None)
        self.env.seed(args.seed)

        #
        #   Agent
        #
        agent = rl.Agent(state_space=self.env.observation_space,
                         action_space=self.env.action_space,
                         discount=0.99,
                         start_learning_at=0,
                         memory=None,
                         q_fun_approx=rl.TilesApproximator(step_size=0.3,
                                                           num_tillings=16,
                                                           init_val=0),
                         policy=rl.QMaxPolicy(expl_start=False,
                                              nb_rand_steps=0,
                                              e_rand_start=0.0,
                                              e_rand_target=0.0,
                                              e_rand_decay=1/10000))

        #
        #   Plotting
        #
        # Need to re-think how plotting works
        if args.plot:
            fig1 = plt.figure()
            self.plotter = rl.util.Plotter(
                realtime_plotting=True,
                plot_every=1000,
                disp_len=1000,
                nb_actions=self.env.action_space.n,
                figures=(fig1, ),
                ax_qmax_wf=fig1.add_subplot(2, 4, 1, projection='3d'),
                ax_qmax_im=fig1.add_subplot(2, 4, 2),
                ax_policy=fig1.add_subplot(2, 4, 3),
                ax_trajectory=fig1.add_subplot(2, 4, 4),
                ax_stats=None,
                ax_memory=None,
                ax_q_series=None,
                ax_reward=fig1.add_subplot(2, 1, 2),
            )
            self.plotter.set_state_action_spaces(self.env.observation_space.low,
                                                 self.env.observation_space.high,
                                                 h_line=0.0,
                                                 v_line=0.0)

        #
        #   Logging
        #
        if args.logfile is not None or args.plot:
            self.logger = rl.util.Logger()

            self.logger.agent = rl.util.Log('Agent')
            self.logger.q_val = rl.util.Log('Q_Val')
            self.logger.env = rl.util.Log('Environment')
            self.logger.hist = rl.util.Log('History', 'All states visited')
            self.logger.memory = rl.util.Log('Memory', 'Full memory dump')
            self.logger.approx = rl.util.Log('Approx', 'Approximator')
            self.logger.epsumm = rl.util.Log('Episodes')

            agent.log_episodes = self.logger.epsumm
            agent.log_hist = self.logger.hist
            agent.Q.install_logger(self.logger.q_val,
                                   log_every=1000,
                                   samples=(64, 64))

        #
        #   Callback
        #
        agent.register_callback('on_step_end', self.on_step_end)

        #
        #   Runner
        #
        try:
            rl.train_agent(env=self.env,
                           agent=agent,
                           total_steps=1000000,
                           target_avg_reward=-200)
        finally:
            if args.logfile is not None:
                self.logger.save(args.logfile)
                print('Log saved')

            if self.plotter is not None:
                plt.show()
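# Standalone sanity check for the observation/action translators above. A
# minimal sketch: the two functions are copied out of main() so the snippet
# runs on its own; the test values are illustrative.
import numpy as np

def obs_trans(obs):
    """Translate from 3d obs space to 2d (for easier plotting)"""
    theta = np.arctan2(obs[1], obs[0])
    vel = obs[2]
    return np.array([theta, vel])

def act_trans(act):
    """Translate from categorical actions to continuous"""
    torques = [-2.0, -0.5, 0.0, 0.5, 2.0]
    return np.array([torques[act]])

obs = np.array([np.cos(0.5), np.sin(0.5), 1.0])   # cos(theta), sin(theta), ang-vel
print(obs_trans(obs))   # -> approximately [0.5, 1.0]
print(act_trans(3))     # -> [0.5]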
    def main(self):
        args = rl.util.parse_common_args()
        rl.util.try_freeze_random_seeds(args.seed, args.reproducible)

        #
        #   Environment
        #
        #   .env at the end removes the time limit, see:
        #   https://stackoverflow.com/questions/42787924/
        #   why-is-episode-done-after-200-time-steps-gym-environment-mountaincar
        self.env = gym.make('MountainCar-v0').env
        self.env.seed(args.seed)

        test_dqn = False
        if test_dqn:
            #
            #   Model
            #
            q_model = tf.keras.models.Sequential()
            q_model.add(tf.keras.layers.Dense(256, 'relu', input_dim=2))
            q_model.add(tf.keras.layers.Dense(256, 'relu'))
            q_model.add(tf.keras.layers.Dense(3, 'linear'))
            q_model.compile(loss='mse',
                            optimizer=tf.keras.optimizers.RMSprop(lr=0.00025))

            #
            #   Agent - DQN with memory
            #
            agent = rl.Agent(state_space=self.env.observation_space,
                             action_space=self.env.action_space,
                             discount=0.99,
                             start_learning_at=100000,
                             memory=rl.Memory(max_len=100000,
                                              batch_size=1024,
                                              enable_pmr=False,
                                              initial_pmr_error=1000.0),
                             q_fun_approx=rl.KerasApproximator(q_model),
                             policy=rl.QMaxPolicy(expl_start=False,
                                                  nb_rand_steps=100000,
                                                  e_rand_start=1.0,
                                                  e_rand_target=0.1,
                                                  e_rand_decay=1/10000))
        else:
            #
            #   Agent - tiles or aggregate
            #
            agent = rl.Agent(
                state_space=self.env.observation_space,
                action_space=self.env.action_space,
                discount=0.99,
                start_learning_at=0,
                memory=None,
                q_fun_approx=rl.TilesApproximator(step_size=0.3,
                                                  num_tillings=8,
                                                  init_val=0),
                # q_fun_approx=rl.AggregateApproximator(
                #     step_size=0.3,
                #     bins=[64, 64],
                #     init_val=0),
                policy=rl.QMaxPolicy(expl_start=False,
                                     nb_rand_steps=0,
                                     e_rand_start=1.0,
                                     e_rand_target=0.1,
                                     e_rand_decay=1/10000))

        #
        #   Plotting
        #
        # Need to re-think how plotting works
        if args.plot:
            fig1 = plt.figure()
            # fig2 = plt.figure()
            self.plotter = rl.util.Plotter(
                realtime_plotting=True,
                plot_every=1000,
                disp_len=1000,
                nb_actions=self.env.action_space.n,
                figures=(fig1, ),
                ax_qmax_wf=fig1.add_subplot(2, 4, 1, projection='3d'),
                ax_qmax_im=fig1.add_subplot(2, 4, 2),
                ax_policy=fig1.add_subplot(2, 4, 3),
                ax_trajectory=fig1.add_subplot(2, 4, 4),
                ax_stats=None,
                ax_memory=None,  # fig2.add_subplot(1, 1, 1),
                ax_q_series=None,
                ax_reward=fig1.add_subplot(2, 1, 2),
            )
            self.plotter.set_state_action_spaces(self.env.observation_space.low,
                                                 self.env.observation_space.high,
                                                 h_line=0.0,
                                                 v_line=-0.5)

        #
        #   Logging
        #
        if args.logfile is not None or args.plot:
            self.logger = rl.util.Logger()

            self.logger.agent = rl.util.Log('Agent')
            self.logger.q_val = rl.util.Log('Q_Val')
            self.logger.env = rl.util.Log('Environment')
            self.logger.hist = rl.util.Log('History', 'All states visited')
            self.logger.memory = rl.util.Log('Memory', 'Full memory dump')
            self.logger.approx = rl.util.Log('Approx', 'Approximator')
            self.logger.epsumm = rl.util.Log('Episodes')

            agent.log_episodes = self.logger.epsumm
            agent.log_hist = self.logger.hist
            if agent.memory is not None:
                agent.memory.install_logger(self.logger.memory, log_every=1000)
            agent.Q.install_logger(self.logger.q_val,
                                   log_every=1000,
                                   samples=(64, 64))

        agent.register_callback('on_step_end', self.on_step_end)

        #
        #   Runner
        #
        try:
            rl.train_agent(env=self.env,
                           agent=agent,
                           total_steps=1000000,
                           target_avg_reward=-200)
        finally:
            if args.logfile is not None:
                self.logger.save(args.logfile)
                print('Log saved')

            if self.plotter is not None:
                plt.show()