def random_test(self, iteration, graph_data, template_env: DynamicETC):
    # Build one random tolling agent per road (edge) in the graph.
    agents = []
    for road in graph_data['edges']:
        agent = RandomAgent(road['source'], road['target'],
                            const.ACTION_SPACE, const.MAX_TIME_STEP)
        agents.append(agent)
    env = template_env.clone()
    total_average_reward = 0.0
    for i in range(iteration):
        state = env.reset()
        # Each agent pre-samples a full toll sequence for the episode.
        action_seqs = []
        for agent in agents:
            action_seqs.append(agent.act())
        cumulative_reward = 0
        for step in range(const.MAX_TIME_STEP):
            actions = []
            for action_seq in action_seqs:
                actions.append({
                    'source': action_seq.get_id()['source'],
                    'target': action_seq.get_id()['target'],
                    'toll': action_seq.get_action(step)
                })
            state, reward, terminal, info = env.step(actions)
            cumulative_reward += reward
        average_reward = cumulative_reward / const.MAX_TIME_STEP
        logging.info("average cumulative reward is {}".format(average_reward))
        # Incremental (running) mean of the per-episode average reward.
        total_average_reward = (total_average_reward * (i / (i + 1))
                                + average_reward / (i + 1))
    logging.info("total average reward is {}".format(total_average_reward))
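# Illustrative usage sketch for random_test (not part of the original module).
# The owning object, the DynamicETC constructor signature, and the graph_data
# schema beyond the 'edges'/'source'/'target' keys used above are assumptions.
def _random_test_example(tester):
    # 'tester' is whatever object exposes random_test; constructed by the caller.
    sample_graph = {'edges': [{'source': 0, 'target': 1},
                              {'source': 1, 'target': 2}]}
    template_env = DynamicETC(sample_graph)  # assumed constructor signature
    tester.random_test(iteration=10, graph_data=sample_graph,
                       template_env=template_env)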
def _get_agent(agent_type, player):
    if agent_type == 'Player':
        return Player()
    if agent_type == 'RandomPlayer':
        return RandomAgent()
    if agent_type == 'QLearn':
        return QLearn(player,
                      source_name='models/min_max_5_10K_p1_20191111_202358.pkl')
    if agent_type == 'MinMax':
        return MinMaxAgent(player, 4, AdvancedScore.score)
    if agent_type == 'AlphaBeta':
        return MinMaxAgentWAlphaBeta(player, 6, AdvancedScore.score)
    if agent_type == 'MonteCarlo':
        return MonteCarlo(player, 2000)
    if agent_type == 'AlphaZero':
        from alpha_zero.alpha_net import AlphaNet
        alpha_net = AlphaNet('test_net_1_1_2_3')
        return AlphaZero(player, net=alpha_net, mcts_turns=300,
                         print_policy=True)
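# Illustrative dispatch sketch for _get_agent (not part of the original module).
# Only the factory calls rely on code shown above; how the two agents are fed
# into a match loop elsewhere in the repo is an assumption.
def _example_match_agents():
    p1 = _get_agent('AlphaBeta', player=1)
    p2 = _get_agent('MonteCarlo', player=2)
    return p1, p2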
lives = 3
while True:
    action = agent.predict_action(obs)
    obs, reward, done, info = env.step(action)
    env.render()
    time.sleep(sleep)
    # NOTE: the environment is not reset between lives here; the agent keeps
    # stepping from the terminal observation until all lives are used up.
    if done:
        lives -= 1
    if done and lives == 0:
        break
env.close()


# Maps a CLI key to a callable that replays the corresponding trained agent
# with the given sleep interval and episode count.
agent_runner = {
    'rand': lambda s, e: run_trained(RandomAgent(), s, e),
    'dqn': lambda s, e: run_trained(DQNAgent(), s, e),
    'a2c': lambda s, e: run_trained(A2CAgent(), s, e),
    'ppo': lambda s, e: run_trained(PPOAgent(), s, e),
    'dqn_custom': lambda s, e: run_custom_trained(
        model_path=os.path.join(MODELS_ROOT, "custom_ep_63.pth"),
        sleep=s, episodes=e),
    'dqn_forgetting': lambda s, e: run_custom_trained(
        model_path=os.path.join(MODELS_ROOT, "custom_ep_89_forgetting.pth"),
        sleep=s, episodes=e),
}
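# Illustrative usage sketch (not part of the original module): driving the
# agent_runner table from a selection key. The key, sleep, and episode values
# below are arbitrary example assumptions.
def _run_selected_agent(agent_key='dqn', sleep=0.02, episodes=5):
    # Look up the requested runner and execute it; an unknown key raises
    # KeyError, which mirrors a simple CLI dispatch.
    agent_runner[agent_key](sleep, episodes)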
def greedy_chaser(env_name, episode_count, load_path, display=True, fps=10):
    """Show the result of a trained chaser chasing a random runner agent."""
    env = gym.make(env_name)
    chaser = GreedyAgent(default_reward=-1.0,
                         name='chaser',
                         color=(1.0, 0.0, 0.0),
                         env=env,
                         agent_type=AgentType.Chaser,
                         load_path=load_path,
                         features_n=4)
    runner = RandomAgent(default_reward=1.0,
                         name='runner',
                         color=(0.0, 1.0, 0.0),
                         env=env,
                         agent_type=AgentType.Runner)
    env.add_agent(chaser)
    env.add_agent(runner)
    for epi in range(episode_count):
        state_map = env.reset()
        chaser_info = state_map[chaser.name]
        runner_info = state_map[runner.name]
        chaser_x = chaser_info['state'][0]
        chaser_y = chaser_info['state'][1]
        runner_x = runner_info['state'][0]
        runner_y = runner_info['state'][1]
        chaser_state = [chaser_x, chaser_y, runner_x, runner_y]
        runner_state = [runner_x, runner_y, chaser_x, chaser_y]
        step = 0
        while True:
            if display:
                env.render()
                time.sleep(1 / fps)
            action = chaser.act(chaser_state)
            chaser_poi, direction, _, done, _ = env.step(action, chaser.name)
            chaser_x, chaser_y = chaser_poi[0], chaser_poi[1]
            chaser_state = [chaser_x, chaser_y, runner_x, runner_y]
            step += 1
            if done:
                print('Episode: %d\tsteps: %d' % (epi + 1, step + 1))
                break
            else:
                runner_action = runner.act(runner_state)
                runner_poi, runner_dir, _, done, _ = env.step(
                    runner_action, runner.name)
                if display:
                    env.render()
                    time.sleep(1 / fps)
                runner_x, runner_y = runner_poi[0], runner_poi[1]
                runner_state = [runner_x, runner_y, chaser_x, chaser_y]
                step += 1
                if done:
                    print('Episode: %d\tsteps: %d' % (epi + 1, step + 1))
                    break
def chaser_dqn(env_name,
               episode_count=1000,
               display=True,
               fps=10,
               need_reload=False,
               reload_path=None):
    """Use DQN to train the chaser to chase a random runner agent."""
    env = gym.make(env_name)
    # Epsilon-greedy agent backed by a Q-network for its policy
    chaser = EGreedyAgent(default_reward=-1.0,
                          name='chaser',
                          color=(1.0, 0.0, 0.0),
                          env=env,
                          agent_type=AgentType.Chaser,
                          features_n=4,
                          memory_capacity=1024,
                          need_reload=need_reload,
                          reload_path=reload_path)
    # Runner picks its actions uniformly at random
    runner = RandomAgent(default_reward=1.0,
                         name='runner',
                         color=(0.0, 1.0, 0.0),
                         env=env,
                         agent_type=AgentType.Runner)
    env.add_agent(chaser)
    env.add_agent(runner)
    reward = 0
    done = False
    # Most recent optimization loss (0.0 until the first update)
    loss = 0.0
    # Total steps accumulated across all episodes
    total_steps = 0
    for epi in range(episode_count):
        state_map = env.reset()
        chaser_info = state_map[chaser.name]
        runner_info = state_map[runner.name]
        chaser_x = chaser_info['state'][0]
        chaser_y = chaser_info['state'][1]
        runner_x = runner_info['state'][0]
        runner_y = runner_info['state'][1]
        # Each state holds both agents' coordinates
        chaser_state = [chaser_x, chaser_y, runner_x, runner_y]
        runner_state = [runner_x, runner_y, chaser_x, chaser_y]
        # Current episode's step count
        step = 0
        while True:
            if display:
                env.render()
                time.sleep(1 / fps)
            action = chaser.act(chaser_state)
            chaser_poi, direction, reward, done, _ = env.step(
                action, chaser.name)
            chaser_x, chaser_y = chaser_poi[0], chaser_poi[1]
            chaser_state_ = [chaser_x, chaser_y, runner_x, runner_y]
            # Store this transition in replay memory
            chaser.memory.push(chaser_state, action, chaser_state_, reward)
            # Update current state
            chaser_state = chaser_state_
            if total_steps % 10 == 0:
                # Optimize the model every 10 steps
                loss = chaser.optimize_model()
            step += 1
            total_steps += 1
            if done:
                print('Episode: %d\tsteps: %d\tLoss: %f' %
                      (epi + 1, step + 1, loss))
                break
            else:
                runner_action = runner.act(runner_state)
                runner_poi, runner_dir, _, done, _ = env.step(
                    runner_action, runner.name)
                if display:
                    env.render()
                    time.sleep(1 / fps)
                runner_x, runner_y = runner_poi[0], runner_poi[1]
                runner_state = [runner_x, runner_y, chaser_x, chaser_y]
                step += 1
                if done:
                    # (disabled) push a positive-reward transition when the
                    # runner is caught:
                    # a = get_reverse_action(runner_action)
                    # s = chaser_state
                    # r = 10
                    # s_ = [chaser_x, chaser_y, runner_x, runner_y]
                    # chaser.memory.push(s, a, s_, r)
                    print('Episode: %d\tsteps: %d\tLoss: %f' %
                          (epi + 1, step + 1, loss))
                    break
            # Periodically update the target network, copying all weights and
            # biases from the policy network, and checkpoint the agent
            if total_steps % 500 == 0:
                print('Target net params Replaced!')
                chaser.target_net.load_state_dict(
                    chaser.policy_net.state_dict())
                chaser.save('chaser-2000.pkl')
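# Illustrative train-then-watch sketch (not part of the original module).
# The environment id 'chase-v0' is an assumption; the checkpoint name matches
# the one saved by chaser_dqn above, but its on-disk location may differ.
def _dqn_pipeline_example():
    # Train the DQN chaser headlessly, then replay the saved policy greedily.
    chaser_dqn('chase-v0', episode_count=2000, display=False)
    greedy_chaser('chase-v0', episode_count=10,
                  load_path='chaser-2000.pkl', display=True, fps=10)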
def chaser_qt_greedy(env_name, restore_path, episode_count=1000,
                     display=True, fps=10):
    env = gym.make(env_name)
    chaser = QTGreedyAgent(default_reward=-1.0,
                           name='chaser',
                           color=(1.0, 0.0, 0.0),
                           env=env,
                           agent_type=AgentType.Chaser,
                           restore_path=restore_path,
                           n_width=env.n_width,
                           n_height=env.n_height)
    runner = RandomAgent(default_reward=1.0,
                         name='runner',
                         color=(0.0, 1.0, 0.0),
                         env=env,
                         agent_type=AgentType.Runner)
    env.add_agent(chaser)
    env.add_agent(runner)
    for epi in range(episode_count):
        state_map = env.reset()
        chaser_info = state_map[chaser.name]
        runner_info = state_map[runner.name]
        chaser_x = chaser_info['state'][0]
        chaser_y = chaser_info['state'][1]
        runner_x = runner_info['state'][0]
        runner_y = runner_info['state'][1]
        chaser_state = [chaser_x, chaser_y, runner_x, runner_y]
        runner_state = [runner_x, runner_y, chaser_x, chaser_y]
        step = 0
        while True:
            if display:
                env.render()
                time.sleep(1 / fps)
            action = chaser.act(chaser_state)
            chaser_poi, direction, _, done, _ = env.step(action, chaser.name)
            chaser_x, chaser_y = chaser_poi[0], chaser_poi[1]
            chaser_state = [chaser_x, chaser_y, runner_x, runner_y]
            step += 1
            if done:
                print('Episode: %d\tsteps: %d' % (epi + 1, step + 1))
                break
            else:
                runner_action = runner.act(runner_state)
                runner_poi, runner_dir, _, done, _ = env.step(
                    runner_action, runner.name)
                if display:
                    env.render()
                    time.sleep(1 / fps)
                runner_x, runner_y = runner_poi[0], runner_poi[1]
                runner_state = [runner_x, runner_y, chaser_x, chaser_y]
                step += 1
                if done:
                    print('Episode: %d\tsteps: %d' % (epi + 1, step + 1))
                    break
def chaser_q_learning(env_name,
                      episode_count=1000,
                      display=True,
                      fps=10,
                      need_reload=True,
                      reload_path=None):
    env = gym.make(env_name)
    chaser = QTEGreedyAgent(default_reward=-0.1,
                            name='chaser',
                            color=(1.0, 0.0, 0.0),
                            env=env,
                            agent_type=AgentType.Chaser,
                            n_width=env.n_width,
                            n_height=env.n_height,
                            need_reload=need_reload,
                            reload_path=reload_path)
    runner = RandomAgent(default_reward=1.0,
                         name='runner',
                         color=(0.0, 1.0, 0.0),
                         env=env,
                         agent_type=AgentType.Runner)
    env.add_agent(chaser)
    env.add_agent(runner)
    # Total steps accumulated across all episodes
    total_steps = 0
    for epi in range(episode_count):
        state_map = env.reset()
        chaser_info = state_map[chaser.name]
        runner_info = state_map[runner.name]
        chaser_x = chaser_info['state'][0]
        chaser_y = chaser_info['state'][1]
        runner_x = runner_info['state'][0]
        runner_y = runner_info['state'][1]
        # Each state holds both agents' coordinates
        chaser_state = [chaser_x, chaser_y, runner_x, runner_y]
        runner_state = [runner_x, runner_y, chaser_x, chaser_y]
        # Current episode's step count
        step = 0
        while True:
            if display:
                env.render()
                time.sleep(1 / fps)
            action = chaser.act(chaser_state)
            chaser_poi, direction, reward, done, _ = env.step(
                action, chaser.name)
            chaser_x, chaser_y = chaser_poi[0], chaser_poi[1]
            chaser_state_ = [chaser_x, chaser_y, runner_x, runner_y]
            # Tabular Q-learning update for this transition
            chaser.update(chaser_state, action, reward, chaser_state_)
            # Update current state
            chaser_state = chaser_state_
            step += 1
            total_steps += 1
            if done:
                print('Episode: %d\tsteps: %d' % (epi + 1, step + 1))
                break
            else:
                runner_action = runner.act(runner_state)
                runner_poi, runner_dir, _, done, _ = env.step(
                    runner_action, runner.name)
                if display:
                    env.render()
                    time.sleep(1 / fps)
                runner_x, runner_y = runner_poi[0], runner_poi[1]
                runner_state = [runner_x, runner_y, chaser_x, chaser_y]
                step += 1
                if done:
                    # (disabled) push a positive-reward transition when the
                    # runner is caught:
                    # a = get_reverse_action(runner_action)
                    # s = chaser_state
                    # r = 10
                    # s_ = [chaser_x, chaser_y, runner_x, runner_y]
                    # chaser.memory.push(s, a, s_, r)
                    print('Episode: %d\tsteps: %d' % (epi + 1, step + 1))
                    break
            # Periodically checkpoint the Q-table
            if total_steps % 500 == 0:
                chaser.save()
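# Illustrative tabular pipeline sketch (not part of the original module).
# The environment id 'chase-v0' and the Q-table path 'q_table.pkl' are
# assumptions; chaser.save() above does not expose the actual file name here.
def _q_learning_pipeline_example():
    # Learn a Q-table from scratch, then replay it greedily via QTGreedyAgent.
    chaser_q_learning('chase-v0', episode_count=1000,
                      display=False, need_reload=False)
    chaser_qt_greedy('chase-v0', restore_path='q_table.pkl',
                     episode_count=10, display=True, fps=10)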