Example #1
def play_w_human():
    g = Game()
    # agent = MinimaxAgent()  # or MCTSAgent()
    agent = QLearningAgent('q_values')
    turn = RED

    while True:
        g.printBoard()
        if turn == RED:
            # Human move: this branch only runs on Red's turn
            row = input("Red's turn: ")
            w = g.insert(int(row), turn)
            agent.play_opponent_move(int(row))
        else:
            move = agent.play_move()
            w = g.insert(move, turn)
        if w:
            print "WINNER: ", w
            break
        turn = YELLOW if turn == RED else RED
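Game, RED, and YELLOW come from the surrounding module (apparently a Connect-Four-style game) and are not shown; a hypothetical stand-in, just to make the turn-swapping readable:

RED, YELLOW = 'R', 'Y'  # hypothetical values; the real module defines its own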
Example #2
def main():
    agent = QLearningAgent()
    if os.path.isfile(FILE):
        # Resume from a previously saved Q-table; pickle needs binary mode
        with open(FILE, 'rb') as f:
            agent.Q = pickle.load(f)
    for i in range(NUM_GAMES):
        print(i)
        agent.train()
    with open(FILE, 'wb') as f:
        agent.save(f)
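The internals of agent.train() are not shown; as a reference point, here is a minimal sketch of the tabular Q-learning update such an agent typically performs (the function name, defaultdict table, and constants are assumptions, not the example's actual code):

from collections import defaultdict

ALPHA, GAMMA = 0.1, 0.9  # assumed learning rate and discount factor

def q_update(Q, state, action, reward, next_state, next_actions, done):
    # Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
    best_next = 0.0 if done else max(Q[(next_state, a)] for a in next_actions)
    Q[(state, action)] += ALPHA * (reward + GAMMA * best_next - Q[(state, action)])

Q = defaultdict(float)
q_update(Q, 's0', 'a0', reward=1.0, next_state='s1', next_actions=[0, 1], done=False)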
Example #3
        next_obs, reward, done, _ = env.step(action)
        total_reward += reward
        obs = next_obs
        # time.sleep(0.5)
        env.render()
        if done:
            break
    return total_reward

# Create the maze environment with gym; is_slippery=False reduces the difficulty
env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up

# Create an agent instance and pass in the hyperparameters
agent = QLearningAgent(
        obs_n=env.observation_space.n,
        act_n=env.action_space.n,
        learning_rate=0.1,
        gamma=0.9,
        e_greed=0.1)


# Train for 500 episodes, printing each episode's score
for episode in range(500):
    ep_reward, ep_steps = run_episode(env, agent, True)
    print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps, ep_reward))

# Training finished; check how the algorithm performs
test_reward = test_episode(env, agent)

# Save the Q-table
if test_reward == 1:
    agent.save()
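run_episode is called in the training loop but not shown; the fragment at the top appears to be the tail of test_episode instead. A plausible sketch of run_episode, assuming the agent exposes an epsilon-greedy sample() and a learn() update (both method names are assumptions):

def run_episode(env, agent, render=False):
    total_steps, total_reward = 0, 0
    obs = env.reset()
    while True:
        action = agent.sample(obs)  # assumed epsilon-greedy action selection
        next_obs, reward, done, _ = env.step(action)
        agent.learn(obs, action, reward, next_obs, done)  # assumed Q-table update
        obs = next_obs
        total_reward += reward
        total_steps += 1
        if render:
            env.render()
        if done:
            break
    return total_reward, total_steps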
Example #4
def main(cfg):
    pygame.init()

    # Create the font
    sysfont = pygame.font.SysFont(None, 40)
    screen = pygame.display.set_mode(WINDOW_SIZE)
    pygame.display.set_caption("Grid World")

    done = False

    clock = pygame.time.Clock()

    # Initialize the grid world
    grid_env = GridWorld()  # grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    agent = QLearningAgent(
        epsilon=cfg["agent"]["epsilon"],
        epsilon_decay_rate=cfg["agent"]["epsilon_decay_rate"],
        actions=np.arange(4),
        observation=ini_state)  # Q-learning agent

    nb_episode = cfg["nb_episode"]  # number of episodes
    save_interval = cfg["save_interval"]
    result_dir = cfg["result_dir"]
    max_step = 1
    rewards = []  # store each episode's reward for evaluation
    is_end_episode = False  # has the agent reached the goal?

    step = 0
    # time.sleep(30)

    for episode in range(nb_episode):
        print("episode:", episode)
        episode_reward = []  # cumulative reward for this episode
        step = 0
        while not is_end_episode and step < max_step:  # continue until the goal (or step limit)
            action = agent.act()  # select an action
            state, reward, is_end_episode = grid_env.step(action)
            agent.observe(state, reward)  # observe the new state and reward
            episode_reward.append(reward)

            screen.fill(BLACK)
            # Draw the grid world
            draw_grid_world(grid_env.map, screen)
            # Create a Surface with the rendered step counter
            step_str = sysfont.render("step:{}".format(step), False, WHITE)
            # Draw the text onto the screen
            screen.blit(step_str, (500, 50))
            clock.tick(1)
            step += 1

            # Redraw
            pygame.display.flip()

        rewards.append(np.sum(episode_reward))  # record this episode's total reward
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the start position
        is_end_episode = False
        print("step:", step)
        agents = [agent]

        if episode % save_interval == 0:
            save_result(agents, episode, result_dir)

    pygame.quit()
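One caveat with the render loop above: it never drains the pygame event queue, so the window can be flagged as unresponsive on some platforms. A minimal guard inside the while loop (an addition, wired to the otherwise-unused done flag) would be:

            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    done = True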
Example #5
from environments.biased_rock_paper_scissors_env import BiasedRockPaperScissorsEnv
from qlearning_agent import QLearningAgent
from session import Session
from collections import Counter

env = BiasedRockPaperScissorsEnv()
agent = QLearningAgent(alpha=0.1, gamma=0.9)
session = Session(env, agent)
logs = session.run(episodes=10000, epsilon='explore_then_exploit')
print(Counter([log['state-action pairs'][0][1] for log in logs]))

# the more randomness in the environment, the lower the alpha should be
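To see why: with gamma = 0 the tabular update Q += alpha * (r - Q) is an exponential moving average of the reward, so a larger alpha tracks recent noise instead of the long-run mean. A quick standalone illustration (hypothetical numbers, independent of the example above):

import random

random.seed(0)
for alpha in (0.5, 0.05):
    q, tail = 0.0, []
    for t in range(10000):
        r = random.choice([0, 1])  # noisy reward with true mean 0.5
        q += alpha * (r - q)       # Q-learning update with gamma = 0
        if t >= 5000:
            tail.append(q)
    # the larger alpha makes the estimate wander far more around 0.5
    print(alpha, round(max(tail) - min(tail), 2))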
Example #6
    # env.render()

    n_states = env.env.nS
    n_actions = env.env.nA

    print('Number of states = %i, number of actions = %i' % (n_states, n_actions))

    # Hyperparameters for the Q-learning, SARSA, and expected SARSA agents below
    alpha = 0.5
    epsilon = 0.9
    epsilon_threshold = 0.01
    discount = 0.99
    get_legal_actions = lambda s: range(n_actions)
    epsilon_ratio = 0.995

    ql_agent = QLearningAgent(alpha, epsilon, discount, get_legal_actions)
    sarsa_agent = SarsaAgent(alpha, epsilon, discount, get_legal_actions)
    expected_sarsa_agent = ExpectedValueSarsaAgent(alpha, epsilon, discount,
                                                   get_legal_actions)

    plt.figure(figsize=[10, 4])
    rewards_qlearning = []
    rewards_sarsa = []
    rewards_expected_sarsa = []

    # Testing loop
    n = 1
    r_qlearning = []
    r_sarsa = []
    r_expected_sarsa = []
    for _ in range(n):
Example #7
import random
import numpy as np
import matplotlib.pyplot as plt
from qlearning_agent import QLearningAgent
from policy import EpsGreedyQPolicy
from grid_world import GridWorld

if __name__ == '__main__':
    grid_env = GridWorld()  # initialize the grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    policy = EpsGreedyQPolicy(epsilon=.01)  # initialize the policy; epsilon-greedy here
    agent = QLearningAgent(actions=np.arange(4), observation=ini_state, policy=policy)  # initialize the Q-learning agent
    nb_episode = 100  # number of episodes
    rewards = []  # store each episode's reward for evaluation
    is_goal = False  # has the agent reached the goal?
    for episode in range(nb_episode):
        episode_reward = []  # cumulative reward for this episode
        while not is_goal:  # continue until the goal is reached
            action = agent.act()  # select an action
            state, reward, is_goal = grid_env.step(action)
            agent.observe(state, reward)  # observe the new state and reward
            episode_reward.append(reward)
        rewards.append(np.sum(episode_reward))  # record this episode's total reward
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the start position
        is_goal = False

    # Test (greedy actions)
    agent.training = False  # disable exploration for the greedy test
    while not is_goal:  # continue until the goal is reached
        print("(y, x):{}".format(state))
Example #8
import copy
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from qlearning_agent import QLearningAgent
from grid_world import GridWorld

if __name__ == '__main__':
    grid_env = GridWorld()  # initialize the grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    agent = QLearningAgent(epsilon=.1,
                           actions=np.arange(4),
                           observation=ini_state)  # Q-learning agent
    nb_episode = 1000  # number of episodes
    rewards = []  # store each episode's reward for evaluation
    is_end_episode = False  # has the agent reached the goal?
    for episode in range(nb_episode):
        episode_reward = []  # cumulative reward for this episode
        while not is_end_episode:  # continue until the goal is reached
            action = agent.act()  # select an action
            state, reward, is_end_episode = grid_env.step(action)
            agent.observe(state, reward)  # observe the new state and reward
            episode_reward.append(reward)
        rewards.append(np.sum(episode_reward))  # record this episode's total reward
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the start position
        is_end_episode = False

    # Plot the results
    plt.plot(np.arange(nb_episode), rewards)
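plt.plot alone draws a bare curve; to label and display the figure, standard matplotlib calls (not part of the original snippet) finish the job:

    plt.xlabel("episode")
    plt.ylabel("episode reward")
    plt.show()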
Example #9
import numpy as np

from qlearning_agent import QLearningAgent
from grid_world import GridWorld

# Constants
NB_EPISODE = 100  # number of episodes
EPSILON = .1  # exploration rate
ALPHA = .1  # learning rate
GAMMA = .90  # discount rate
ACTIONS = np.arange(4)  # set of actions

if __name__ == '__main__':
    grid_env = GridWorld()  # initialize the grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    # Initialize the agent
    agent = QLearningAgent(
        alpha=ALPHA,
        gamma=GAMMA,
        epsilon=EPSILON,  # exploration rate
        actions=ACTIONS,  # set of actions
        observation=ini_state)  # Q-learning agent
    rewards = []  # store each episode's reward for evaluation
    is_end_episode = False  # has the agent reached the goal?

    # Experiment
    for episode in range(NB_EPISODE):
        episode_reward = []  # cumulative reward for this episode
        while not is_end_episode:  # continue until the goal is reached
            action = agent.act()  # select an action
            state, reward, is_end_episode = grid_env.step(action)
            agent.observe(state, reward)  # observe the new state and reward
            episode_reward.append(reward)
        rewards.append(np.sum(episode_reward))  # record this episode's total reward
        state = grid_env.reset()  # reset the environment
Example #10
		a1 = copy.deepcopy(agent1)
		a2 = copy.deepcopy(agent2)
		if i < n_trials/2:
			winner, count_moves, trial_times = play_wo_human(a1, a2, 1)
		else:
			winner, count_moves, trial_times = play_wo_human(a1, a2, 2)
		if winner:
			results[winner].append(count_moves)
			times[a1.name].append(trial_times[a1.name])
			times[a2.name].append(trial_times[a2.name])

	total_moves = sum(results[a1.name]) + sum(results[a2.name])
	print(results)
	print("TOTAL GAMES WON BY", a1.name, ":", len(results[a1.name]))
	if len(results[a1.name]) != 0:
		print("AVERAGE NO. MOVES:", sum(results[a1.name]) / len(results[a1.name]))
	print("AVERAGE TIME PER MOVE:", sum(times[a1.name]) / total_moves)
	print("TOTAL GAMES WON BY", a2.name, ":", len(results[a2.name]))
	if len(results[a2.name]) != 0:
		print("AVERAGE NO. MOVES:", sum(results[a2.name]) / len(results[a2.name]))
	print("AVERAGE TIME PER MOVE:", sum(times[a2.name]) / total_moves)
	print("################################################################")


if __name__ == "__main__":
	# test_agents(NaiveAgent(), MCTSAgent())
	# test_agents(NaiveAgent(), QLearningAgent())
	# test_agents(MCTSAgent(), MinimaxAgent(depth=3))
	# test_agents(QLearningAgent("q_values"), MinimaxAgent(depth=3))
	test_agents(MCTSAgent(), QLearningAgent("q_values"))