def monte_carlo_demo():
    np.random.seed(101)
    env = SnakeEnv(10, [3, 6])
    agent = ModelFreeAgent(env)
    mc = MonteCarlo(0.5)
    with timer('Timer Monte Carlo Iter'):
        mc.monte_carlo_opt(agent, env)
    print('MonteCarlo:return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)

    np.random.seed(101)
    env = SnakeEnv(10, [3, 6])
    agent2 = TableAgent(env)
    pi_algo = PolicyIteration()
    with timer('Timer PolicyIter'):
        pi_algo.policy_iteration(agent2)
    print('PolicyIteration:return_pi={}'.format(eval_game(env, agent2)))
    print(agent2.pi)

    np.random.seed(101)
    env = SnakeEnv(10, [3, 6])
    agent3 = ModelFreeAgent(env)
    ql = QLearning(0.5)
    with timer('Timer Q-Learning Iter'):
        ql.q_learning(agent3, env)
    print('QLearning:return_pi={}'.format(eval_game(env, agent3)))
    print(agent3.pi)
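# The demos above wrap each solver call in a `timer` context manager whose
# definition is not part of this section. A minimal sketch of what such a
# helper could look like (an assumption, not the original implementation):
import time
from contextlib import contextmanager

@contextmanager
def timer(name):
    # Hypothetical helper: reports how long the wrapped block took.
    start = time.time()
    try:
        yield
    finally:
        print('{} COST:{}'.format(name, time.time() - start))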
def policy_iteration_demo1():
    env = SnakeEnv(0, [3, 6])
    agent = TableAgent(env)
    pi_algo = PolicyIteration()
    pi_algo.policy_iteration(agent)
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)
def policy_iteration_demo1():
    env = SnakeEnv(0, [3, 6])  # 0 means the ladders are ignored
    agent = TableAgent(env)  # table-based agent
    pi_algo = PolicyIteration()  # policy iteration solver
    pi_algo.policy_iteration(agent)  # compute the updated state-value function and policy
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)
def generalized_iteration_demo():
    np.random.seed(0)
    env = SnakeEnv(10, [3, 6])
    agent = TableAgent(env)
    pi_algo = GeneralizedPolicyIteration()
    with timer('Timer GeneralizedIter'):
        pi_algo.generalized_policy_iteration(agent)
    print('return_pi={}'.format(eval_game(env, agent)))
def monte_carlo_demo2():
    env = SnakeEnv(10, [3, 6])
    agent = ModelFreeAgent(env)
    mc = MonteCarlo(0.5)
    with timer('Timer Monte Carlo Iter'):
        mc.monte_carlo_opt(agent, env)
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)
def value_iteration_demo():
    np.random.seed(0)
    env = SnakeEnv(10, [3, 6])
    agent = TableAgent(env)
    vi_algo = ValueIteration()
    vi_algo.value_iteration(agent)
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)
def value_iteration_demo():
    np.random.seed(0)
    env = SnakeEnv(10, [3, 6])
    agent = TableAgent(env)
    vi_algo = ValueIteration()
    with timer('Timer ValueIter'):
        vi_algo.value_iteration(agent)
    print('return_pi={}'.format(eval_game(env, agent)))
def policy_iteration_demo():
    np.random.seed(0)
    env = SnakeEnv(10, [3, 6])
    agent = TableAgent(env)
    pi_algo = PolicyIterationWithTimer()
    pi_algo.policy_iteration(agent)
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)
def main():
    times = []
    env = SnakeEnv()
    for i in range(100):
        st = time.time()
        done = False
        env.reset()
        score = 0
        food = 0
        while not done:
            info = {"Food": (food, (10, 30))}
            state, reward, done = env.step(get_input(), info=info)
            score += reward
            if reward == settings.FOOD_REWARD:
                food += 1
            env.render(sleep=False)
            # print(reward)
            if done:
                et = time.time()
                times.append(et - st)
                # quit()
                break
    print(1 / (mean(times)), end=" games per second\n")
    print(1 / (max(times)), end=" slowest games per second\n")
    print(1 / (min(times)), end=" fastest games per second\n")
def test_easy():
    np.random.seed(0)
    sum_opt = 0
    sum_0 = 0
    sum_1 = 0
    env = SnakeEnv(0, [3, 6])
    for i in range(10000):
        sum_opt += eval_game(env, policy_ref)
        sum_0 += eval_game(env, policy_0)
        sum_1 += eval_game(env, policy_1)
    print('opt avg={}'.format(sum_opt / 10000.0))
    print('0 avg={}'.format(sum_0 / 10000.0))
    print('1 avg={}'.format(sum_1 / 10000.0))
def policy_iteration_demo2():
    env = SnakeEnv(10, [3, 6])
    agent = TableAgent(env)

    agent.pi[:] = 0
    print('return3={}'.format(eval_game(env, agent)))
    agent.pi[:] = 1
    print('return6={}'.format(eval_game(env, agent)))
    agent.pi[97:100] = 0
    print('return_ensemble={}'.format(eval_game(env, agent)))

    pi_algo = PolicyIteration()
    pi_algo.policy_iteration(agent)
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)
def first_easy():
    sum_opt = 0
    sum_0 = 0
    sum_1 = 0
    env = SnakeEnv(0, [3, 6])
    countNum = 10000
    for i in range(countNum):
        sum_opt += eval_game(env, policy_ref)
        sum_0 += eval_game(env, policy_0)
        sum_1 += eval_game(env, policy_1)
    print('policy_ref avg={}'.format(sum_opt / countNum))
    print('policy_0 avg={}'.format(sum_0 / countNum))
    print('policy_1 avg={}'.format(sum_1 / countNum))
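# `first_easy` and `test_easy` compare three fixed policies (`policy_ref`,
# `policy_0`, `policy_1`) that are not defined in this section. A hedged sketch
# of how they might look, assuming `eval_game` accepts a plain per-state action
# array (0 = small die, 1 = large die) and mirroring the hand-tuned ensemble
# from `policy_iteration_demo2` above:
import numpy as np

NUM_STATES = 101  # assumption: board positions 0-100

policy_0 = np.zeros(NUM_STATES, dtype=int)  # always the small die
policy_1 = np.ones(NUM_STATES, dtype=int)   # always the large die

# Reference policy: use the large die until the goal is close, then switch to
# the small die to avoid overshooting.
policy_ref = np.ones(NUM_STATES, dtype=int)
policy_ref[97:100] = 0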
def monte_carlo_demo():
    env = SnakeEnv(10, [3, 6])
    agent = ModelFreeAgent(env)
    mc = MonteCarlo()
    with timer('Timer Monte Carlo Iter'):
        mc.monte_carlo_opt(agent, env)
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)

    agent2 = TableAgent(env)
    pi_algo = PolicyIteration()
    with timer('Timer PolicyIter'):
        pi_algo.policy_iteration(agent2)
    print('return_pi={}'.format(eval_game(env, agent2)))
    print(agent2.pi)
from snake import SnakeEnv
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import time
import pickle
import json

with open("config.json", 'r') as conf:
    config = json.load(conf)

WIDTH, HEIGHT = config['width'], config['height']
snake = SnakeEnv(width=WIDTH, height=HEIGHT)
num_steps = 10**6
FPS = 60

# Configuration parameters for the whole setup
seed = 42
gamma = config['gamma']  # Discount factor for past rewards
epsilon = config['epsilon']  # Epsilon greedy parameter
epsilon_min = config['epsilon_min']  # Minimum epsilon greedy parameter
epsilon_max = config['epsilon_max']  # Maximum epsilon greedy parameter
epsilon_interval = (
    epsilon_max - epsilon_min
)  # Rate at which to reduce chance of random action being taken
batch_size = config['batch_size']  # Size of batch taken from replay buffer
max_steps_per_episode = config['max_steps_per_episode']
# Number of frames to take random action and observe output
epsilon_random_frames = config["epsilon_random_frames"]
# Number of frames for exploration
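# The DQN setup above reads its hyperparameters from `config.json`, which is
# not included in this section. A minimal sketch of a file carrying the keys
# referenced above; the values are placeholders, not the original settings:
import json

example_config = {
    "width": 10,
    "height": 10,
    "gamma": 0.99,
    "epsilon": 1.0,
    "epsilon_min": 0.1,
    "epsilon_max": 1.0,
    "batch_size": 32,
    "max_steps_per_episode": 10000,
    "epsilon_random_frames": 50000,
}

with open("config.json", "w") as conf:
    json.dump(example_config, conf, indent=4)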
# Stats settings
GET_STATS = 10
MODEL_SAVE = True

# Render
ISRENDER = True

# For stats
ep_rewards = [-200]
scores_history = [-100] * 20000

if not os.path.isdir('models-final-check'):
    os.makedirs('models-final-check')

env = SnakeEnv()
agent = DeepQAgent(env)

for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit="episode"):
    agent.tensorboard.step = episode
    # Initialise state and reward.
    episode_reward = 0
    step = 1
    current_state = env.reset()
    done = False
    while not done:
        # Exploitation vs. exploration ($\epsilon$-greedy strategy): choose either
        # a greedy action or a random action, then pre-process it for the next steps.
        if np.random.random() > epsilon:
            action = np.argmax(agent.get_qs(current_state))
        else:
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)
def __init__(self, args):
    self.args = args
    self.env = SnakeEnv(args.snake_head_x, args.snake_head_y, args.food_x, args.food_y)
    self.agent = Agent(self.env.get_actions(), args.Ne, args.C, args.gamma)
class Application:
    def __init__(self, args):
        self.args = args
        self.env = SnakeEnv(args.snake_head_x, args.snake_head_y, args.food_x, args.food_y)
        self.agent = Agent(self.env.get_actions(), args.Ne, args.C, args.gamma)

    def execute(self):
        if not self.args.human:
            if self.args.train_eps != 0:
                self.train()
            return self.test()
        self.show_games()

    def train(self):
        print("Train Phase:")
        self.agent.train()
        window = self.args.window
        self.points_results = []
        first_eat = True
        start = time.time()

        for game in range(1, self.args.train_eps + 1):
            state = self.env.get_state()
            dead = False
            action = self.agent.act(state, 0, dead)
            while not dead:
                state, points, dead = self.env.step(action)
                # For debugging convenience, you can check whether your Q-table matches
                # ours for the given parameter settings (see the Debug Convenience part
                # on the homework 4 web page).
                if first_eat and points == 1:
                    self.agent.save_model(utils.CHECKPOINT)
                    first_eat = False
                action = self.agent.act(state, points, dead)

            points = self.env.get_points()
            self.points_results.append(points)
            if game % self.args.window == 0:
                print(
                    "Games:",
                    len(self.points_results) - window,
                    "-",
                    len(self.points_results),
                    "Points (Average:",
                    sum(self.points_results[-window:]) / window,
                    "Max:",
                    max(self.points_results[-window:]),
                    "Min:",
                    min(self.points_results[-window:]),
                    ")",
                )
            self.env.reset()

        print("Training takes", time.time() - start, "seconds")
        self.agent.save_model(self.args.model_name)

    def test(self):
        print("Test Phase:")
        self.agent.eval()
        self.agent.load_model(self.args.model_name)
        points_results = []
        start = time.time()

        for game in range(1, self.args.test_eps + 1):
            state = self.env.get_state()
            dead = False
            action = self.agent.act(state, 0, dead)
            while not dead:
                state, points, dead = self.env.step(action)
                action = self.agent.act(state, points, dead)
            points = self.env.get_points()
            points_results.append(points)
            self.env.reset()

        print("Testing takes", time.time() - start, "seconds")
        print("Number of Games:", len(points_results))
        print("Average Points:", sum(points_results) / len(points_results))
        print("Max Points:", max(points_results))
        print("Min Points:", min(points_results))
        return sum(points_results) / len(points_results)

    def show_games(self):
        print("Display Games")
        self.env.display()
        pygame.event.pump()
        self.agent.eval()
        points_results = []
        end = False

        for game in range(1, self.args.show_eps + 1):
            state = self.env.get_state()
            dead = False
            action = self.agent.act(state, 0, dead)
            count = 0
            while not dead:
                count += 1
                pygame.event.pump()
                keys = pygame.key.get_pressed()
                if keys[K_ESCAPE] or self.check_quit():
                    end = True
                    break
                state, points, dead = self.env.step(action)
                # Q-learning agent
                if not self.args.human:
                    action = self.agent.act(state, points, dead)
                # Human player
                else:
                    print((state[0] + 1) // 40, (state[1] + 1) // 40)
                    for event in pygame.event.get():
                        if event.type == pygame.KEYDOWN:
                            if event.key == pygame.K_UP:
                                action = 0
                            elif event.key == pygame.K_DOWN:
                                action = 1
                            elif event.key == pygame.K_LEFT:
                                action = 2
                            elif event.key == pygame.K_RIGHT:
                                action = 3
            if end:
                break
            self.env.reset()
            points_results.append(points)
            print("Game:", str(game) + "/" + str(self.args.show_eps), "Points:", points)

        if len(points_results) == 0:
            return
        print("Average Points:", sum(points_results) / len(points_results))

    def check_quit(self):
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                return True
        return False
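# A hypothetical entry point showing how `Application` might be wired up with
# argparse. The flag names simply mirror the attributes the class reads
# (snake_head_x, Ne, train_eps, window, ...) and are an assumption, not the
# original CLI; the defaults are placeholders.
import argparse


def run_application():
    parser = argparse.ArgumentParser()
    # Environment layout (attribute names taken from Application.__init__).
    parser.add_argument("--snake_head_x", type=int, default=200)
    parser.add_argument("--snake_head_y", type=int, default=200)
    parser.add_argument("--food_x", type=int, default=80)
    parser.add_argument("--food_y", type=int, default=80)
    # Agent hyperparameters.
    parser.add_argument("--Ne", type=int, default=40)
    parser.add_argument("--C", type=float, default=40)
    parser.add_argument("--gamma", type=float, default=0.7)
    # Run control.
    parser.add_argument("--train_eps", type=int, default=10000)
    parser.add_argument("--test_eps", type=int, default=1000)
    parser.add_argument("--show_eps", type=int, default=10)
    parser.add_argument("--window", type=int, default=100)
    parser.add_argument("--model_name", type=str, default="q_agent.npy")
    parser.add_argument("--human", action="store_true")
    app = Application(parser.parse_args())
    app.execute()


if __name__ == "__main__":
    run_application()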
def __init__(self, args):
    x_train = np.load("q_agent.npy")
    print(x_train)
    self.args = args
    self.env = SnakeEnv(args.snake_head_x, args.snake_head_y, args.food_x, args.food_y)
    self.agent = Agent(self.env.get_actions(), args.Ne, args.C, args.gamma)
                    type=int, nargs='?', default=8, help='Zoom per dimension')
parser.add_argument('--fps', type=int, nargs='?', default=10, help='Frames per second')

tensorize = lambda t: torch.FloatTensor(t.transpose((2, 0, 1)).copy()).unsqueeze(0)

if __name__ == "__main__":
    args = parser.parse_args()
    env = SnakeEnv(args.dim, zoom=args.zoom)
    pyglet.clock.set_fps_limit(args.fps)

    global a, policy
    if args.filename is None:
        a = np.random.randint(4)

        from pyglet.window import key

        def key_press(k, mod):
            global restart
            global a
            if k == key.R:
                restart = True
            if k == key.UP:
                a = 0
            if k == key.DOWN:
                a = 1
            if k == key.LEFT:
                a = 2
            if k == key.RIGHT:
                a = 3