def train(game):
    """Train a DQN agent on a ViZDoom-style game over TRAIN_GAMES episodes.

    Each episode is played to completion while transitions are buffered
    locally; only episodes with a non-negative total reward are committed
    to the agent's replay memory and used for a training step.
    """
    agent = DQN(game)
    for _ in tqdm(range(TRAIN_GAMES)):
        game.new_episode()
        prev_vars = None
        prev_frame = None
        done = False
        transitions = []
        episode_reward = 0
        while not done:
            state = game.get_state()
            frame = state.screen_buffer
            game_vars = state.game_variables
            # First step of the episode: there is no "previous" observation
            # yet, so seed it with the current one.
            if prev_vars is None:
                prev_vars = game_vars
            if prev_frame is None:
                prev_frame = frame
            action = agent.act(frame)
            reward = game.make_action(action)
            done = game.is_episode_finished()
            # Combine the environment reward with a shaped bonus derived from
            # the change in game variables, then scale down by 100.
            reward = (reward + calculate_additional_reward(prev_vars, game_vars)) / 100
            episode_reward += reward
            transitions.append([prev_frame, frame, reward, action, done])
            prev_vars = game_vars
            prev_frame = frame
        # Only "good" episodes (non-negative return) are learned from.
        # NOTE(review): reconstructed from collapsed source — agent.train()
        # is assumed to run once per accepted episode, after buffering.
        if episode_reward >= 0:
            for prev_state, state, reward, action, done in transitions:
                agent.remember(prev_state, state, reward, action, done)
            agent.train()
def run(ep, train=False):
    """Run (and optionally train) a DQN agent on the Pong environment.

    Args:
        ep: Number of episodes to play.
        train: When True, perform experience replay each step and save the
            weights afterwards; when False, load pre-trained weights and
            only evaluate.

    Returns:
        List of per-episode scores (one entry per episode played).
    """
    pygame.init()
    loss = []
    agent = DQN(3, 5)
    env = pongGame()
    weights_filepath = 'PongGame.h5'
    # Idiom fix: compare booleans with `not train` / `if train`, not ==False/==True.
    if not train:
        agent.model.load_weights(weights_filepath)
        print("weights loaded")
    for e in range(ep):
        # Keep the pygame window responsive; allow a clean quit.
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                quit()
        state = env.reset()
        state = np.reshape(state, (1, 5))
        score = 0
        max_steps = 1000
        for i in range(max_steps):
            action = agent.act(state)
            reward, next_state, done = env.step(action)
            score += reward
            next_state = np.reshape(next_state, (1, 5))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if train:
                agent.replay()
            if done:
                print("episode: {}/{}, score: {}".format(e, ep, score))
                break
        loss.append(score)
    if train:
        # Consistency fix: reuse weights_filepath instead of repeating the
        # hard-coded "PongGame.h5" literal (same value, single source of truth).
        agent.model.save_weights(weights_filepath)
    return loss
def train_dqn(episodes, env, render_frequency=0):
    """Train a DQN agent on a Snake environment.

    Args:
        episodes: Number of training episodes.
        env: The environment (rendering only happens for HeadlessSnake).
        render_frequency: Render every N-th episode; 0 disables rendering.

    Returns:
        List of total rewards, one per episode.
    """
    now = datetime.datetime.now()
    # Fix: renamed `id` -> `run_id`, which shadowed the builtin `id()`.
    # NOTE(review): hour+minute without zero-padding can collide (e.g. 1:23
    # vs 12:3 both give '123') — confirm whether run ids must be unique.
    run_id = f'{now.hour}{now.minute}'
    episode_rewards = []
    agent = DQN(env, params)
    # NOTE(review): best_score is never reassigned here; presumably
    # save_model tracks the best internally — verify.
    best_score = 0
    for episode in range(episodes):
        rendering = render_frequency and episode % render_frequency == 0 and isinstance(
            env, HeadlessSnake)
        state = env.reset()  # Reset environment before each episode to start fresh
        if rendering:
            renderer = Renderer(env, episode + 1)
        env.update_episode(episode + 1)
        total_reward = 0
        max_steps = 10000
        for step in range(max_steps):
            # 1. Find next action using the Epsilon-Greedy exploration strategy
            action = agent.get_action(state)
            # 2. Perform action in environment
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            if rendering:
                renderer.update()
            # 3. Update the Q-function (train model)
            agent.remember(state, action, reward, next_state, done)
            agent.train_with_experience_replay()
            # 4. Change exploration vs. exploitation probability
            agent.update_exploration_strategy(episode)
            state = next_state
            if done:
                print(
                    f'episode: {episode+1}/{episodes}, score: {total_reward}, steps: {step}, '
                    f'epsilon: {agent.epsilon}, highscore: {env.maximum}')
                save_model(run_id, agent, best_score, total_reward)
                break
        if rendering:
            renderer.bye()
        save_model(run_id, agent, best_score, total_reward)
        episode_rewards.append(total_reward)
    return episode_rewards
# Inner step loop for one episode.
# NOTE(review): this fragment depends on an enclosing episode loop that is
# not visible here — `i`, `agent`, `env`, `pre_ob`, `ob_stack`, `score`,
# and `scores` must already be in scope.
while True:
    action = agent.act(pre_ob, step=i)
    ob, reward, done, _ = env.step(action)
    # Clamp strongly negative rewards to -1.
    if reward <= -1:
        reward = -1
    next_pre_ob = preprocess(ob)
    # Stack observations: add the newest frame and drop the oldest so the
    # stack depth along axis 3 stays constant.
    # NOTE(review): np.insert with index -1 places the new frame BEFORE the
    # last slot, not at the end — confirm this ordering is intended.
    next_pre_ob = next_pre_ob.reshape(1, 100, 100)
    ob_stack = np.insert(ob_stack, -1, next_pre_ob, axis=3)
    ob_stack = np.delete(ob_stack, 0, axis=3)
    next_pre_ob = ob_stack
    agent.remember(pre_ob, action, reward, next_pre_ob, done)
    agent.replay()
    pre_ob = next_pre_ob
    score = score + reward
    if done:
        break
scores.append(score)
print("Episode {} score: {}".format(i + 1, score))
# Fix: the report message says "mean of last 100 episodes", but the original
# averaged ALL scores; restrict to the last 100 to match the message.
mean_score = np.mean(scores[-100:])
if (i + 1) % 5 == 0:
    print(
        "Episode {}, score: {}, exploration at {}%, mean of last 100 episodes was {}"
        .format(i + 1, score, agent.epsilon * 100, mean_score))