def memorize(self, state, action, reward, next_state, done):
    # Keep one bounded replay buffer per reward value.
    mem = self.memory.get(reward)
    if mem is None:
        mem = deque(maxlen=10 * self.batch_size * self.model_instances)
    mem.append((self.preprocess_state(state), encode_action(action), reward,
                self.preprocess_state(next_state), done))
    self.memory[reward] = mem
    self.visits[state, encode_action(action)] += 1
def play_random(board, save_normalized_matrix=True):
    steps = []
    render_board(board)
    while True:
        # Exit if needed by pressing the red cross
        for event in pygame.event.get():
            if event.type == QUIT:
                pygame.quit()
                sys.exit()

        # Play randomly: select a random action
        r = np.random.RandomState()
        action = r.choice(list(range(GameEnv.NB_ACTIONS)))
        moved = board.move(action)
        matrix = board.normalized_matrix if save_normalized_matrix else board.matrix
        if moved:
            print()
            print(board.matrix)
            print("SCORE:", board.score, "\tSTEP:", board.n_steps_valid,
                  "\tHIGHEST VALUE:", board.highest_value)
            steps.append(Step(matrix=matrix, action=action,
                              action_encoded=encode_action(action)))
            render_board(board)
            if board.is_gameover():
                print("GAME OVER!")
                return Game(steps=steps, score=board.score,
                            random_seed=board.random_seed, is_gameover=True)
        clock.tick(5)
        pygame.display.flip()
def train(self, state, action, reward, next_state):
    # Bootstrapped target: immediate reward plus the discounted best mean Q-value
    # of the next state.
    future_reward = reward + self.gamma * np.max(self.Qmean[next_state, :])
    encoded_action = encode_action(action)
    # Update mean, sum of squared rewards and variance, in this exact order.
    self.update_mean(state, encoded_action, future_reward)
    self.update_sum_squared_rewards(state, encoded_action, future_reward)
    self.update_variance(state, encoded_action)
    self.visits[state, encoded_action] += 1
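# The three update_* helpers called by train() are not part of this listing.
# Below is a minimal sketch, assuming Welford-style running statistics are kept
# per (state, action) pair; `Qmean` and `visits` come from the snippet above,
# while `sum_squared` and `Qvar` are assumed attribute names and the method
# bodies are illustrative, not the author's implementation.
def update_mean(self, state, action, value):
    # n counts the sample being incorporated; visits is incremented afterwards in train().
    n = self.visits[state, action] + 1
    self.Qmean[state, action] += (value - self.Qmean[state, action]) / n

def update_sum_squared_rewards(self, state, action, value):
    # Running sum of squared targets, used by the variance estimate below.
    self.sum_squared[state, action] += value ** 2

def update_variance(self, state, action):
    # Var[X] = E[X^2] - E[X]^2, using the biased (population) estimator.
    n = self.visits[state, action] + 1
    mean_sq = self.sum_squared[state, action] / n
    self.Qvar[state, action] = mean_sq - self.Qmean[state, action] ** 2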
def generate_output_states(self, input_state):
    next_states = []
    # Generate the predicted next state for every possible action using the autoencoder.
    for i in range(self.action_dim):
        ohe_action = encode_action(self.action_dim, i)
        ohe_action = np.expand_dims(ohe_action, axis=0)
        predicted_next = self.predict(input_state, ohe_action)
        predicted_next = (predicted_next[0, :, :, :] * 255.).astype(np.uint8)
        next_states.append(preprocess_frame_bw_next_state(predicted_next))
    return np.stack(next_states, axis=2)
def generate_agent_episodes(args):
    full_path = ROLLOUT_DIR + '/rollout_' + args.env_name
    if not os.path.exists(full_path):
        os.umask(0o000)
        os.makedirs(full_path)

    env_name = args.env_name
    total_episodes = args.total_episodes
    time_steps = args.time_steps

    envs_to_generate = [env_name]
    for current_env_name in envs_to_generate:
        print("Generating data for env {}".format(current_env_name))
        env = gym.make(current_env_name)  # Create the environment
        env.seed(0)

        # First load the DQN agent and the predictive autoencoder with their weights
        agent = Agent(gamma=0.99, epsilon=0.0, alpha=0.0001,
                      input_dims=(104, 80, 4), n_actions=env.action_space.n,
                      mem_size=25000, eps_min=0.0, batch_size=32, replace=1000,
                      eps_dec=1e-5, env_name=current_env_name)
        agent.load_models()
        predictor = load_predictive_model(current_env_name, env.action_space.n)

        s = 0
        while s < total_episodes:
            rollout_file = os.path.join(full_path, 'rollout-%d.npz' % s)
            observation = env.reset()
            frame_queue = deque(maxlen=4)
            dqn_queue = deque(maxlen=4)
            t = 0
            next_state_sequence = []
            correct_state_sequence = []
            total_reward = 0

            while t < time_steps:
                # Preprocess frames for the predictive model and the DQN
                converted_obs = preprocess_frame(observation)
                converted_obs_dqn = preprocess_frame_dqn(observation)
                if t == 0:
                    for i in range(4):
                        frame_queue.append(converted_obs)
                        dqn_queue.append(converted_obs_dqn)
                else:
                    frame_queue.pop()
                    dqn_queue.pop()
                    frame_queue.appendleft(converted_obs)
                    dqn_queue.appendleft(converted_obs_dqn)
                observation_states = np.concatenate(frame_queue, axis=2)
                dqn_states = np.concatenate(dqn_queue, axis=2)

                # Predict the next state for every action and record which one the agent picks
                next_states = predictor.generate_output_states(
                    np.expand_dims(observation_states, axis=0))
                next_state_sequence.append(next_states)
                action = agent.choose_action(dqn_states)
                correct_state_sequence.append(
                    encode_action(env.action_space.n, action))

                observation, reward, done, info = env.step(action)  # Take the agent's action
                total_reward += reward
                t = t + 1

            print("Episode {} finished after {} timesteps with reward {}".format(
                s, t, total_reward))
            np.savez_compressed(rollout_file, next=next_state_sequence,
                                correct=correct_state_sequence)
            s = s + 1
        env.close()
def main(args):
    env_name = args.env_name
    total_episodes = args.total_episodes
    time_steps = args.time_steps
    informed = args.informed
    # action_refresh_rate = args.action_refresh_rate

    if informed:
        full_path = ROLLOUT_DIR + '/informed_rollout_' + args.env_name
    else:
        full_path = ROLLOUT_DIR + '/random_rollout_' + args.env_name
    if not os.path.exists(full_path):
        os.umask(0o000)
        os.makedirs(full_path)

    envs_to_generate = [env_name]
    for current_env_name in envs_to_generate:
        print("Generating data for env {}".format(current_env_name))
        env = gym.make(current_env_name)  # Create the environment
        env.seed(0)

        s = 0
        if informed:
            agent = load_dqn(env)
        while s < total_episodes:
            rollout_file = os.path.join(full_path, 'rollout-%d.npz' % s)
            observation = env.reset()
            frame_queue = deque(maxlen=4)
            dqn_queue = deque(maxlen=4)
            t = 0
            obs_sequence = []
            action_sequence = []
            next_sequence = []

            while t < time_steps:
                # Convert the image to greyscale and downsize it
                converted_obs = preprocess_frame(observation)
                if t == 0:
                    for i in range(4):
                        frame_queue.append(converted_obs)
                else:
                    frame_queue.pop()
                    frame_queue.appendleft(converted_obs)
                stacked_state = np.concatenate(frame_queue, axis=2)
                obs_sequence.append(stacked_state)

                if informed:
                    # Let the pretrained DQN choose the action
                    dqn_obs = preprocess_frame_dqn(observation)
                    if t == 0:
                        for i in range(4):
                            dqn_queue.append(dqn_obs)
                    else:
                        dqn_queue.pop()
                        dqn_queue.appendleft(dqn_obs)
                    stacked = np.concatenate(dqn_queue, axis=2)
                    action = agent.choose_action(stacked)
                else:
                    action = env.action_space.sample()  # Take a random action
                action_sequence.append(
                    encode_action(env.action_space.n, action))

                observation, _, _, _ = env.step(action)
                t = t + 1
                next_sequence.append(preprocess_frame(observation))

            print("Episode {} finished after {} timesteps".format(s, t))
            np.savez_compressed(rollout_file, obs=obs_sequence,
                                actions=action_sequence,
                                next_frame=next_sequence)
            s = s + 1
        env.close()
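# The two-argument encode_action(env.action_space.n, action) used by the
# rollout generators above is not defined in this listing. The sketch below is
# an assumption that it one-hot encodes an integer action for the predictive
# model and the saved rollouts; decode_action is likewise an assumed inverse.
# (The single-argument encode_action(action) used in the board-game and
# Q-network snippets appears to return an index instead and may be a different
# helper.)
import numpy as np

def encode_action(n_actions, action):
    # One-hot encode an integer action.
    one_hot = np.zeros(n_actions, dtype=np.float32)
    one_hot[action] = 1.0
    return one_hot

def decode_action(one_hot):
    # Recover the integer action index from a one-hot (or score) vector.
    return int(np.argmax(one_hot))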
def play(board, save_normalized_matrix=True):
    """
    Parameters
    ----------
    board : numpy.array
    save_normalized_matrix : bool
        Whether to save the normalized (log2 transformed) or the original matrix.

    Returns
    -------
    collections.namedtuple
        Game with recorded steps.
    """
    steps = []
    render_board(board)
    while True:
        for event in pygame.event.get():
            if event.type == QUIT:
                pygame.quit()
                sys.exit()
            if event.type == pygame.KEYDOWN:
                if event.key in POSSIBLE_ACTIONS:
                    # Record the matrix before applying the chosen action
                    matrix = board.normalized_matrix if save_normalized_matrix else board.matrix
                    action = POSSIBLE_ACTIONS[event.key]
                    moved = board.move(action)  # boolean
                    if moved:
                        print()
                        print(board.matrix)
                        print("SCORE:", board.score, "\tSTEP:", board.n_steps_valid,
                              "\tHIGHEST VALUE:", board.highest_value)
                        steps.append(Step(matrix=matrix, action=action,
                                          action_encoded=encode_action(action)))
                        render_board(board)
                        if board.is_gameover():
                            print("GAME OVER!")
                            return Game(steps=steps, score=board.score,
                                        random_seed=board.random_seed,
                                        is_gameover=True)
                    else:
                        print("\nCannot move in this direction!")
                elif event.key == pygame.K_q:
                    screen.fill(BLACK)
                    return Game(steps=steps, random_seed=board.random_seed,
                                is_gameover=False)
                elif event.key == pygame.K_p:
                    screen.fill(BLACK)
                    return "quit"
        clock.tick(60)
        pygame.display.flip()
    # Fragment of a Q-network training step; the sess.run call producing
    # `actions` and `allQ` presumably opens just above this excerpt.
            feed_dict={X: get_state(state, env.observation_space.n)})

    # decode actions
    actions = [decode_action(a) for a in actions]
    # epsilon-greedy action
    if np.random.rand(1) < epsilon:
        actions[0] = env.action_space.sample()
    # get the new state
    next_state, reward, done, _ = env.step(actions[0])
    # obtain the next Q values by feeding the next state through the network
    predNextQ = sess.run(
        Qout, feed_dict={X: get_state(next_state, env.observation_space.n)})
    targetQ = allQ
    targetQ[0, encode_action(actions[0])] = reward + gamma * np.max(predNextQ)
    # train the network using the target and predicted Q values
    sess.run(updateModel,
             feed_dict={
                 X: get_state(state, env.observation_space.n),
                 nextQ: targetQ
             })
    rewards.append(reward)
    state = next_state
    if done and epsilon > 0.01 and np.mean(rewards) > 0 and episode >= 1000:
        # reduce the chance of random action as we train the model
        epsilon = 1. / (episode / 50 + 10)
# record the accumulated reward
performance.append(np.sum(rewards))