def train():
    memory = []
    Transition = collections.namedtuple(
        "Transition", ["state", "action", "reward", "next_state"])
    model = DeepQNetwork(flags.n_actions, flags.n_features, flags.lr,
                         flags.gamma, flags.epsilon_max, empty_goal_action)
    loss_his = []
    reward_his = []
    step_his = []
    for ii in range(flags.max_epoch):
        state = env.reset()
        reward_all = 0
        done = False
        steps = 0
        loss = 0
        while not done:
            action = model.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_all += reward
            steps += 1
            # drop the oldest transition once the replay buffer is full
            if len(memory) > flags.memory_size:
                memory.pop(0)
            memory.append(Transition(state, action, reward, next_state))
            # start learning once the buffer holds a couple of batches
            if len(memory) > flags.batch_size * 2:
                batch_transition = random.sample(memory, flags.batch_size)
                batch_state, batch_action, batch_reward, batch_next_state = map(
                    np.array, zip(*batch_transition))
                loss = model.train(state=batch_state, action=batch_action,
                                   reward=batch_reward, state_=batch_next_state)
                if (ii + 1) % flags.replace_target_freq == 0:
                    model.replace_target()
                model.decay_epsilon()
            state = next_state
        if loss > 0:
            loss_his.append(loss)
        reward_his.append(reward_all)
        step_his.append(steps)
        print("epoch=", ii, "/loss=", loss, "/reward_all=", reward_all,
              "/steps=", steps)
    return loss_his, reward_his, step_his
def testing_trained_agent():
    """Testing Trained Agent"""
    global frame_size, stack_size
    with tf.Session() as sess:
        game = Doom()
        state_size = list(frame_size)
        state_size.append(stack_size)
        no_actions = len(game.actions)
        learning_rate = 0.0002
        deep_Q_network = DeepQNetwork(state_size, no_actions, learning_rate)
        totalScore = 0
        saver = tf.train.Saver()
        saver.restore(sess, "./models/model.ckpt")
        game.start_game()
        for i in range(1):
            done = False
            game.restart_episode()
            img, game_vars = game.get_environment_state()
            state = frame_stacking(img, True)
            while not game.is_episode_finished():
                Qs = sess.run(deep_Q_network.output,
                              feed_dict={deep_Q_network.inputs: state.reshape((1, *state.shape))})
                choice = np.argmax(Qs)
                action = game.actions[int(choice)]
                game.take_action(action)
                done = game.is_episode_finished()
                score = game.game_environment.get_total_reward()
                if done:
                    break
                else:
                    print("else ")
                    next_img, next_game_vars = game.get_environment_state()
                    next_state = frame_stacking(next_img, False)
                    state = next_state
            score = game.game_environment.get_total_reward()
            print("Score: ", score)
        game.close_environment()
class agent():
    def __init__(self, gamma, epsilon, alpha, maxMemSize, epsEnd, replace=25000,
                 actionSpace=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]):
        self.Q_values = np.zeros([9, 6, 11])  # state size 9
        self.GAMMA = gamma
        self.ALPHA = alpha
        self.EPSILON = epsilon
        self.EPS_END = epsEnd
        self.actionSpace = actionSpace
        self.memSize = maxMemSize
        self.steps = 0
        self.learn_step_counter = 0  # target network replacement
        self.memory = []
        self.memCntr = 0
        self.replace_target_cnt = replace
        self.Q_eval = DeepQNetwork(alpha=alpha)
        self.Q_next = DeepQNetwork(alpha=alpha)

    # https://github.com/dennybritz/reinforcement-learning/blob/master/DP/Policy%20Iteration%20Solution.ipynb
    def storeTransition(self, state, action, reward, nextState):
        if self.memCntr < self.memSize:
            self.memory.append([state, action, reward, nextState])
        else:
            self.memory[self.memCntr % self.memSize] = [state, action, reward, nextState]
        self.memCntr += 1

    def load_memory(self, batch_size):
        # sample a contiguous slice of transitions starting at a random offset
        if self.memCntr + batch_size < self.memSize:
            memStart = int(np.random.choice(range(self.memCntr)))
        else:
            memStart = int(np.random.choice(range(self.memCntr - batch_size)))
        minibatch = self.memory[memStart:memStart + batch_size]
        resCurrent = np.zeros([batch_size, 54])
        resNext = np.zeros([batch_size, 54])
        rewards = np.zeros([batch_size])
        i = 0
        for state, action, reward, nextState in minibatch:
            resCurrent[i, :] = state
            resNext[i, :] = nextState
            rewards[i] = reward
            i += 1
        return resCurrent, resNext, rewards

    def chooseAction(self, observation):
        rand = np.random.random()
        actions = self.Q_eval.forward(observation)
        if rand < 1 - self.EPSILON:
            action = T.argmax(actions).item()  # greedy action
        else:
            action = np.random.choice(self.actionSpace)
        self.steps += 1
        return action

    def learn(self, batch_size):
        self.Q_eval.optimizer.zero_grad()
        # periodically copy the online network into the target network
        if self.replace_target_cnt is not None and \
                self.learn_step_counter % self.replace_target_cnt == 0:
            self.Q_next.load_state_dict(self.Q_eval.state_dict())
        states, next_states, rewards = self.load_memory(batch_size)
        Qpred = self.Q_eval.forward(states)
        Qnext = self.Q_next.forward(next_states)
        maxA = T.argmax(Qnext, dim=1).cuda()
        rewards = T.Tensor(rewards).cuda()
        # build targets from a detached copy of the predictions and update each
        # row at its greedy next action with r + gamma * max_a' Q_next(s', a')
        Qtarget = Qpred.clone().detach()
        batch_index = T.arange(batch_size, dtype=T.long).cuda()
        Qtarget[batch_index, maxA] = rewards + self.GAMMA * T.max(Qnext, dim=1)[0]
        if self.steps > 500:
            self.EPSILON = np.max([self.EPS_END, self.EPSILON - 1e-4])
        loss = self.Q_eval.loss(Qtarget, Qpred).cuda()
        loss.backward()
        self.Q_eval.optimizer.step()
        self.learn_step_counter += 1
def main():
    global frame_size, stack_size
    state_size = list(frame_size)
    state_size.append(stack_size)
    game = Doom()
    no_actions = len(game.actions)
    learning_rate = 0.002
    no_episodes = 500
    max_steps = 100
    batch_size = 32
    explore_max = 1.
    explore_min = 0.01
    decay_rate = 0.00001
    gamma = 0.95
    pretrain_length = batch_size
    memory_size = 1000000
    training = True
    episode_render = True

    tf.reset_default_graph()
    deep_Q_network = DeepQNetwork(state_size, no_actions, learning_rate)
    memory = Memory(max_size=memory_size)

    game.start_game()
    game.restart_episode()
    # pre-fill the replay memory with random transitions
    for i in range(pretrain_length):
        if i == 0:
            img, game_vars = game.get_environment_state()
            state = frame_stacking(img, True)
        action = random.choice(game.actions)
        reward = game.take_action(action)
        done = game.is_episode_finished()
        if done:
            next_state = np.zeros(state.shape)
            memory.add((state, action, reward, next_state, done))
            game.restart_episode()
            img, game_vars = game.get_environment_state()
            state = frame_stacking(img, True)
        else:
            next_img, next_game_vars = game.get_environment_state()
            next_state = frame_stacking(next_img, False)
            memory.add((state, action, reward, next_state, done))
            state = next_state

    writer = tf.summary.FileWriter("./tensorboard/dqn/1")
    tf.summary.scalar("Loss", deep_Q_network.loss)
    write_op = tf.summary.merge_all()

    """Prediction"""
    def predict_action(curr_decay_step, curr_state):
        # epsilon-greedy action selection with exponentially decaying epsilon
        exp_exp_tradeoff = np.random.rand()
        curr_explore_prob = explore_min + \
            ((explore_max - explore_min) * np.exp(-decay_rate * curr_decay_step))
        if curr_explore_prob > exp_exp_tradeoff:
            curr_action = random.choice(game.actions)
        else:
            Qs = sess.run(deep_Q_network.output,
                          feed_dict={deep_Q_network.inputs: curr_state.reshape((1, *curr_state.shape))})
            choice = np.argmax(Qs)
            curr_action = game.actions[choice]
        return curr_action, curr_explore_prob

    """Training Agent"""
    saver = tf.train.Saver()
    if training:
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            decay_step = 0
            game.start_game()
            for episode in range(no_episodes):
                step = 0
                loss_val = 0.0  # defined up front so the terminal-step print never fails
                episode_rewards = []
                game.restart_episode()
                img, game_vars = game.get_environment_state()
                state = frame_stacking(img, True)
                while step < max_steps:
                    step += 1
                    decay_step += 1
                    action, explore_prob = predict_action(decay_step, state)
                    reward = game.take_action(action)
                    done = game.is_episode_finished()
                    episode_rewards.append(reward)
                    if done:
                        next_img = np.zeros(frame_size, dtype=np.int)
                        next_state = frame_stacking(next_img, False)
                        step = max_steps
                        total_rewards = np.sum(episode_rewards)
                        print("Episode No. {}".format(episode),
                              "Total reward: {}".format(total_rewards),
                              "Training Loss: {:.4f}".format(loss_val),
                              "Explore Prob: {:.4f}".format(explore_prob))
                        memory.add((state, action, reward, next_state, done))
                    else:
                        next_img, next_game_vars = game.get_environment_state()
                        next_state = frame_stacking(next_img, False)
                        memory.add((state, action, reward, next_state, done))
                        state = next_state

                    """Learning Part"""
                    # Get mini-batches from memory and train
                    batch = memory.sample(batch_size)
                    states_mb = []
                    actions_mb = []
                    rewards_mb = []
                    next_states_mb = []
                    dones_mb = []
                    for each in batch:
                        states_mb.append(each[0])
                        actions_mb.append(each[1])
                        rewards_mb.append(each[2])
                        next_states_mb.append(each[3])
                        dones_mb.append(each[4])
                    states_mb = np.array(states_mb)
                    actions_mb = np.array(actions_mb)
                    rewards_mb = np.array(rewards_mb)
                    next_states_mb = np.array(next_states_mb)
                    dones_mb = np.array(dones_mb)

                    target_Qs_batch = []
                    Qs_next_state = sess.run(deep_Q_network.output,
                                             feed_dict={deep_Q_network.inputs: next_states_mb})
                    for i in range(0, len(batch)):
                        terminal = dones_mb[i]
                        if terminal:
                            # terminal transitions use the reward alone as the target
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            target = rewards_mb[i] + (gamma * np.max(Qs_next_state[i]))
                            target_Qs_batch.append(target)
                    targets_mb = np.array(target_Qs_batch)

                    loss_val, _ = sess.run([deep_Q_network.loss, deep_Q_network.optimizer],
                                           feed_dict={deep_Q_network.inputs: states_mb,
                                                      deep_Q_network.target_Q: targets_mb,
                                                      deep_Q_network.actions: actions_mb})
                    summary = sess.run(write_op,
                                       feed_dict={deep_Q_network.inputs: states_mb,
                                                  deep_Q_network.target_Q: targets_mb,
                                                  deep_Q_network.actions: actions_mb})
                    writer.add_summary(summary, episode)
                    writer.flush()

                if episode % 5 == 0:
                    save_path = saver.save(sess, "./models/model.ckpt")
                    print("Model Saved")
        step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    # RL = DeepQNetwork(env.n_actions, env.n_features,
    #                   learning_rate=0.01,
    #                   reward_decay=0.9,
    #                   e_greedy=0.9,
    #                   replace_target_iter=200,
    #                   memory_size=2000
    #                   )
    # param tuning by hand, best version for now
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.005,
                      reward_decay=0.8,
                      e_greedy=0.8,
                      replace_target_iter=200,
                      memory_size=2000)
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
    exit()
            RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      lr=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
            print('episode: {}, Reward: {}'.format(episode, Reward))
            break


def _eval():
    for episode in range(10):
        obs = env.reset()
        Reward = 0
        while True:
            env.render()
            action = RL.choose_action(obs, True)
            obs, reward, done, _ = env.step(action)
            Reward += reward
            if done:
                print('Reward: {}'.format(Reward))
                break


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    RL = DeepQNetwork(env.observation_space.shape[0], env.action_space.n)
    train()
    _eval()
def main():
    env = gym.make('SpaceInvaders-v0')
    memory = deque(maxlen=MEM_SIZE)

    # fill memory with random interactions with the environment
    while len(memory) < MEM_SIZE:
        observation = env.reset()
        frames = deque([np.zeros((185, 95)) for _ in range(STACK_SIZE)], maxlen=STACK_SIZE)
        frames.append(preprocess(observation))
        state = stack_frames(frames)
        done = False
        while not done:
            # 0 no action, 1 fire, 2 move right, 3 move left, 4 move right fire, 5 move left fire
            action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            frames.append(preprocess(observation_))
            state_ = stack_frames(frames)
            memory = store_transition(memory, state, action, reward, state_)
            state = state_
    print('done initializing memory')

    init_Q, pred_Q = DeepQNetwork()  # two separate Q-Table approximations (eval and next)

    # initialize parameters, not committing to a batch size (NHWC)
    # we choose 3 channels as we want to pass stacks of 4 consecutive frames
    in_shape = (-1, 185, 95, STACK_SIZE)
    if LOAD:
        path = os.path.join(WEIGHTS_PATH, "params_Q_eval.npy")
        params_Q_eval = load_params(path)
    else:
        _, params_Q_eval = init_Q(in_shape)
    params_Q_next = params_Q_eval.copy()

    # Initialize RMSProp optimizer
    opt_init, opt_update = optimizers.rmsprop(ALPHA)
    opt_state = opt_init(params_Q_eval)
    opt_step = 0

    # Define a simple mean-squared-error loss
    def loss(params, batch):
        inputs, targets = batch
        predictions = pred_Q(params, inputs)
        return np.mean((predictions - targets) ** 2)

    # Define a compiled update step
    @jit
    def step(j, opt_state, batch):
        params = optimizers.get_params(opt_state)
        g = grad(loss)(params, batch)
        return opt_update(j, g, opt_state)

    def learn(opt_step, opt_state, params_Q_eval, params_Q_next):
        mini_batch = sample(memory, BATCH_SIZE)
        # periodically sync the target parameters with the online parameters
        if opt_step % TAU == 0:
            params_Q_next = params_Q_eval.copy()
        input_states = np.stack([transition[0] for transition in mini_batch])
        next_states = np.stack([transition[3] for transition in mini_batch])
        predicted_Q = pred_Q(params_Q_eval, input_states)
        predicted_Q_next = pred_Q(params_Q_next, next_states)
        max_action = np.argmax(predicted_Q_next, axis=1)
        rewards = np.array([transition[2] for transition in mini_batch])
        Q_target = onp.array(predicted_Q)
        # update each sample's greedy-action entry with its bootstrapped target
        Q_target[onp.arange(BATCH_SIZE), max_action] = rewards + GAMMA * np.max(predicted_Q_next, axis=1)
        opt_state = step(opt_step, opt_state, (input_states, Q_target))
        params_Q_eval = optimizers.get_params(opt_state)
        return opt_state, params_Q_eval, params_Q_next

    scores = []
    eps_history = []
    eps = EPS_START if LEARN else 0
    for i in range(NUM_GAMES):
        print('starting game ', i + 1, 'epsilon: %.4f' % eps)
        eps_history.append(eps)
        done = False
        observation = env.reset()
        frames = deque([np.zeros((185, 95)) for _ in range(STACK_SIZE)], maxlen=STACK_SIZE)
        frames.append(preprocess(observation))
        state = stack_frames(frames)
        score = 0
        while not done:
            action = choose_action(env, state.reshape((1, 185, 95, STACK_SIZE)),
                                   pred_Q, params_Q_eval, eps)
            observation_, reward, done, info = env.step(action)
            score += reward
            if RENDER:
                env.render()
            if LEARN:
                frames.append(preprocess(observation_))
                state_ = stack_frames(frames)
                memory = store_transition(memory, state, action, reward, state_)
                state = state_
                opt_state, params_Q_eval, params_Q_next = learn(opt_step, opt_state,
                                                                params_Q_eval, params_Q_next)
                opt_step += 1
                # anneal epsilon linearly after an initial warm-up period
                if opt_step > 500:
                    if eps - 1e-4 > EPS_END:
                        eps -= 1e-4
                    else:
                        eps = EPS_END
        if LEARN:
            out_path = os.path.join(WEIGHTS_PATH, 'params_Q_eval_' + str(i))
            onp.save(out_path, params_Q_eval)
        scores.append(score)
        print('score: ', score)