def _play_greedy_episode(env, dqn, frame_size):
    """Play one episode, acting greedily on ``dqn`` once 4 frames are queued.

    Args:
        env: wrapped gym environment providing ``reset``/``step``.
        dqn: network object exposing ``predict``.
        frame_size: width/height used when reshaping the frame stack.
    """
    done = False
    step_count = 0
    state_queue = deque(maxlen=4)
    state_queue.append(env.reset())

    while not done:
        step_count += 1

        if step_count < 4:
            # Not enough frames queued yet: act randomly while the
            # 4-frame stack fills up.
            action = env.action_space.sample()
        else:
            action = np.argmax(
                dqn.predict(
                    np.reshape(np.array(state_queue),
                               [1, frame_size, frame_size, 4])))

        # The reward is irrelevant when only replaying a trained policy.
        next_state, _, done, _ = env.step(action)
        state_queue.append(next_state)


def main():
    """Replay a trained DQN on Super Mario Bros level 1-1.

    Builds the wrapped environment, restores the "target" network variables
    from ``./checkpoints/mario_weight.ckpt`` and plays ``MAX_EPISODES``
    episodes. The TensorFlow session and the environment are closed when
    the function exits, including on an exception or Ctrl-C.
    """
    MAX_EPISODES = 10000
    n_size = 84
    checkpoint_dir = './checkpoints'
    save_file_name = 'mario_weight.ckpt'

    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    sess = tf.Session()
    try:
        targetDQN = DQN(sess, name="target")
        dqn_var_list = targetDQN.var_list
        sess.run(tf.global_variables_initializer())

        # Only the target network's variables are restored from disk.
        saver = tf.train.Saver(var_list=dqn_var_list)
        saver.restore(sess, os.path.join(checkpoint_dir, save_file_name))

        for _ in range(MAX_EPISODES):
            _play_greedy_episode(env, targetDQN, n_size)
    finally:
        # Release the session and the environment even if playback fails.
        sess.close()
        env.close()
def _enjoy_policy(env, select_action):
    """Render episodes forever, picking each action with ``select_action``.

    Args:
        env: wrapped gym environment; its ``observation_space.shape`` must
            unpack into three values.
        select_action: callable taking the stacked-frame ``history`` array
            and returning the action to pass to ``env.step``.

    The loop never returns; it ends only through an exception such as
    ``KeyboardInterrupt``.
    """
    nstack = 4
    nh, nw, nc = env.observation_space.shape
    # Allocated once and carried across episodes, as update_history()
    # receives the previous history on every call.
    history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

    while True:
        obs, done = env.reset(), False
        history = update_history(history, obs)
        episode_rew = 0
        while not done:
            env.render()
            action = select_action(history)
            obs, rew, done, _ = env.step(action)
            history = update_history(history, obs)
            episode_rew += rew
            print("action : %s reward : %s" % (action, rew))
        print("Episode reward", episode_rew)


def main():
    """Load a saved model and watch it play the environment ``FLAGS.env``.

    ``FLAGS.algorithm`` selects the loader ("deepq" or "acktr") and
    ``FLAGS.file`` names the model file under ``models/<algorithm>/``.

    Raises:
        ValueError: if ``FLAGS.algorithm`` is neither "deepq" nor "acktr".
    """
    FLAGS(sys.argv)

    # 1. Create gym environment
    env = gym.make(FLAGS.env)
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    try:
        if FLAGS.algorithm == "deepq":
            act = deepq.load("models/deepq/%s" % FLAGS.file)
            _enjoy_policy(env, lambda history: act(history)[0])
        elif FLAGS.algorithm == "acktr":
            policy_fn = CnnPolicy
            model = acktr_disc.load(policy_fn,
                                    env,
                                    seed=0,
                                    total_timesteps=1,
                                    nprocs=4,
                                    filename="models/acktr/%s" % FLAGS.file)
            _enjoy_policy(env, lambda history: model.step(history)[0][0])
        else:
            # Previously an unrecognised value fell through and the script
            # exited without doing anything.
            raise ValueError(
                "unsupported algorithm %r (expected 'deepq' or 'acktr')"
                % FLAGS.algorithm)
    finally:
        env.close()
def main():
    """Drive the environment named by ``FLAGS.env`` with ``RandomAgent``.

    Runs 100 episodes. The reward and done flag from the most recent step
    are handed to ``agent.act`` on the next call and are not reset between
    episodes.
    """
    FLAGS(sys.argv)
    print("env : %s" % FLAGS.env)

    # Environment -> action-space wrapper -> 84px observation wrapper.
    env = ProcessFrame84(MarioActionSpaceWrapper(gym.make(FLAGS.env)))
    agent = RandomAgent(env.action_space)

    last_reward, finished = 0, False
    for _ in range(100):
        observation = env.reset()
        while True:
            chosen = agent.act(observation, last_reward, finished)
            observation, last_reward, finished, _ = env.step(chosen)
            if finished:
                break
def main():
    """Play Super Mario Bros level 1-1 from the keyboard.

    Each key read via ``readchar.readkey()`` is looked up in ``arrow_keys``
    and the mapped action is sent to the environment. Up to 100 episodes
    are played. Pressing any key that is not in ``arrow_keys`` aborts the
    whole session. The environment is closed on exit.
    """
    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    episode_count = 100

    try:
        for _ in range(episode_count):
            env.reset()
            while True:
                key = readchar.readkey()

                # Choose an action from keyboard
                if key not in arrow_keys:
                    print("Game aborted!")
                    # Leave main() entirely: a plain `break` would only end
                    # this episode and immediately start the next one.
                    return

                _, reward, done, _ = env.step(arrow_keys[key])

                if done:
                    print("Finished with reward", reward)
                    break
    finally:
        env.close()
action = agent.get_action(history) # action if action == 0: real_action = [0, 0, 0, 1, 1, 1] # Right + A + B elif action == 1: real_action = [0, 0, 0, 0, 1, 0] # A elif action == 2: real_action = [0, 0, 0, 1, 1, 0] # Right + A elif action == 3: real_action = [0, 0, 0, 1, 0, 1] # Right + B elif action == 4: real_action = [0, 0, 0, 0, 1, 1] # A + B # 선택한 행동으로 환경에서 한 타임스텝 진행 next_observe, reward, done, info = env.step(real_action) # 각 타임스텝마다 상태 전처리 # next_state = pre_processing(next_observe) next_state = np.reshape([next_observe], (1, 84, 84, 1)) next_history = np.append(next_state, history[:, :, :, :3], axis=3) agent.avg_q_max += np.amax( agent.model.predict(np.float32(history))[0]) reward = np.clip(reward, -1., 1.) # reward를 -1 ~ 1 사이의 값으로 만듬 # 샘플 <s, a, r, s'>을 리플레이 메모리에 저장 후 학습 agent.append_sample(history, action, reward, next_history, dead) # print ("global_step : " ,global_step)
def _dqn_train_step(main_dqn, target_dqn, replay_buffer, batch_size, n_size,
                    discount):
    """Fit ``main_dqn`` on one sampled mini-batch and return the loss.

    Args:
        main_dqn: network being trained; exposes ``predict`` and ``update``.
        target_dqn: network used to evaluate the next states.
        replay_buffer: buffer whose ``sample(batch_size)`` returns a 5-tuple.
        batch_size: number of transitions to sample.
        n_size: width/height used when reshaping the stacked frames.
        discount: discount factor applied to the next-state value.

    Returns:
        Whatever ``main_dqn.update`` returns (printed as the loss by main).
    """
    # The fifth element is assumed to hold the done flags passed as the
    # fifth argument of replay_buffer.add() — TODO confirm against
    # ReplayBuffer.sample.
    states, actions, rewards, next_states, dones = replay_buffer.sample(
        batch_size)
    states = np.reshape(states, [batch_size, n_size, n_size, 4])
    next_states = np.reshape(next_states, [batch_size, n_size, n_size, 4])

    Q_t = target_dqn.predict(next_states)
    Q_m = main_dqn.predict(states)
    Q_t = np.max(Q_t, axis=1)

    # A terminal transition has no successor state, so its target is the
    # reward alone; mask out the bootstrapped value where done is set.
    not_done = 1.0 - np.asarray(dones, dtype=np.float32)
    estimates = rewards + discount * Q_t * not_done

    # Only the Q-value of the action actually taken is moved to the target.
    Q_m[np.arange(batch_size), actions] = estimates
    return main_dqn.update(states, Q_m)


def main():
    """Train a DQN agent on Super Mario Bros level 1-1.

    Runs ``MAX_EPISODES`` episodes with a decaying epsilon-greedy policy,
    stores transitions of 4 stacked frames in a replay buffer, trains the
    "main" network every ``TRAIN_EPISODE`` steps and copies it into the
    "target" network every ``TARGET_UPDATE_EPS`` steps. The target network
    variables are checkpointed to ``./checkpoints/mario_weight_2.ckpt``
    whenever the reward summed since the previous training step exceeds
    the sum recorded at that previous step. The session and environment are
    closed on exit.
    """
    MAX_BUFFER_SIZE = 100000
    MAX_EPISODES = 10000
    TRAIN_EPISODE = 100
    TARGET_UPDATE_EPS = 1000
    batch_size = 32
    n_size = 84
    discount = 0.99
    checkpoint_dir = './checkpoints'
    save_file_name = 'mario_weight_2.ckpt'

    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    replay_buffer = ReplayBuffer(MAX_BUFFER_SIZE)

    # Make sure the checkpoint directory exists before the first save.
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    sess = tf.Session()
    try:
        mainDQN = DQN(sess, name="main")
        targetDQN = DQN(sess, name="target")
        dqn_var_list = targetDQN.var_list

        sess.run(tf.global_variables_initializer())

        # Start with the target network equal to the main network.
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        # Only the target network's variables are checkpointed.
        saver = tf.train.Saver(var_list=dqn_var_list)

        for eps in range(MAX_EPISODES):
            # decaying epsilon greedy
            e = 1. / ((eps / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()
            state_queue = deque(maxlen=4)
            next_state_queue = deque(maxlen=4)

            state_queue.append(state)
            next_state_queue.append(state)

            # Reward sums for the previous / current training interval;
            # both restart at zero every episode.
            prev_100 = 0
            curr_100 = 0

            while not done:
                step_count += 1

                # cumulate 4 frames
                if step_count < 4:
                    action = env.action_space.sample()
                    next_state, reward, done, _ = env.step(action)
                    state_queue.append(next_state)
                    next_state_queue.append(next_state)
                    continue

                # training starts
                if np.random.rand() < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action by greedily from the Q-network
                    action = np.argmax(
                        mainDQN.predict(
                            np.reshape(np.array(state_queue),
                                       [1, n_size, n_size, 4])))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)

                if done:  # Penalty
                    reward = -100

                curr_100 += reward

                # next_state_queue is one frame ahead of state_queue here,
                # so the pair forms the (s, s') of this transition.
                next_state_queue.append(next_state)

                replay_buffer.add(np.array(state_queue), action, reward,
                                  np.array(next_state_queue), done)

                if step_count % TRAIN_EPISODE == 0:
                    loss = _dqn_train_step(mainDQN, targetDQN, replay_buffer,
                                           batch_size, n_size, discount)
                    print("eps: {} step: {} loss: {}".format(
                        eps, step_count, loss))

                    if curr_100 > prev_100:
                        save_path = saver.save(
                            sess,
                            os.path.join(checkpoint_dir, save_file_name))
                        print("Model saved in file: %s" % save_path)

                    prev_100 = curr_100
                    curr_100 = 0

                if step_count % TARGET_UPDATE_EPS == 0:
                    sess.run(copy_ops)

                state_queue.append(next_state)
    finally:
        # Release the session and the environment even if training fails.
        sess.close()
        env.close()