def test_full_run():
    from atari_py.ale_python_interface import ALEInterface

    game = "atari_roms/breakout.bin"
    ale = ALEInterface()

    # Get & set the desired settings
    ale.setInt('random_seed', 123)

    # Load the ROM file
    ale.loadROM(game)

    # Get the list of legal actions
    legal_actions = ale.getLegalActionSet()

    batch_size = 10
    exp_replay = ReplayBuffer(batch_size)
    (screen_width, screen_height) = ale.getScreenDims()

    import os
    tot_m, used_m, free_m = os.popen("free -th").readlines()[-1].split()[1:]
    last_counter = 0
    random_state = np.random.RandomState(218)
    print("initial: {}, {}, {}".format(tot_m, used_m, free_m))

    # Play 2k episodes
    for episode in range(2000):
        total_reward = 0
        S = np.zeros(screen_width * screen_height, dtype=np.uint8)
        S = S.reshape(screen_height, screen_width)[:84, :84]

        this_counter = exp_replay.sent_counter
        if this_counter > last_counter + 1000:
            last_counter = this_counter
            tot_m, used_m, free_m = os.popen(
                "free -th").readlines()[-1].split()[1:]
            # the first three entries should match until 1M steps,
            # then the last two should continue in lock step
            print("{}: {}, {}; {}, {}, {}".format(
                exp_replay.sent_counter,
                len(exp_replay.memory),
                len(exp_replay.reverse_experience_lookup.keys()),
                tot_m, used_m, free_m))

        while not ale.game_over():
            S_prime = np.zeros(screen_width * screen_height, dtype=np.uint8)
            ale.getScreen(S_prime)
            S_prime = S_prime.reshape(screen_height, screen_width)[:84, :84]

            a = random_state.choice(len(legal_actions))
            action = legal_actions[a]
            # Apply an action and get the resulting reward
            reward = ale.act(action)

            won = 0
            ongoing_flag = 1
            experience = (S_prime, action, reward, won, ongoing_flag)
            S = S_prime
            exp_replay.add_experience(experience)

            batch = exp_replay.get_minibatch()
            batch = exp_replay.get_minibatch(index_list=[1, 2, 3, 10, 11])
            if batch is not None:
                mb_S = batch[0]
                other_info = batch[1]
                del batch
            total_reward += reward

        print('Episode', episode, 'ended with score:', total_reward)
        ale.reset_game()

    # After playing, keep sampling minibatches to check that memory use stays flat
    lst = 0
    for i in range(10000):
        if i > lst + 1000:
            tot_m, used_m, free_m = os.popen(
                "free -th").readlines()[-1].split()[1:]
            print("POST MEM {}: {}, {}; {}, {}, {}".format(
                exp_replay.sent_counter,
                len(exp_replay.memory),
                len(exp_replay.reverse_experience_lookup.keys()),
                tot_m, used_m, free_m))
            lst = i
        batch = exp_replay.get_minibatch()
        mb_S = batch[0]
        other_info = batch[1]

    # Drop into an interactive shell for manual inspection
    from IPython import embed
    embed()
    raise ValueError()
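
# A minimal sketch of the ReplayBuffer interface that test_full_run() exercises
# (add_experience, get_minibatch(index_list=...), sent_counter, memory,
# reverse_experience_lookup). This is an assumption about the API, not the
# implementation used elsewhere in this repo: here experiences are stored once
# per unique key in a reverse lookup and sampled uniformly.
class _SketchReplayBuffer(object):
    def __init__(self, batch_size, max_size=100000, seed=4142):
        self.batch_size = batch_size
        self.max_size = max_size
        self.memory = []                      # ordered list of keys
        self.reverse_experience_lookup = {}   # key -> experience tuple
        self.sent_counter = 0
        self.random_state = np.random.RandomState(seed)

    def add_experience(self, experience):
        # experience is (S_prime, action, reward, won, ongoing_flag)
        self.sent_counter += 1
        key = hash(experience[0].tobytes()) ^ hash(experience[1:])
        if key not in self.reverse_experience_lookup:
            self.reverse_experience_lookup[key] = experience
            self.memory.append(key)
        if len(self.memory) > self.max_size:
            # evict oldest entry so memory use stays bounded
            old_key = self.memory.pop(0)
            self.reverse_experience_lookup.pop(old_key, None)

    def get_minibatch(self, index_list=None):
        if len(self.memory) < self.batch_size:
            return None
        if index_list is None:
            index_list = self.random_state.choice(
                len(self.memory), size=self.batch_size, replace=False)
        keys = [self.memory[i % len(self.memory)] for i in index_list]
        batch = [self.reverse_experience_lookup[k] for k in keys]
        states = np.array([b[0] for b in batch])
        other_info = [b[1:] for b in batch]
        return states, other_info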
class DQN(object):
    def __init__(self, state_space, action_space, seed, update_every,
                 batch_size, buffer_size, learning_rate):
        self.action_space = action_space
        self.state_space = state_space
        self.seed = random.seed(seed)
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.update_every = update_every

        self.qnetwork_local = QNetwork(state_space, action_space)
        self.qnetwork_target = QNetwork(state_space, action_space)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=learning_rate)

        # Initialize replay buffer
        self.memory = ReplayBuffer(action_space, buffer_size, batch_size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, GAMMA):
        # Save the experience
        self.memory.add_experience(state, action, reward, next_state, done)

        # Learn every `update_every` steps, once enough samples are stored
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_space))

    def learn(self, experiences, GAMMA):
        states, actions, rewards, next_states, dones = experiences

        # Max predicted Q values for the next states, from the target network
        target_values = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # One-step TD targets; terminal transitions keep only the reward
        targets = rewards + (GAMMA * target_values * (1 - dones))
        # Q values of the taken actions, from the local network
        action_values = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(action_values, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(TAU)

    def soft_update(self, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for local_param, target_param in zip(self.qnetwork_local.parameters(),
                                             self.qnetwork_target.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)
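
# A short usage sketch for the DQN agent above, assuming a Gym-style
# environment with discrete actions. The hyperparameters and epsilon decay
# constants here are illustrative placeholders, not values from this repo.
def train_sketch(env, n_episodes=500, eps_start=1.0, eps_end=0.01,
                 eps_decay=0.995, gamma=0.99):
    agent = DQN(state_space=env.observation_space.shape[0],
                action_space=env.action_space.n,
                seed=0, update_every=4, batch_size=64,
                buffer_size=int(1e5), learning_rate=5e-4)
    eps = eps_start
    scores = []
    for episode in range(n_episodes):
        state = env.reset()
        score = 0.0
        done = False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            # store the transition and (every update_every steps) learn from a minibatch
            agent.step(state, action, reward, next_state, done, gamma)
            state = next_state
            score += reward
        eps = max(eps_end, eps_decay * eps)  # anneal exploration per episode
        scores.append(score)
    return scores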
def main(argv):
    env = gym.make(config.game_name)
    env = wrap_deepmind(env, config.episode_life, config.preprocess,
                        config.max_and_skip, config.clip_rewards,
                        config.no_op_reset, config.scale)
    num_actions = env.action_space.n

    sess = tf.Session()
    agent = DQNAgent(sess=sess, num_actions=num_actions)
    sess.run(tf.global_variables_initializer())

    rewards = tf.placeholder(dtype=tf.float32, shape=(None), name='reward')
    saver = tf.train.Saver()
    tf.summary.scalar('avg.reward/ep', tf.reduce_mean(rewards))
    tf.summary.scalar('max.reward/ep', tf.reduce_max(rewards))
    writer = tf.summary.FileWriter('logs_12_v4_allwrap_constant_lr', sess.graph)
    summary_merged = tf.summary.merge_all()

    episode_rewards = []
    batch_loss = []
    replay_buffer = ReplayBuffer()
    time_step = 0
    episode = 0
    total_reward_list = []

    # Schedulers for exploration rate and learning rate
    e = e_scheduler()
    lr = lr_scheduler()

    while time_step < config.MAX_TIME_STEPS:
        done = False
        total_reward = 0

        '''
        frame --> 84 x 84 x 1
        state --> 84 x 84 x 4
        '''
        frame = env.reset()
        frame_scale = np.array(frame).astype(np.float32) / 255.0

        # On the very first frame there are no past frames yet,
        # so initialize them with (84 x 84) zero matrices.
        past_frames = np.zeros(
            (config.height, config.width, agent.history_length - 1),
            dtype=np.uint8)  # for storage
        past_frames_scale = np.zeros(
            (config.height, config.width, agent.history_length - 1),
            dtype=np.float32)  # for training

        state = agent.process_state_into_stacked_frames(frame,
                                                        past_frames,
                                                        past_state=None)
        state_scale = np.array(state).astype(np.float32) / 255.0

        while not done:
            # Epsilon-greedy action selection; act randomly until the replay
            # buffer has enough experience to start learning
            if np.random.rand() < e.get() or time_step < config.REPLAY_START_SIZE:
                action = env.action_space.sample()
            else:
                action = agent.predict_action(state_scale)
            time_step += 1

            frame_after, reward, done, info = env.step(action)
            frame_after_scale = np.array(frame_after).astype(np.float32) / 255.0

            replay_buffer.add_experience(state, action, reward, done)

            if not done:  # +21 or -21
                # Append the newly received frame to the past state.
                state_after = agent.process_state_into_stacked_frames(
                    frame_after, past_frames, past_state=state)
                state_after_scale = np.array(state_after).astype(np.float32) / 255.0

                past_frames = np.concatenate((past_frames, frame_after), axis=2)
                past_frames = past_frames[:, :, 1:]
                past_frames_scale = np.array(past_frames).astype(np.float32) / 255.0

                state = state_after
                state_scale = state_after_scale

            total_reward += reward

            # Training
            if time_step > config.REPLAY_START_SIZE and time_step % config.LEARNING_FREQ == 0:
                e.update(time_step)
                lr.update(time_step)

                b_state, b_action, b_reward, b_state_after, b_done = \
                    replay_buffer.sample_batch(config.BATCH_SIZE)

                Q_of_state_after = agent.sess.run(
                    agent.target_Q,
                    feed_dict={agent.target_state: b_state_after})

                # One-step TD targets: r for terminal transitions,
                # r + gamma * max_a' Q_target(s', a') otherwise
                target_Q_p = []
                for i in range(config.BATCH_SIZE):
                    if b_done[i]:
                        target_Q_p.append(b_reward[i])
                    else:
                        target_Q_p.append(b_reward[i] +
                                          config.DISCOUNT_FACTOR *
                                          np.max(Q_of_state_after[i]))

                agent.sess.run(
                    [agent.train_step, agent.Q, agent.loss], {
                        agent.target_Q_p: target_Q_p,
                        agent.action: b_action,
                        agent.state: b_state,
                        agent.lr: lr.get()
                    })

            if time_step % config.target_UPDATE_FREQ == 0:
                agent.sess.run(agent.update_fn)

            if time_step % config.REWARD_RECORD_FREQ == 0 and len(total_reward_list) != 0:
                summary = sess.run(summary_merged,
                                   feed_dict={rewards: total_reward_list})
                writer.add_summary(summary, time_step)
                total_reward_list = []

            if time_step % config.MODEL_RECORD_FREQ == 0:
                saver.save(sess,
                           'model_12_v4_allwrap_constant_lr/dqn.ckpt',
                           global_step=time_step)

        # Bookkeeping, not part of training
        episode += 1

        # For debugging
        if episode % 100 == 0:
            print('episode: %d  score: %d' % (episode, total_reward))

        total_reward_list.append(total_reward)
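

# A minimal sketch of the epsilon scheduler that main() assumes
# (e_scheduler with get() / update(time_step)). The linear annealing from
# 1.0 to 0.1 over a fixed number of steps is an assumption; the real
# scheduler in this repo may use different endpoints or a different shape.
class e_scheduler_sketch(object):
    def __init__(self, start=1.0, end=0.1, anneal_steps=1000000):
        self.start = start
        self.end = end
        self.anneal_steps = anneal_steps
        self.value = start

    def get(self):
        return self.value

    def update(self, time_step):
        # Linearly interpolate between start and end, then clamp at end.
        frac = min(float(time_step) / self.anneal_steps, 1.0)
        self.value = self.start + frac * (self.end - self.start)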