def test_full_run():
    import numpy as np
    from atari_py.ale_python_interface import ALEInterface

    game = "atari_roms/breakout.bin"

    ale = ALEInterface()

    # Get & Set the desired settings
    ale.setInt('random_seed', 123)

    # Load the ROM file
    ale.loadROM(game)

    # Get the list of legal actions
    legal_actions = ale.getLegalActionSet()

    batch_size = 10
    exp_replay = ReplayBuffer(batch_size)

    (screen_width, screen_height) = ale.getScreenDims()

    import os
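    # parse total / used / free memory from the last ("Total:") line of `free -th`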
    tot_m, used_m, free_m = os.popen("free -th").readlines()[-1].split()[1:]
    last_counter = 0
    random_state = np.random.RandomState(218)
    print("initial: {}, {}, {}".format(tot_m, used_m, free_m))
    # Play 2k episodes
    for episode in range(2000):
        total_reward = 0
        S = np.zeros(screen_width * screen_height, dtype=np.uint8)
        S = S.reshape(screen_height, screen_width)[:84, :84]
        this_counter = exp_replay.sent_counter
        if this_counter > last_counter + 1000:
            last_counter = this_counter
            tot_m, used_m, free_m = os.popen(
                "free -th").readlines()[-1].split()[1:]
            # the first three printed entries (sent counter, memory length,
            # lookup length) should match until 1M steps; after that the
            # memory and lookup lengths should stay in lockstep
            print("{}: {}, {}; {}, {}, {}".format(
                exp_replay.sent_counter, len(exp_replay.memory),
                len(exp_replay.reverse_experience_lookup.keys()), tot_m,
                used_m, free_m))
        while not ale.game_over():
            S_prime = np.zeros(screen_width * screen_height, dtype=np.uint8)
            ale.getScreen(S_prime)
            S_prime = S_prime.reshape(screen_height, screen_width)[:84, :84]
            a = random_state.choice(len(legal_actions))
            action = legal_actions[a]
            # Apply an action and get the resulting reward
            reward = ale.act(action)
            won = 0
            ongoing_flag = 1
            experience = (S_prime, action, reward, won, ongoing_flag)
            S = S_prime
            exp_replay.add_experience(experience)
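            # exercise both sampling paths: a random minibatch and an explicit index list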
            batch = exp_replay.get_minibatch()
            batch = exp_replay.get_minibatch(index_list=[1, 2, 3, 10, 11])
            if batch is not None:
                mb_S = batch[0]
                other_info = batch[1]
            del batch
            total_reward += reward
        print('Episode', episode, 'ended with score:', total_reward)
        ale.reset_game()

    lst = 0
    for i in range(10000):
        if i > lst + 1000:
            tot_m, used_m, free_m = os.popen(
                "free -th").readlines()[-1].split()[1:]
            print("POST MEM {}: {}, {}; {}, {}, {}".format(
                exp_replay.sent_counter, len(exp_replay.memory),
                len(exp_replay.reverse_experience_lookup.keys()), tot_m,
                used_m, free_m))
            lst = i

        batch = exp_replay.get_minibatch()
        mb_S = batch[0]
        other_info = batch[1]
    from IPython import embed
    embed()
    raise ValueError()
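# NOTE: the ReplayBuffer class exercised by test_full_run is not shown in this
# example. The class below is a minimal, hypothetical stand-in, written only to
# illustrate the interface the test relies on (add_experience, get_minibatch
# with an optional index_list, and the sent_counter / memory /
# reverse_experience_lookup attributes); the real implementation may differ.
import collections

import numpy as np


class MinimalReplayBuffer(object):
    def __init__(self, batch_size, max_size=1000000, seed=0):
        self.batch_size = batch_size
        self.max_size = max_size
        self.random_state = np.random.RandomState(seed)
        self.memory = collections.deque(maxlen=max_size)
        # maps a running experience id back to its tuple, mirroring the
        # bookkeeping that the memory-usage prints above inspect
        self.reverse_experience_lookup = {}
        self.sent_counter = 0

    def add_experience(self, experience):
        if len(self.memory) == self.max_size:
            # the deque is about to evict its oldest entry; drop the matching
            # id so the lookup stays in step with memory
            self.reverse_experience_lookup.pop(self.sent_counter - self.max_size, None)
        self.memory.append(experience)
        self.reverse_experience_lookup[self.sent_counter] = experience
        self.sent_counter += 1

    def get_minibatch(self, index_list=None):
        if len(self.memory) < self.batch_size:
            return None
        if index_list is None:
            index_list = self.random_state.choice(
                len(self.memory), size=self.batch_size, replace=False)
        samples = [self.memory[i] for i in index_list if i < len(self.memory)]
        states = np.array([s[0] for s in samples])
        other_info = [s[1:] for s in samples]
        return states, other_info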
Example #2
class DQN(object):
    def __init__(self,state_space,action_space,seed,update_every,batch_size,buffer_size,learning_rate):
        self.action_space = action_space
        self.state_space = state_space
        self.seed = random.seed(seed)
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.update_every = update_every
        
        self.qnetwork_local = QNetwork(state_space,action_space).to(device)
        self.qnetwork_target = QNetwork(state_space,action_space).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),lr=learning_rate)
        # Initialize replaybuffer
        self.memory = ReplayBuffer(action_space,buffer_size,batch_size,seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        
    def step(self,state,action,reward,next_state,done,GAMMA):
        # Save the experience
        self.memory.add_experience(state,action,reward,next_state,done)
        
        # learn from the experience
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences,GAMMA)
        
    def act(self,state,eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_space))
        
    def learn(self,experiences,GAMMA):
        
        states,actions,rewards,next_states,dones = experiences
        
        # TD targets from the target network; detach so no gradients flow into it
        target_values = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        targets = rewards + (GAMMA * target_values * (1 - dones))
        action_values = self.qnetwork_local(states).gather(1,actions)
        loss = F.mse_loss(action_values,targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(TAU)
        
    def soft_update(self,tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for local_param,target_param in zip(self.qnetwork_local.parameters(),self.qnetwork_target.parameters()):
            target_param.data.copy_(tau*local_param.data + (1-tau)*target_param.data)
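# NOTE: QNetwork, device, and TAU are referenced by the DQN agent above but are
# not part of this example. The definitions below are a minimal sketch of what
# the agent assumes (a small fully connected Q-network plus module-level
# device / soft-update constants); the original project may define them
# differently.
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TAU = 1e-3  # assumed soft-update interpolation rate


class QNetwork(nn.Module):
    """Maps a state vector to one Q-value per action."""

    def __init__(self, state_space, action_space, hidden_size=64):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_space, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_space)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)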
Example #3
def main(argv):

    env = gym.make(config.game_name)
    env = wrap_deepmind(env, config.episode_life, config.preprocess,
                        config.max_and_skip, config.clip_rewards,
                        config.no_op_reset, config.scale)

    num_actions = env.action_space.n

    sess = tf.Session()

    agent = DQNAgent(sess=sess, num_actions=num_actions)

    sess.run(tf.global_variables_initializer())

    rewards = tf.placeholder(dtype=tf.float32, shape=(None,), name='reward')

    saver = tf.train.Saver()
    tf.summary.scalar('avg.reward/ep', tf.reduce_mean(rewards))
    tf.summary.scalar('max.reward/ep', tf.reduce_max(rewards))

    writer = tf.summary.FileWriter('logs_12_v4_allwrap_constant_lr',
                                   sess.graph)
    summary_merged = tf.summary.merge_all()

    episode_rewards = []
    batch_loss = []

    replay_buffer = ReplayBuffer()
    time_step = 0
    episode = 0
    total_reward_list = []

    #scheduler
    e = e_scheduler()
    lr = lr_scheduler()

    while time_step < config.MAX_TIME_STEPS:

        done = False
        total_reward = 0
        '''
        frame --> 84 x 84 x 1
        state --> 84 x 84 x 4
        '''

        frame = env.reset()

        frame_scale = np.array(frame).astype(np.float32) / 255.0

        #the very first frame has no past_frames yet, so initialize them as (84x84) zero arrays
        past_frames = np.zeros(
            (config.height, config.width, agent.history_length - 1),
            dtype=np.uint8)  #uint8 copy for storage
        past_frames_scale = np.zeros(
            (config.height, config.width, agent.history_length - 1),
            dtype=np.float32)  #scaled copy for training

        state = agent.process_state_into_stacked_frames(frame,
                                                        past_frames,
                                                        past_state=None)
        state_scale = np.array(state).astype(np.float32) / 255.0

        while not done:

            if (np.random.rand() < e.get()
                    or time_step < config.REPLAY_START_SIZE):
                action = env.action_space.sample()
            else:
                action = agent.predict_action(state_scale)
            time_step += 1

            frame_after, reward, done, info = env.step(action)

            frame_after_scale = np.array(frame_after).astype(
                np.float32) / 255.0

            replay_buffer.add_experience(state, action, reward, done)

            if not done:  #+21 or -21

                #append the newly observed frame onto the past state
                state_after = agent.process_state_into_stacked_frames(
                    frame_after, past_frames, past_state=state)

                state_after_scale = np.array(state_after).astype(
                    np.float32) / 255.0

                past_frames = np.concatenate((past_frames, frame_after),
                                             axis=2)
                past_frames = past_frames[:, :, 1:]

                past_frames_scale = np.array(past_frames).astype(
                    np.float32) / 255.0

                #print(past_frames.shape)
                state = state_after
                state_scale = state_after_scale

            total_reward += reward

            #training
            if time_step > config.REPLAY_START_SIZE and time_step % config.LEARNING_FREQ == 0:
                e.update(time_step)
                lr.update(time_step)

                b_state, b_action, b_reward, b_state_after, b_done = replay_buffer.sample_batch(
                    config.BATCH_SIZE)

                Q_of_state_after = agent.sess.run(
                    agent.target_Q,
                    feed_dict={agent.target_state: b_state_after})

                target_Q_p = []
                for i in range(config.BATCH_SIZE):
                    if b_done[i]:
                        target_Q_p.append(b_reward[i])
                    else:
                        target_Q_p.append(b_reward[i] +
                                          config.DISCOUNT_FACTOR *
                                          np.max(Q_of_state_after[i]))

                agent.sess.run(
                    [agent.train_step, agent.Q, agent.loss], {
                        agent.target_Q_p: target_Q_p,
                        agent.action: b_action,
                        agent.state: b_state,
                        agent.lr: lr.get()
                    })

            if time_step % config.target_UPDATE_FREQ == 0:
                agent.sess.run(agent.update_fn)

            if time_step % config.REWARD_RECORD_FREQ == 0 and len(
                    total_reward_list) != 0:
                summary = sess.run(summary_merged,
                                   feed_dict={rewards: total_reward_list})
                writer.add_summary(summary, time_step)
                total_reward_list = []

            if time_step % config.MODEL_RECORD_FREQ == 0:
                saver.save(sess,
                           'model_12_v4_allwrap_constant_lr/dqn.ckpt',
                           global_step=time_step)

        #bookkeeping only, unrelated to training
        episode += 1
        #For Debugging
        if episode % 100 == 0:
            print('episode : %d score: %d' % (episode, total_reward))

        total_reward_list.append(total_reward)
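# NOTE: e_scheduler and lr_scheduler are referenced above but not defined in
# this example. The class below is a plausible minimal sketch of such a
# schedule, assuming linear annealing between two bounds; the original
# schedules (and their parameter values) may differ.
class LinearScheduler(object):
    def __init__(self, start, end, decay_steps):
        self.start = start
        self.end = end
        self.decay_steps = decay_steps
        self.value = start

    def update(self, time_step):
        # linearly anneal from start to end over decay_steps, then hold at end
        frac = min(float(time_step) / self.decay_steps, 1.0)
        self.value = self.start + frac * (self.end - self.start)

    def get(self):
        return self.value


# Hypothetical usage matching the calls in main(): epsilon annealed over the
# first million steps, learning rate held constant (as the "constant_lr"
# log directory name suggests).
# e = LinearScheduler(1.0, 0.1, int(1e6))
# lr = LinearScheduler(2.5e-4, 2.5e-4, 1)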