Example #1
def play_game(options):
    """Play Flappy Bird with a pretrained DQN model.

       options.ckpt_path -- checkpoint file containing the DQN weights
       options.cuda      -- run the model on the GPU if True
       options.max_score -- score threshold used to stop the demo episode
    """
    model = QNetwork()
    if options.ckpt_path is None:
        print('You must provide a checkpoint file name (options.ckpt_path).')
        return
    print('Loading previous model weights: {}'.format(options.ckpt_path))
    episode, epsilon = load_checkpoint(options.ckpt_path, model)

    if options.cuda:
        model = model.cuda()

    # No optimizer is needed for inference; the optim module is passed only
    # as a placeholder argument.
    algorithm = DQN(model, optim, epsilon, options)

    algorithm.set_eval()
    bird_game = game.GameState()
    bird_game.FPS = 480  # speed up the game clock for the demo

    action = [1, 0]
    o, r, terminal = bird_game.frame_step(action)
    o = preprocess(o)

    rpm = ReplayMemory(1, options)
    rpm.append(o, action, r, terminal)

    start = time.time()
    fc = 0
    score = 0
    while True:
        # The memory holds a single transition, so sample(1) returns the most
        # recent (previous_obs, action, reward, obs, terminal) tuple.
        prev_o, a, r, o, terminal = rpm.sample(1)

        # q = algorithm(o).cpu().detach().numpy()[0]

        score = max(score, bird_game.score)
        action = algorithm.get_optim_action(o)
        o, r, terminal = bird_game.frame_step(action)
        
        o = preprocess(o)

        # img = Image.fromarray((o*255).astype(np.uint8)).convert(mode='L')
        # img.save(f'{fc}-{r}-{q.argmax()}.png')
        # fc += 1
        if terminal or score > options.max_score*2:
            break

        rpm.append(o, action, r, terminal)

    ela = time.time() - start
    print(f'Final Score {score}, FPS {bird_game.FPS}, {int(ela//60)}m{int(ela%60)}s')
    

# if __name__ == "__main__":
#     main()
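play_game reads its settings from a single options object rather than from individual arguments. The sketch below shows one way such an object could be built with argparse; the flag names and defaults are assumptions for illustration, not the project's actual command-line interface.

# A minimal sketch of how play_game might be invoked; the flag names below
# are illustrative assumptions, not the project's actual CLI.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--ckpt_path', default=None,
                    help='checkpoint file with the trained DQN weights')
parser.add_argument('--cuda', action='store_true',
                    help='run the network on the GPU')
parser.add_argument('--max_score', type=int, default=100,
                    help='the demo stops once the score exceeds twice this value')
options = parser.parse_args()
play_game(options)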
Example #2
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        if torch.cuda.is_available():
            self.qnetwork_local = self.qnetwork_local.cuda()
            self.qnetwork_target = self.qnetwork_target.cuda()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
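The Agent above is normally driven by an outer episode loop that alternates act and step while decaying epsilon. The sketch below assumes the classic Gym API (reset() returns the state, step() returns (next_state, reward, done, info)); the environment name, network sizes, and the epsilon schedule are illustrative assumptions, not code from this repository.

# A minimal sketch of an outer training loop for the Agent above.
import gym

env = gym.make('LunarLander-v2')                  # any discrete-action env works
agent = Agent(state_size=8, action_size=4, seed=0)

eps, eps_end, eps_decay = 1.0, 0.01, 0.995        # illustrative schedule
for episode in range(1, 1001):
    state = env.reset()
    score = 0
    while True:
        action = agent.act(state, eps)                        # epsilon-greedy action
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)   # store and (maybe) learn
        state = next_state
        score += reward
        if done:
            break
    eps = max(eps_end, eps_decay * eps)                       # decay exploration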
Example #3
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 fc1_units=64,
                 fc2_units=64,
                 fc3_units=None,
                 double_q=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_q = double_q

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                       fc1_units, fc2_units,
                                       fc3_units).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                        fc1_units, fc2_units,
                                        fc3_units).to(device)
        if torch.cuda.is_available():
            self.qnetwork_local.cuda()
            self.qnetwork_target.cuda()
        else:
            self.qnetwork_local.cpu()
            self.qnetwork_target.cpu()
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=LR,
                                    weight_decay=WD)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def get_action(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # Epsilon-greedy action selection
        if random.random() <= eps:
            return np.random.choice(np.arange(self.action_size))
        else:
            return action_values.argmax().item()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        if self.double_q:
            # Double Q-learning: the local network selects the best next
            # action, the target network evaluates it.
            argmax_a = self.qnetwork_local(next_states).detach().argmax(
                dim=1).unsqueeze(dim=1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, argmax_a)
        else:
            # Vanilla DQN: take the maximum target-network Q value directly.
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Zero out the bootstrap term for terminal transitions.
        Q_targets = rewards + gamma * Q_targets_next * (1 - dones)
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
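The double-Q target used in learn differs from the vanilla DQN target only in how the next action is chosen. The standalone sketch below shows both selections on small made-up tensors; it is illustrative only and not code from the repository.

# Standalone sketch of vanilla vs. double-Q target selection on toy tensors.
import torch

q_local_next = torch.tensor([[1.0, 3.0], [2.0, 0.5]])    # local net  Q(s', .)
q_target_next = torch.tensor([[0.8, 2.0], [1.5, 2.5]])   # target net Q(s', .)

# Vanilla DQN: maximum of the target network's own estimates.
vanilla = q_target_next.max(1)[0].unsqueeze(1)            # [[2.0], [2.5]]

# Double DQN: the local net picks the action, the target net evaluates it.
argmax_a = q_local_next.argmax(dim=1).unsqueeze(1)        # [[1], [0]]
double = q_target_next.gather(1, argmax_a)                # [[2.0], [1.5]]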
Example #4
def train_dqn(options):
    max_episode = options.max_episode

    flappyBird = game.GameState()
    print(f'FPS {flappyBird.FPS}')

    rpm = ReplayMemory(options.rpm_size, options)  # experience replay buffer for DQN

    model = QNetwork()
    if options.resume and options.ckpt_path is not None:
        print('Loading previous model weights: {}'.format(options.ckpt_path))
        episode, epsilon = load_checkpoint(options.ckpt_path, model)
    else:
        epsilon = options.init_e
        episode = 0

    if options.cuda:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=options.lr)
    algorithm = DQN(model, optimizer, epsilon, options)

    # Pre-fill the replay memory so the first training batches are diverse enough.
    while len(rpm) < options.rpm_size/4:
        run_episode(algorithm, flappyBird, rpm, options)

    print(f'observation done {len(rpm)}')

    # Start training.
    logname = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime())
    logger = get_logger(f'log/{logname}.log')
    best_reward = 0
    max_score = 0
    begin = time.time()
    while episode < max_episode:  # train for max_episode episodes; evaluation runs are not counted
        # train part

        reward, loss, score = run_episode(algorithm, flappyBird, rpm, options)
        algorithm.epsilon = max(algorithm.final_e, algorithm.epsilon - algorithm.e_decrement)
        episode += 1
        max_score = max(max_score, score)

        if episode % 10 == 0:
            logger.info('episode:[{}/{}]\tscore:{:.3f}\ttrain_reward:{:.5f}\tloss:{:.5f}'.format(
                episode, max_episode, score, reward, loss))
        
        # test part
        if episode % options.evaluate_freq == 0:
            eval_reward, score = evaluate(flappyBird, algorithm, options)
            mid = time.time()
            elapsed = round(mid-begin)
            logger.info('episode:[{}/{}]\tscore:{:.3f}\tepsilon:{:.5f}\ttest_reward:{:.5f}\t{}:{}'.format(
                episode, max_episode, score, algorithm.epsilon, eval_reward, elapsed//60, elapsed%60))
            if eval_reward > best_reward:
                best_reward = eval_reward
                save_path = f'ckpt/best_{score}.ckpt'
                save_checkpoint({
                    'episode': episode,
                    'epsilon': algorithm.epsilon,
                    'state_dict': model.state_dict(),
                    }, False, save_path
                )

        if episode % 1000 == 0:
            save_path = f'ckpt/episode_{episode}.ckpt'
            save_checkpoint({
                'episode': episode,
                'epsilon': algorithm.epsilon,
                'state_dict': model.state_dict(),
                }, False, save_path
            )

    # Training finished; save the final model.
    save_path = f'ckpt/final_{episode}_{score}.ckpt'
    save_checkpoint({
        'episode': episode,
        'epsilon': algorithm.epsilon,
        'state_dict': model.state_dict(),
        }, False, save_path)

    mid = time.time()
    elapsed = round(mid-begin)
    logger.info('training completed, {} episodes, {}m {}s'.format(max_episode, elapsed//60, elapsed%60))
    print(f'max_score {max_score}')
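train_dqn and play_game rely on save_checkpoint and load_checkpoint helpers that are not shown here. The sketch below only mirrors how they are called above (the checkpoint dict keys and the returned (episode, epsilon) pair); the actual implementations in the repository may differ.

# Hypothetical sketches of the checkpoint helpers used above; signatures are
# inferred from the call sites, not taken from the repository.
import torch

def save_checkpoint(state, is_best, path):
    # `state` carries 'episode', 'epsilon' and the model 'state_dict', as
    # built in train_dqn. is_best is accepted for signature compatibility;
    # the real helper may additionally copy the file when it is True.
    torch.save(state, path)

def load_checkpoint(path, model):
    ckpt = torch.load(path, map_location='cpu')
    model.load_state_dict(ckpt['state_dict'])
    return ckpt['episode'], ckpt['epsilon']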
Example #5
class Agent():
    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, random_seed)
        self.qnetwork_target = QNetwork(state_size, action_size, random_seed)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Move to GPU if CUDA is available
        if train_on_gpu:
            self.qnetwork_local.cuda()
            self.qnetwork_target.cuda()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY

        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().unsqueeze(0)
        if train_on_gpu:
            state = state.cuda()
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        # Epsilon-greedy policy expressed as a probability distribution over
        # actions, then sampled from.
        max_action = np.argmax(action_values.cpu().data.numpy())
        policy_s = np.ones(self.action_size) * eps / self.action_size
        policy_s[max_action] = 1 - eps + (eps / self.action_size)
        action = np.random.choice(np.arange(self.action_size), p=policy_s)
        return action

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute loss
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ----------------------- update target network ----------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
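All three Agent classes above assume a ReplayBuffer with the same small interface: a constructor taking (action_size, buffer_size, batch_size, seed), add, sample returning batched tensors, and __len__. The sketch below is a minimal version of that interface, close to the common Udacity-style implementation; the repositories' own buffers may differ in detail.

# A minimal sketch of the ReplayBuffer interface the agents above rely on.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple('Experience',
                                     ['state', 'action', 'reward',
                                      'next_state', 'done'])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        # Store a single transition.
        self.memory.append(self.experience(state, action, reward,
                                           next_state, done))

    def sample(self):
        # Draw a random batch and stack it into tensors on the target device.
        batch = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(
            np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in batch])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)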