Example #1
def do_q_learning(env, reward_function, train_episodes, figure=False):
    alpha = 0.01
    gamma = 0.9
    epsilon = 0.1
    policy = DQNPolicy(env, lr=alpha, gamma=gamma, input=2,
                       output=4)  # 2-dim state input; 4 actions: up, right, down, left
    replay_buffer = ReplayBuffer()
    # Optionally, run a random policy first to inspect the environment:
    # run_current_policy(env.env, policy)
    agg_interval = 100
    avg_history = {'episodes': [], 'timesteps': [], 'reward': []}
    # Train the network to predict actions for each of the states
    for episode_i in range(train_episodes):
        episode_timestep = 0
        episode_reward = 0.0
        env.__init__()  # re-initialize the environment in place (the start state is expected to be 0)
        cur_state = env.cur_state
        counter = 0
        done = False
        while not done:
            # cap each episode at 30 steps
            counter += 1
            done = counter >= 30

            # epsilon-greedy action selection from the current Q-network
            action = policy.select_action(cur_state.reshape(1, -1), epsilon)

            # take action in the environment
            next_state = env.step(action)
            reward = reward_function(next_state)

            # add the transition to replay buffer
            replay_buffer.add(cur_state, action, next_state, reward, done)

            # sample minibatch of transitions from the replay buffer
            # the sampling is done every timestep and not every episode
            sample_transitions = replay_buffer.sample()

            # update the policy using the sampled transitions
            policy.update_policy(**sample_transitions)

            episode_reward += reward
            episode_timestep += 1

            cur_state = next_state

        avg_history['episodes'].append(episode_i + 1)
        avg_history['timesteps'].append(episode_timestep)
        avg_history['reward'].append(episode_reward)

        learning_policy_progress.update()  # progress bar, assumed to be defined outside this function

    if figure:
        plt.plot(avg_history['episodes'], avg_history['reward'])
        plt.title('Reward')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.show()
    return policy.q_model
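
The snippet above relies on DQNPolicy and ReplayBuffer helpers that are not shown. A minimal sketch of the replay-buffer interface it assumes (add per transition, sample returning keyword arguments for policy.update_policy) could look like the following; the class name and the dictionary keys are assumptions, not the original implementation.

import random
import numpy as np


class SimpleReplayBuffer:
    """Sketch of the buffer interface assumed by do_q_learning above."""

    def __init__(self, capacity=10000, batch_size=32):
        self.capacity = capacity
        self.batch_size = batch_size
        self.storage = []

    def add(self, cur_state, action, next_state, reward, done):
        # drop the oldest transition once the buffer is full
        if len(self.storage) >= self.capacity:
            self.storage.pop(0)
        self.storage.append((cur_state, action, next_state, reward, done))

    def sample(self):
        # uniform minibatch, returned as keyword arguments so that
        # policy.update_policy(**sample_transitions) works as in the snippet
        batch = random.sample(self.storage, min(self.batch_size, len(self.storage)))
        cur_states, actions, next_states, rewards, dones = map(np.array, zip(*batch))
        return {'cur_states': cur_states, 'actions': actions,
                'next_states': next_states, 'rewards': rewards, 'dones': dones}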
Example #2
def test_buffer_replace():
    shape = (2, 2)
    capacity = 2
    buffer = ReplayBuffer(capacity)
    for i in range(10):
        x = onp.ones(shape) * i
        a, r = i, i
        discount = 1.0
        timestep = dm_env.TimeStep(dm_env.StepType.FIRST, r, discount, x)
        # the same timestep is used as both the current and next step in this test
        buffer.add(timestep, a, timestep)
        logging.debug("i: {}, r: {}, len(buffer): {}".format(
            i, r, len(buffer)))
    # make sure the buffer recycles if adding more elements than its capacity
    assert len(buffer) == capacity
    # make sure the oldest elements are recycled
    assert onp.array_equal(
        onp.array([buffer[i].s for i in range(len(buffer))]),
        onp.array([[[8.0, 8.0], [8.0, 8.0]], [[9.0, 9.0], [9.0, 9.0]]],
                  dtype=onp.float32),
    )
    assert onp.array_equal(
        onp.array([buffer[i].r for i in range(len(buffer))]),
        onp.array([8.0, 9.0], dtype=onp.float32),
    )
    assert onp.array_equal(
        onp.array([buffer[i].a for i in range(len(buffer))]),
        onp.array([8.0, 9.0], dtype=onp.float32),
    )
    # try sampling with n < len(buffer)
    batch = buffer.sample(1)
    assert len(batch[0]) == 1
    logging.debug(batch)
    # try sampling with n == len(buffer)
    batch = buffer.sample(2)
    assert len(batch[0]) == len(buffer)
    logging.debug(batch)
    # try sampling with n > len(buffer)
    batch = buffer.sample(3)
    assert len(batch[0]) == len(buffer)
    logging.debug(batch)
    return
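
The test above pins down the buffer contract: indexable transitions with fields s, a and r, recycling of the oldest entries at capacity, and sample(n) clamped to the number of stored items. A buffer along these lines would pass it; this is a sketch inferred from the assertions, not the implementation under test.

import collections
import random

import numpy as onp

Transition = collections.namedtuple("Transition", ["s", "a", "r", "s_next"])


class CircularReplayBuffer:
    """Sketch of a circular buffer consistent with test_buffer_replace above."""

    def __init__(self, capacity):
        self.capacity = capacity
        self._storage = []
        self._cursor = 0

    def __len__(self):
        return len(self._storage)

    def __getitem__(self, idx):
        return self._storage[idx]

    def add(self, timestep, action, new_timestep):
        transition = Transition(
            s=onp.asarray(timestep.observation, dtype=onp.float32),
            a=action,
            r=onp.float32(new_timestep.reward),
            s_next=onp.asarray(new_timestep.observation, dtype=onp.float32))
        if len(self._storage) < self.capacity:
            self._storage.append(transition)
        else:
            # overwrite the oldest entry once capacity is reached
            self._storage[self._cursor] = transition
        self._cursor = (self._cursor + 1) % self.capacity

    def sample(self, n):
        # clamp n to the number of stored transitions
        batch = random.sample(self._storage, min(n, len(self._storage)))
        return tuple(onp.stack(field) for field in zip(*batch))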
Example #3
class DQNAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=[64, 64],
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 learning_rate=5e-4,
                 update_every=4):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list of int; optional): number of nodes in each hidden layer
            buffer_size (int; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate (float; optional): learning rate
            update_every (int; optional): how often to update the network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = learning_rate
        self.update_every = update_every

        # detect GPU device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        model_params = [state_size, action_size, seed, hidden_layers]
        self.qnetwork_local = QNetwork(*model_params).to(self.device)
        self.qnetwork_target = QNetwork(*model_params).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed, self.device)
        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every self.update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Calculate target value
        self.qnetwork_target.eval()
        with torch.no_grad():
            Q_dash = self.qnetwork_target(next_states)
            Q_dash_max = torch.max(Q_dash, dim=1, keepdim=True)[0]
            y = rewards + gamma * Q_dash_max * (1 - dones)
        self.qnetwork_target.train()

        # Predict Q-value
        self.optimizer.zero_grad()
        Q = self.qnetwork_local(states)
        y_pred = Q.gather(1, actions)

        # squared TD error, summed over the batch
        loss = torch.sum((y - y_pred)**2)

        # Optimize
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
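
A hedged usage sketch for DQNAgent, assuming a classic-Gym environment (reset returning the state, step returning a 4-tuple) and that QNetwork and ReplayBuffer are importable alongside the class; the environment id and episode counts are placeholders.

import gym

env = gym.make('CartPole-v1')  # placeholder environment
agent = DQNAgent(state_size=env.observation_space.shape[0],
                 action_size=env.action_space.n,
                 seed=0)

eps = 1.0
for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = int(agent.act(state, eps))
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, 0.995 * eps)  # decay epsilon per episode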
Example #4
    cur_state = cp_env.reset()

    while not done:
        # select action
        action = cp_policy.select_action(cur_state.reshape(1, -1), cp_epsilon)

        # take action in the environment
        next_state, reward, done, info = cp_env.step(action)

        # add the transition to replay buffer
        replay_buffer.add(cur_state, action, next_state, reward, done)

        # sample minibatch of transitions from the replay buffer
        # the sampling is done every timestep and not every episode
        sample_transitions = replay_buffer.sample()

        # update the policy using the sampled transitions
        cp_policy.update_policy(**sample_transitions)

        episode_reward += reward
        episode_timestep += 1

        cur_state = next_state

    avg_reward += episode_reward
    avg_timestep += episode_timestep

    if (episode_i + 1) % agg_interval == 0:
        cp_avg_history['episodes'].append(episode_i + 1)
        cp_avg_history['timesteps'].append(avg_timestep / float(agg_interval))
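
The excerpt above starts inside a per-episode loop and refers to names defined outside it (cp_env, cp_policy, cp_epsilon, replay_buffer, the running averages). A hedged reconstruction of that setup, mirroring the hyperparameters of Example #1, might look as follows; every name and value here is an assumption, and the helper constructors are shown commented out because they are not defined in this listing.

import gym

train_episodes = 1000       # placeholder
cp_epsilon = 0.1            # assumed exploration rate
agg_interval = 100          # aggregate statistics every 100 episodes

cp_env = gym.make('CartPole-v0')
# cp_policy = DQNPolicy(cp_env, lr=0.01, gamma=0.9, input=4, output=2)  # assumed helper
# replay_buffer = ReplayBuffer()                                        # assumed helper

cp_avg_history = {'episodes': [], 'timesteps': [], 'reward': []}
avg_reward, avg_timestep = 0.0, 0

# per-episode loop wrapping the excerpt:
# for episode_i in range(train_episodes):
#     episode_reward, episode_timestep = 0.0, 0
#     done = False
#     ...  (the excerpt above, starting with cur_state = cp_env.reset())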
Example #5
        # sample_transitions = replay_buffer_weighted.sample(100)

        # update the policy using the sampled transitions
        # loss2 = update_weighted_policy_stochastic(cur_state, next_state, reward, action, sample_transitions['next_states'], (episode_i//weight_decay)+1)

        episode_weighted_reward += reward
        episode_timestep_weighted += 1
        # loss2_cumulative += loss2
        cur_state = next_state

    # update the policy every update_episode episodes
    if (episode_i+1) % update_episode == 0:
        target_list = torch.Tensor()
        predicted_list = torch.Tensor()

        sample_states = replay_buffer_weighted.sample(100)
        for el in range(sample_states['cur_states'].shape[0]):
            sample_transitions = replay_buffer_weighted.sample(100)
            target, predicted = get_target_and_predicted(sample_states['cur_states'][el],
                                                         sample_states['next_states'][el],
                                                         sample_states['rewards'][el],
                                                         sample_states['actions'][el],
                                                         sample_transitions['next_states'],
                                                         (episode_i//weight_decay) + 1)
            target_list = torch.cat([target_list, target])
            predicted_list = torch.cat([predicted_list, predicted])

        # the loss implementation computes (input - target)^2
        optimizer_weighted.zero_grad()

        # normalization to zero mean and unit std dev (omitted in this excerpt)
        if done:
            state = env.reset()


print('Start Training ...')
play_to_train(train_num_frames, policy_model, target_model,
              train_replay_buffer)
print('Done Training ...')

print('Playing before sampling ...')
play_to_sample(sample_num_frames, policy_model, analysis_replay_buffer)

# Sample 1000 frames from the sample replay buffer for analysis
print('Sampling from the analysis replay buffer ...')
states, actions, rewards, next_states, done = analysis_replay_buffer.sample(
    1000)

with open('sampled_states.npy', 'wb') as states_f:
    np.save(states_f, states)

with open('sampled_actions.npy', 'wb') as actions_f:
    np.save(actions_f, np.array(actions))

with open('sampled_rewards.npy', 'wb') as rewards_f:
    np.save(rewards_f, np.array(rewards))

with open('sampled_next_states.npy', 'wb') as next_states_f:
    np.save(next_states_f, next_states)

with open('sampled_done.npy', 'wb') as done_f:
    np.save(done_f, np.array(done))
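
The arrays saved above can be read back for offline analysis; a minimal sketch (file names match the ones written above):

import numpy as np

states = np.load('sampled_states.npy')
actions = np.load('sampled_actions.npy')
rewards = np.load('sampled_rewards.npy')
next_states = np.load('sampled_next_states.npy')
done = np.load('sampled_done.npy')

print(states.shape, next_states.shape)
print('mean reward in sample:', rewards.mean())
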
class Agent():
    """Basic experinece replay agent."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 checkpoint_file='checkpoint.pth'):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): learning rate
            update_every (int): how often to update the network
            checkpoint_file (str): file used to save and load the model
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.checkpoint_file = checkpoint_file
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed,
                                   self.device)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def train(self,
              env,
              n_episodes=2000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
        """Train Agent by playing simulator

        Params
        ======
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        """
        scores = []  # list containing scores from each episode
        moving_avgs = []  # list of moving averages
        scores_window = deque(maxlen=100)  # last 100 scores
        brain_name = env.brain_names[0]  # get the env's default brain name
        env_info = env.reset(
            train_mode=False)[brain_name]  # initialize the environment
        eps = eps_start  # initialize epsilon
        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]  # get the next state
            score = 0
            for t in range(max_t):
                action = self.act(state, eps).astype(int)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations[
                    0]  # get the next state
                reward = env_info.rewards[0]  # get the reward
                done = env_info.local_done[0]  # see if episode has finished
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            moving_avg = np.mean(scores_window)  # calculate moving average
            moving_avgs.append(moving_avg)  # save most recent moving average
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, moving_avg))
            if moving_avg >= 13.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, moving_avg))
                self.save()
                break
        return scores, moving_avgs

    def test(self, env, num_episodes=10):
        brain_name = env.brain_names[0]
        scores = []  # list of scores
        avg_scores = []  # list of average scores
        for i_episode in range(1, num_episodes + 1):
            env_info = env.reset(
                train_mode=False)[brain_name]  # reset the environment
            state = env_info.vector_observations[0]  # get the current state
            score = 0  # initialize the score
            t = 1
            while True:
                action = self.act(state, eps=0)  # select an action
                env_info = env.step(action)[
                    brain_name]  # send the action to the environment
                next_state = env_info.vector_observations[
                    0]  # get the next state
                reward = env_info.rewards[0]  # get the reward
                done = env_info.local_done[0]  # see if episode has finished
                score += reward  # update the score
                state = next_state  # roll over the state to next time step
                # print('episode: {}, step: {}, reward: {}, score: {}, scores: {}'.format(i_episode, t, reward, score, scores))
                t += 1
                if done:  # exit loop if episode finished
                    scores.append(score)
                    avg_scores.append(np.mean(scores))
                    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                        i_episode, np.mean(scores)))
                    break
        return scores, avg_scores

    def save(self):
        """Save the model
        Params
        ======
            file: checkpoint file name
        """
        torch.save(self.qnetwork_local.state_dict(), self.checkpoint_file)

    def load(self):
        """Load the model
        Params
        ======
            file: checkpoint file name
        """
        self.qnetwork_local.load_state_dict(torch.load(self.checkpoint_file))
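
A usage sketch for the Agent class above. It assumes the Unity ML-Agents environment that the train/test methods are written against (brain_names, vector_observations); the environment binary path is a placeholder.

from unityagents import UnityEnvironment  # assumed Unity ML-Agents wrapper

env = UnityEnvironment(file_name='Banana.app')  # placeholder path to the env binary
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=True)[brain_name]
state_size = len(env_info.vector_observations[0])
action_size = brain.vector_action_space_size

agent = Agent(state_size=state_size, action_size=action_size, seed=0)
scores, moving_avgs = agent.train(env)       # trains until the moving average reaches 13.0
agent.load()                                 # reload the checkpoint saved by train()
test_scores, test_avgs = agent.test(env, num_episodes=5)
env.close()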
Example #8
        if len(exp_replay) == buffer_size:
            break
    print(len(exp_replay))

    state = env.reset()
    for step in trange(step, total_steps + 1):

        agent.epsilon = linear_decay(init_epsilon, final_epsilon, step,
                                     decay_steps)

        # play
        _, state = play_and_record(state, agent, env, exp_replay,
                                   timesteps_per_epoch)

        # train
        obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(
            batch_size)

        loss = compute_td_loss(obs_batch,
                               act_batch,
                               reward_batch,
                               next_obs_batch,
                               is_done_batch,
                               agent,
                               target_network,
                               device=device)

        loss.backward()
        # grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
        opt.step()
        opt.zero_grad()  # clear gradients before the next backward pass
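
linear_decay is not shown in the excerpt above; a sketch consistent with how it is called (init_epsilon, final_epsilon, step, decay_steps) could be:

def linear_decay(init_val, final_val, cur_step, total_steps):
    """Linearly interpolate from init_val to final_val over total_steps,
    then hold final_val (signature assumed from the call above)."""
    if cur_step >= total_steps:
        return final_val
    return init_val + (final_val - init_val) * cur_step / total_steps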
Example #9
def train(env_id,
          lr=1e-4,
          gamma=0.99,
          memory_size=1000,
          batch_size=32,
          train_timesteps=10000,
          train_start_time=1000,
          target_update_frequency=1000,
          init_epsilon=1,
          final_epsilon=0.1,
          epsilon_decay=300,
          model_path=None):

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    LOG_PATH = f'logs/dqn_log_{env_id}.txt'

    if get_env_type(env_id) == 'atari':
        env = make_atari(env_id)
        env = wrap_deepmind(env)
        env = wrap_pytorch(env)

        model_type = 'conv'
    else:
        env = gym.make(env_id)

        model_type = 'linear'

    obs_shape = env.observation_space.shape
    num_actions = env.action_space.n

    memory = ReplayBuffer(memory_size)

    agent = DQN(obs_shape, num_actions, lr, gamma, device, model_type)
    policy = EpsilonGreedy(agent, num_actions, init_epsilon, final_epsilon,
                           epsilon_decay)

    # populate replay memory
    obs = env.reset()
    for t in range(train_start_time):

        # uniform random policy
        action = random.randrange(num_actions)
        next_obs, reward, done, _ = env.step(action)
        memory.add(obs, action, reward, next_obs, done)

        obs = next_obs

        if done:
            # start a new episode
            obs = env.reset()

    # for monitoring
    ep_num = 1
    ep_start_time = 1
    episode_reward = 0
    reward_list = []

    # train start
    obs = env.reset()
    for t in tqdm.tqdm(range(1, train_timesteps + 1)):

        # choose action
        action = policy.act(obs, t)
        next_obs, reward, done, _ = env.step(action)
        memory.add(obs, action, reward, next_obs, done)

        obs = next_obs

        # sample batch transitions from memory
        transitions = memory.sample(batch_size)
        # train
        loss = agent.train(transitions)

        # record reward
        episode_reward += reward

        # update the target network every C timesteps
        if t % target_update_frequency == 0:
            agent.update_target()

        if done:
            # start a new episode
            obs = env.reset()

            # write log
            with open(LOG_PATH, 'a') as f:
                f.write(f'{ep_num}\t{episode_reward}\t{ep_start_time}\t{t}\n')

            if model_path is not None:
                # save model
                info = {
                    'epoch': ep_num,
                    'timesteps': t,
                }
                agent.save(model_path, info)

            ep_num += 1
            ep_start_time = t + 1
            reward_list.append(episode_reward)
            episode_reward = 0
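
A hedged way to invoke train(); the environment id and output path are placeholders, and the logs/ directory must exist because the function appends to logs/dqn_log_<env_id>.txt.

import os

os.makedirs('logs', exist_ok=True)    # train() appends to logs/dqn_log_<env_id>.txt
os.makedirs('models', exist_ok=True)  # assumed location for the saved model
train('CartPole-v1',
      train_timesteps=20000,
      model_path='models/dqn_cartpole.pt')  # placeholder save path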