Example #1
    def sample_episode(self, Q: Tabular, task: Task, policy: Policy, rewards):

        # to store the episode
        episode = [None] * (self.episode_length)

        # initialize state
        state = task.initial_state()

        # repeat for each step of episode
        for t in range(self.episode_length):

            # choose action from state using policy derived from Q
            action = policy.act(Q, task, state)

            # take action and observe reward and new state
            new_state, reward, done = task.transition(state, action)
            rewards[t] = reward
            episode[t] = (state, action, reward)

            # update state
            state = new_state

            # until state is terminal
            if done:
                break

        return t, episode[0:t]
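
These snippets are methods of agent/trainer classes and assume numpy imported as np at module level, together with small Task and Policy abstractions that are not shown on this page. A minimal sketch of the interface the episode loops rely on (the method names come from the calls above; everything else, including the docstrings, is an assumption):

    class Task:
        """Episodic environment interface assumed by the episode loops above."""

        def valid_actions(self) -> int:
            # number of discrete actions available in this task
            raise NotImplementedError

        def initial_state(self):
            # return the start state of a new episode
            raise NotImplementedError

        def transition(self, state, action):
            # apply the action and return (new_state, reward, done)
            raise NotImplementedError


    class Policy:
        """Behavior policy interface, e.g. an epsilon-greedy wrapper around Q."""

        def act(self, Q, task, state):
            # return an action index for the given state
            raise NotImplementedError
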
Example #2
    def run_episode(self, Q: Tabular, task: Task, policy: Policy):

        # to compute backup
        rewards = np.zeros(self.episode_length, dtype=float)

        # initialize state
        state = task.initial_state()

        # repeat for each step of episode
        for t in range(self.episode_length):

            # choose action from state using policy derived from Q
            action = policy.act(Q, task, state)

            # take action and observe reward and new state
            new_state, reward, done = task.transition(state, action)
            rewards[t] = reward

            # update Q
            delta = reward + self.gamma * Q.max_value(new_state) - Q.values(state)[action]
            Q.update(state, action, delta)

            # update state
            state = new_state

            # until state is terminal
            if done:
                break

        return t, rewards[0:t]
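
Example #2 is one-step tabular Q-learning: the TD error is delta = reward + gamma * max_a' Q(s', a') - Q(s, a), and Q.update is presumably responsible for applying it scaled by a learning rate. A sketch of a Tabular store consistent with the calls used across these examples (values, max_value, max_action, update, update_all); the constructor arguments, the learning rate alpha, and the defaultdict layout are assumptions, and states are assumed hashable:

    import numpy as np
    from collections import defaultdict

    class Tabular:
        """Hypothetical tabular action-value store matching the calls in these examples."""

        def __init__(self, valid_actions, alpha=0.1):
            self.alpha = alpha
            self.table = defaultdict(lambda: np.zeros(valid_actions, dtype=float))

        def values(self, state):
            # vector of Q(state, a) over all actions a
            return self.table[state]

        def max_value(self, state):
            # max_a Q(state, a)
            return np.max(self.table[state])

        def max_action(self, state):
            # argmax_a Q(state, a)
            return int(np.argmax(self.table[state]))

        def update(self, state, action, delta):
            # Q(s, a) <- Q(s, a) + alpha * delta
            self.table[state][action] += self.alpha * delta

        def update_all(self, state, deltas):
            # vectorized update of every action value of one state
            self.table[state] += self.alpha * deltas
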
Example #3
    def run_episode(self, Q: Tabular, task: Task, policy: Policy):

        # to compute backups
        rewards = np.zeros(self.episode_length, dtype=float)
        states = [None] * self.episode_length
        actions = [None] * self.episode_length

        # initialize state
        state = task.initial_state()

        # repeat for each step of episode
        for t in range(self.episode_length):

            # choose action from state using policy derived from Q
            action = policy.act(Q, task, state)
            states[t], actions[t] = state, action

            # take action and observe reward and new state
            new_state, reward, done = task.transition(state, action)
            rewards[t] = reward

            # update state
            state = new_state

            # until state is terminal
            if done:
                break

        # initialize lambda-average of returns
        lambda_return = reward

        # repeat for each step of episode in reverse order
        T = t
        for t in range(T, -1, -1):

            # compute lambda-average of returns
            if t < T:
                lambda_return = rewards[t] + self.gamma * (
                    (1.0 - self.decay) *
                    Q.values(states[t + 1])[actions[t + 1]] +
                    self.decay * lambda_return)

            # update Q
            delta = lambda_return - Q.values(states[t])[actions[t]]
            Q.update(states[t], actions[t], delta)

        return T, rewards[0:T]
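
Written out, the backward pass in Example #3 implements the recursive form of the lambda-return, with self.decay playing the role of \lambda, r_t denoting rewards[t], and the learning rate \alpha assumed to be applied inside Q.update:

    G^{\lambda}_T = r_T
    G^{\lambda}_t = r_t + \gamma \bigl[ (1 - \lambda)\, Q(s_{t+1}, a_{t+1}) + \lambda\, G^{\lambda}_{t+1} \bigr], \qquad t < T
    Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \bigl( G^{\lambda}_t - Q(s_t, a_t) \bigr)
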
Example #4
    def run_episode(self, Q: Tabular, task: Task, policy: Policy):

        # to compute backups
        rewards = np.zeros(self.episode_length, dtype=float)

        # initialize the e(s, a) matrix
        # note: there is an error in Sutton and Barto since e is reset each episode
        e = defaultdict(lambda: np.zeros(task.valid_actions(), dtype=float))

        # initialize state and action
        state = task.initial_state()
        action = policy.act(Q, task, state)

        # repeat for each step of episode
        for t in range(self.episode_length):

            # take action and observe reward and new state
            new_state, reward, done = task.transition(state, action)
            rewards[t] = reward

            # choose action from state using policy derived from Q
            new_action = policy.act(Q, task, new_state)

            # update e
            e[state][action] += 1.0

            # update trace
            delta = reward + self.gamma * Q.values(
                new_state)[new_action] - Q.values(state)[action]
            for s in e.keys():
                errors = e[s] * delta
                Q.update_all(s, errors)
                e[s] *= self.gamma * self.decay

            # update state and action
            state, action = new_state, new_action

            # until state is terminal
            if done:
                break

        return t, rewards[0:t]
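
Example #4 is SARSA(lambda) with accumulating eligibility traces. With \lambda = self.decay, r_t = rewards[t], and the learning rate \alpha assumed to be applied inside Q.update_all, each pass through the loop performs:

    e(s_t, a_t) \leftarrow e(s_t, a_t) + 1
    \delta_t = r_t + \gamma\, Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t)
    Q(s, a) \leftarrow Q(s, a) + \alpha\, \delta_t\, e(s, a) \qquad \text{for every visited } (s, a)
    e(s, a) \leftarrow \gamma \lambda\, e(s, a)
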
Example #5
    def run_episode(self, Q: Neural, task: Task, policy: Policy):

        # to compute backup
        rewards = np.zeros(self.episode_length, dtype=float)

        # initialize state
        state = task.initial_state()
        phi_state = self.phi(state)

        # repeat for each step of episode
        for t in range(self.episode_length):

            # choose action from state using policy derived from Q
            action = policy.act(Q, task, phi_state)

            # take action and observe reward and new state
            new_state, reward, done = task.transition(state, action)
            phi_new_state = self.phi(new_state)
            rewards[t] = reward

            # store the transition in memory
            self.memory.remember(phi_state, action, reward, phi_new_state,
                                 done)

            # sample a mini-batch of transitions from memory
            # compute the targets y_j and train the network
            mini_batch = self.memory.sample_batch()
            if mini_batch is not None:
                Q.train(mini_batch, self.gamma)

            # update state
            state, phi_state = new_state, phi_new_state

            # until state is terminal
            if done:
                break

        return t, rewards[0:t]
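
Example #5 is the DQN-style variant: each transition is pushed into a replay memory and the network is trained on random mini-batches, presumably regressing Q(phi_j, a_j) toward the target y_j = r_j when done_j is true and r_j + gamma * max_a Q(phi_{j+1}, a) otherwise. The memory object is not shown; a minimal sketch of a buffer with the same remember / sample_batch interface (the capacity and batch size are made-up parameters) could look like:

    import random
    from collections import deque

    class ReplayMemory:
        """Hypothetical replay buffer matching the remember/sample_batch calls above."""

        def __init__(self, capacity=10000, batch_size=32):
            self.buffer = deque(maxlen=capacity)
            self.batch_size = batch_size

        def remember(self, phi_state, action, reward, phi_new_state, done):
            # store one transition, evicting the oldest when full
            self.buffer.append((phi_state, action, reward, phi_new_state, done))

        def sample_batch(self):
            # only sample once enough transitions have been collected
            if len(self.buffer) < self.batch_size:
                return None
            return random.sample(list(self.buffer), self.batch_size)
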
Example #6
    def act(self, Q: Agent, task: Task, state):

        # set the number of actions of the current task, if not set
        if self.valid_actions == 0:
            self.valid_actions = task.valid_actions()

        # get the distribution over actions for the current state
        pref = self.preferences[state]

        # sample an action from the preference distribution
        action = np.random.choice(self.valid_actions, 1, p=pref)

        # get the greedy action according to Q
        greedy = Q.max_action(state)

        # update the preference distribution
        pref *= (1.0 - self.beta)
        pref[greedy] /= (1.0 - self.beta)
        pref[greedy] += self.beta * (1.0 - pref[greedy])

        return action
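
Example #6 is a pursuit-style exploration policy: every preference is scaled by (1 - beta), the greedy entry is then restored and pushed toward 1 by a fraction beta of its remaining gap. The non-greedy mass shrinks by exactly beta * (1 - pref[greedy]), which is what the greedy entry gains, so pref remains a probability distribution. A quick numerical check of the update (beta, pref and greedy are made-up values):

    import numpy as np

    beta = 0.1
    pref = np.array([0.25, 0.25, 0.25, 0.25])
    greedy = 2

    # same three update lines as in act above
    pref *= (1.0 - beta)
    pref[greedy] /= (1.0 - beta)
    pref[greedy] += beta * (1.0 - pref[greedy])

    print(pref, pref.sum())   # approx. [0.225 0.225 0.325 0.225], sum 1.0
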
Example #7
    def act(self, Q: Agent, task: Task, state):
        values = self.distribution(Q, task, state)
        return np.random.choice(task.valid_actions(), 1, p=values)
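
The sampling here mirrors Example #6, but the action distribution is delegated to self.distribution, whose implementation is not shown. One plausible choice (purely an assumption, not the library's actual code) is a Boltzmann/softmax distribution over tabular Q-values, with a hypothetical temperature parameter:

    import numpy as np

    def softmax_distribution(Q, state, temperature=1.0):
        # hypothetical softmax (Boltzmann) distribution over Q-values;
        # self.distribution could delegate to something like this
        values = Q.values(state) / temperature
        values = values - np.max(values)      # shift for numerical stability
        exp_values = np.exp(values)
        return exp_values / np.sum(exp_values)
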