Example no. 1
    def render(self, dpi=40):
        # Record one frame of the Blocking task maze, then advance the frame counter.
        maze_record(self.render_count,
                    'Blocking task',
                    self.state,
                    4,
                    7,
                    self.ev.blockers_state,
                    dpi=dpi)
        self.render_count += 1
Example no. 2
    def episode(self, tstart, rcd=False):
        state = self.env.reset_state()
        # action = self.boltzmannPolicy(state, self.β)
        # ns, reward, done = self.env.step( state, action )
        # x = self.φ( ns )
        x = self.encode(state)

        for t in range(1, 41):
            η = self.η0 * min(1, self.ηstart / (tstart + t))
            self.β = np.power(self.βinit, (tstart + t) / self.βfrac)

            action = self.boltzmannPolicy(state, self.β)
            # Observe reward r_{t+1} and next state s_{t+1}
            next_state, reward, done = self.env.step(state, action)
            # Choose the next action a_{t+1}
            #            next_action = self.boltzmannPolicy(next_state, self.β)

            #            next_x, _, reward, done = self.φ(next_state, next_action, ns=True)
            #            next_x = self.φ(next_state, next_action, ns=False)
            next_x = self.encode(next_state)

            if rcd:
                maze_record(tstart + t, 'Blocking task', next_state, 4, 7,
                            self.env.blockers_state)

            V_x = self.V_rd(x)
            V_next_x = self.V_rd(next_x)

            # Log-determinant value estimates: Q(x) = α + log det(V_x V_xᵀ)
            Q_x = self.α + np.log(det(np.dot(V_x, V_x.T)))
            Q_next_x = self.α + np.log(det(np.dot(V_next_x, V_next_x.T)))

            # TD error and the gradient of log det(V_x V_xᵀ) with respect to V_x
            TD = reward + self.ρ * Q_next_x - Q_x
            grad_Q = 2 * pinv(V_x).T

            # V update
            self.V_wr(x, η * TD * grad_Q)

            # self.α += η * TD

            self.total_reward += reward
            if (tstart + t) % self.bin == 0:
                self.rec_reward.append(
                    (t + tstart, self.total_reward / self.bin))
                self.total_reward = 0

            if done:
                self.found = True
                #                print('\n\n\n\nCompleted episode, t=',str(t),'\n\n\n')
                return t

            state = next_state
            #            action = next_action
            x = next_x

        return 40
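
Note on Example 2 (and Example 5 below): both episode() variants parameterize the value as Q(x) = α + log det(V_x V_xᵀ) and use 2 * pinv(V_x).T as its gradient with respect to V_x. The stand-alone sketch below is not part of the original code; V is just a random full-row-rank matrix standing in for whatever self.V_rd returns, and the script checks the gradient identity against a finite-difference estimate.

import numpy as np
from numpy.linalg import det, pinv

def logdet_value(V, alpha=0.0):
    """Q = alpha + log det(V V^T), as in the episode() examples above."""
    return alpha + np.log(det(V @ V.T))

def logdet_grad(V):
    """Analytic gradient of log det(V V^T) w.r.t. V: 2 * pinv(V).T (full row rank assumed)."""
    return 2 * pinv(V).T

# Finite-difference check on a random 3 x 5 matrix.
rng = np.random.default_rng(0)
V = rng.standard_normal((3, 5))
eps = 1e-6
num = np.zeros_like(V)
for i in range(V.shape[0]):
    for j in range(V.shape[1]):
        Vp, Vm = V.copy(), V.copy()
        Vp[i, j] += eps
        Vm[i, j] -= eps
        num[i, j] = (logdet_value(Vp) - logdet_value(Vm)) / (2 * eps)

print(np.max(np.abs(num - logdet_grad(V))))  # agrees to ~1e-8
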
Example no. 3
    def render(self, rcd, s):
        if rcd:
            self.i += 1
            maze_record(self.i,
                        None,
                        s,
                        4,
                        7,
                        self.env.blockers_state,
                        up=False,
                        dpi=300)
Example no. 4
    def render(self, rcd, s):
        if rcd:
            self.i += 1
            maze_record(self.i,
                        'Blocking task',
                        s,
                        4,
                        7,
                        self.env.blockers_state,
                        up=False,
                        dpi=100)
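
Examples 1, 3 and 4 are thin wrappers around maze_record, whose definition is not shown; from the call sites it takes a frame index, a title (or None), the current state, the grid height and width (4 and 7), the blockers' state, and optional up / dpi keywords. To run these render() methods without the project's plotting code, a purely hypothetical stub with the same calling convention could stand in for it:

def maze_record(frame, title, state, height, width, blockers_state,
                up=True, dpi=100):
    """Hypothetical stand-in for the project's maze_record (signature inferred
    from the calls above): the real function presumably draws the height x width
    maze and saves an image at the given dpi; this stub only logs the call."""
    print(f"frame {frame}: title={title!r}, state={state}, "
          f"grid={height}x{width}, blockers={blockers_state}, up={up}, dpi={dpi}")
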
Example no. 5
    def episode(self, tstart, rcd=False):
        state = self.env.reset_state()

        for t in range(1, self.max_n_ep + 1):
            η = self.η0 * min(1, self.ηstart / (tstart + t))
            self.β = np.power(self.β10e4, (tstart + t) / self.βfrac)

            # Choose an action a_t
            action = self.boltzmannPolicy(state, self.β)
            # Observe reward r_{t+1} and next state s_{t+1}
            next_state, reward, done = self.env.step(state, action)
            x = self.encode(next_state)

            max_Q = self.maxQ(next_state)

            if rcd:
                maze_record(tstart + t, 'Blocking task', next_state, 4, 7,
                            self.env.blockers_state)

            V_x = self.V_rd(x)
            # Log-determinant value estimate: Q(x) = α + log det(V_x V_xᵀ)
            Q_x = self.α + np.log(det(np.dot(V_x, V_x.T)))

            # Q-learning style TD error against the greedy next-state value max_Q
            TD = reward + self.ρ * max_Q - Q_x
            grad_Q = 2 * pinv(V_x).T

            # V update
            #            print(self.α)
            #            self.λ *= 0.9999
            self.V_wr(x,
                      η * TD * grad_Q)  # - (V_x - self.ei_s(x)) * η * self.λ )

            #            self.α += η * TD# grad_α = 1

            self.total_reward += reward
            if (tstart + t) % self.bin == 0:
                self.rec_reward.append(
                    (t + tstart, self.total_reward / self.bin))
                self.total_reward = 0

            if done:
                self.found = True
                #                print('\n\n\n\nCompleted episode, t=',str(t),'\n\n\n')
                return t

            state = next_state

        return self.max_n_ep
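
Both episode() variants select actions with self.boltzmannPolicy(state, self.β), which is not defined in these examples. A minimal softmax (Boltzmann) selection over a vector of action values, treating β as an inverse temperature, might look like the sketch below; q_values is a placeholder for however the agent scores the actions available in state.

import numpy as np

def boltzmann_policy(q_values, beta, rng=None):
    """Sample an action index with probability proportional to exp(beta * Q[a])."""
    if rng is None:
        rng = np.random.default_rng()
    prefs = beta * (q_values - np.max(q_values))  # shift for numerical stability
    probs = np.exp(prefs)
    probs /= probs.sum()
    return rng.choice(len(q_values), p=probs)

# With a large beta the choice is nearly greedy (index 2 here).
print(boltzmann_policy(np.array([0.1, 0.5, 0.9, 0.2]), beta=10.0))
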
Example no. 6
    def episode(self, tstart, epLen=40, rec=False):
        state = self.env.reset_state()
        action = self.policy(state, ε=self.ε)
        # action = self.boltzmann(state)

        for t in range(1, epLen + 1):
            next_state, reward, done = self.env.step(state, action)
            next_action = self.policy(next_state, ε=self.ε)  # SARSA: choose a_{t+1} from s_{t+1}
            # next_action = self.boltzmann(state)

            if rec:
                maze_record(tstart + t,
                            'Blocking task, ' + str(done) + ', t=' + str(tstart + t),
                            next_state,
                            4,
                            7,
                            self.env.blockers_state,
                            up=False)

            # SARSA update: Q(s,a) += α * (r + γ * Q(s',a') - Q(s,a))
            update = self.α * (reward +
                               self.γ * self.q(next_state, next_action) -
                               self.q(state, action))
            if (state, action) in self.Q:
                self.Q[(state, action)] += update
            else:
                self.Q[(state, action)] = update

            # record binned average reward
            self.total_reward += reward
            if (tstart + t) % self.bin == 0:
                self.rec_reward.append(
                    (tstart + t, self.total_reward / self.bin))
                self.total_reward = 0

            state, action = next_state, next_action

            if done: return t
        return epLen
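
Example 6's SARSA loop relies on self.policy(state, ε=self.ε) and on a dictionary Q keyed by (state, action) pairs. An ε-greedy selection over such a dictionary could be sketched as below; the explicit actions argument is an assumption, since the example does not show how the action set is obtained.

import random

def epsilon_greedy(Q, state, actions, epsilon):
    """Pick a random action with probability epsilon, otherwise the greedy one
    under Q, where Q maps (state, action) -> value and missing entries count as 0."""
    actions = list(actions)
    if random.random() < epsilon:
        return random.choice(actions)
    return max(actions, key=lambda a: Q.get((state, a), 0.0))

# With an empty Q table every value is 0, so the greedy branch returns the first action.
print(epsilon_greedy({}, state=(0, 0), actions=['up', 'down', 'left', 'right'], epsilon=0.1))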