def render(self, dpi=40):
    """Record one frame of the current maze state and bump the frame counter.

    Args:
        dpi: resolution passed through to ``maze_record`` (default 40).
    """
    # BUG FIX: original read `self.ev.blockers_state`; every sibling method in
    # this file reads blocker positions from `self.env.blockers_state`, so
    # `self.ev` was a typo for `self.env`.
    maze_record(self.render_count, 'Blocking task', self.state, 4, 7,
                self.env.blockers_state, dpi=dpi)
    self.render_count += 1
def episode(self, tstart, rcd=False, epLen=40):
    """Run one on-policy episode of the log-det value learner.

    The state value is parameterized as Q(s) = α + log det(V V^T), where V is
    the feature matrix read back by ``self.V_rd`` for the encoded state. Each
    step performs a TD update on V via the gradient of the log-det term.

    Args:
        tstart: global time offset; used for the learning-rate / temperature
            annealing schedules and for reward binning.
        rcd: when True, record a maze frame for every step.
        epLen: maximum number of steps per episode (default 40, the value
            previously hard-coded; parameterized for consistency with the
            SARSA ``episode`` which already takes ``epLen=40``).

    Returns:
        The step count at which the goal was reached, or ``epLen`` if the
        episode timed out.
    """
    state = self.env.reset_state()
    x = self.encode(state)
    for t in range(1, epLen + 1):
        # Anneal the learning rate and the Boltzmann inverse temperature
        # as a function of global time tstart + t.
        η = self.η0 * min(1, self.ηstart / (tstart + t))
        self.β = np.power(self.βinit, (tstart + t) / self.βfrac)
        action = self.boltzmannPolicy(state, self.β)
        # Observe reward r_{t+1} and next state s_{t+1}.
        next_state, reward, done = self.env.step(state, action)
        next_x = self.encode(next_state)
        if rcd:
            maze_record(tstart + t, 'Blocking task', next_state, 4, 7,
                        self.env.blockers_state)
        V_x = self.V_rd(x)
        V_next_x = self.V_rd(next_x)
        # Log-volume value estimate for the current and successor states.
        Q_x = self.α + np.log(det(np.dot(V_x, V_x.T)))
        Q_next_x = self.α + np.log(det(np.dot(V_next_x, V_next_x.T)))
        TD = reward + self.ρ * Q_next_x - Q_x
        # d/dV log det(V V^T) = 2 (V^+)^T, hence the pseudoinverse.
        grad_Q = 2 * pinv(V_x).T
        # V update (α is held fixed here).
        self.V_wr(x, η * TD * grad_Q)
        # Accumulate reward and flush a binned average every `bin` steps.
        self.total_reward += reward
        if (tstart + t) % self.bin == 0:
            self.rec_reward.append(
                (t + tstart, self.total_reward / self.bin))
            self.total_reward = 0
        if done:
            self.found = True
            return t
        state = next_state
        x = next_x
    return epLen
def render(self, rcd, s):
    """Dump a single untitled frame of state ``s`` when recording is on.

    Args:
        rcd: recording flag; when False this is a no-op.
        s: state to draw.
    """
    if not rcd:
        return
    # Advance the frame counter, then emit a high-resolution frame.
    self.i += 1
    maze_record(self.i, None, s, 4, 7, self.env.blockers_state,
                up=False, dpi=300)
def render(self, rcd, s):
    """Dump a single 'Blocking task' frame of state ``s`` when recording is on.

    Args:
        rcd: recording flag; when False this is a no-op.
        s: state to draw.
    """
    if not rcd:
        return
    # Advance the frame counter, then emit a medium-resolution frame.
    self.i += 1
    maze_record(self.i, 'Blocking task', s, 4, 7, self.env.blockers_state,
                up=False, dpi=100)
def episode(self, tstart, rcd=False):
    """Run one Q-learning-style episode with the log-det value parameterization.

    The value is Q(s) = α + log det(V V^T) with V = self.V_rd(encoded state);
    the TD target bootstraps from self.maxQ over the successor state.

    Args:
        tstart: global time offset for the annealing schedules and binning.
        rcd: when True, record a maze frame for every step.

    Returns:
        The step count at which the goal was reached, or ``self.max_n_ep``
        on timeout.
    """
    state = self.env.reset_state()
    for t in range(1, self.max_n_ep + 1):
        # Anneal learning rate and Boltzmann inverse temperature with
        # global time tstart + t.
        η = self.η0 * min(1, self.ηstart / (tstart + t))
        self.β = np.power(self.β10e4, (tstart + t) / self.βfrac)
        # Choose an action a_t.
        action = self.boltzmannPolicy(state, self.β)
        # Observe reward r_{t+1} and next state s_{t+1}.
        next_state, reward, done = self.env.step(state, action)
        x = self.encode(next_state)
        max_Q = self.maxQ(next_state)
        if rcd:
            maze_record(tstart + t, 'Blocking task', next_state, 4, 7, self.env.blockers_state)
        V_x = self.V_rd(x)
        # Log-volume value estimate: Q(s) = α + log det(V V^T).
        Q_x = self.α + np.log(det(np.dot(V_x, V_x.T)))
        # NOTE(review): x encodes *next_state*, so Q_x and max_Q both refer
        # to the successor state here, whereas the sibling episode() method
        # evaluates Q_x at the current state's encoding. Confirm this
        # afterstate-style target is intentional.
        TD = reward + self.ρ * max_Q - Q_x
        # d/dV log det(V V^T) = 2 (V^+)^T.
        grad_Q = 2 * pinv(V_x).T
        # V update (α is held fixed).
        self.V_wr(x, η * TD * grad_Q)
        # Accumulate reward and flush a binned average every `bin` steps.
        self.total_reward += reward
        if (tstart + t) % self.bin == 0:
            self.rec_reward.append(
                (t + tstart, self.total_reward / self.bin))
            self.total_reward = 0
        if done:
            self.found = True
            return t
        state = next_state
    return self.max_n_ep
def episode(self, tstart, epLen=40, rec=False):
    """Run one tabular SARSA episode with an ε-greedy policy.

    Args:
        tstart: global time offset used for reward binning and frame numbering.
        epLen: maximum number of steps per episode.
        rec: when True, record a maze frame for every step.

    Returns:
        The step count at which the episode terminated, or ``epLen`` on timeout.
    """
    state = self.env.reset_state()
    action = self.policy(state, ε=self.ε)
    for t in range(1, epLen + 1):
        next_state, reward, done = self.env.step(state, action)
        # BUG FIX: SARSA's successor action a_{t+1} must be drawn from the
        # *next* state; the original sampled it from the current `state`,
        # which makes the on-policy bootstrap target inconsistent.
        next_action = self.policy(next_state, ε=self.ε)
        if rec:
            maze_record(tstart + t, 'Blocking task, ' + str(done) + ', t=',
                        next_state, 4, 7, self.env.blockers_state, up=False)
        # SARSA update: Q(s,a) += α (r + γ Q(s',a') − Q(s,a)).
        update = self.α * (reward + self.γ * self.q(next_state, next_action)
                           - self.q(state, action))
        if (state, action) in self.Q:
            self.Q[(state, action)] += update
        else:
            self.Q[(state, action)] = update
        # Accumulate reward and flush a binned average every `bin` steps.
        self.total_reward += reward
        if (tstart + t) % self.bin == 0:
            self.rec_reward.append(
                (tstart + t, self.total_reward / self.bin))
            self.total_reward = 0
        state, action = next_state, next_action
        if done:
            return t
    return epLen