import numpy as np

# LinearSVM, Net, NetPolicy and ClassicPolicy are project-local classes;
# their imports are assumed to come from the surrounding package.


class Dagger():
    """DAgger rollouts: the current policy (mdp.pi) chooses the actions that
    are executed, while the supervisor policy (super_pi) labels every visited
    state, and the labeled states are aggregated into the net's dataset."""

    def __init__(self, grid, mdp, moves=40):
        self.grid = grid
        self.mdp = mdp
        self.svm = LinearSVM(grid, mdp)
        self.net = Net(grid, mdp)
        self.moves = moves
        #self.reward = np.zeros(40)
        self.super_pi = mdp.pi
        self.reward = np.zeros(self.moves)
        self.animate = False
        self.record = True
        self.recent_rollout_states = None

    def rollout(self):
        self.grid.reset_mdp()
        self.reward = np.zeros(self.moves)
        self.recent_rollout_states = [self.mdp.state]
        self.mistakes = 0.0
        for t in range(self.moves):
            if self.record:
                # Label the visited state with the supervisor's action
                assert self.super_pi.desc == ClassicPolicy.desc
                self.net.add_datum(self.mdp.state,
                                   self.super_pi.get_next(self.mdp.state))

            # Get current state and action
            x_t = self.mdp.state
            a_t = self.mdp.pi.get_next(x_t)

            self.compare_policies(x_t, a_t)

            # Take next step
            self.grid.step(self.mdp)
            x_t_1 = self.mdp.state

            # Evaluate reward received
            self.reward[t] = self.grid.reward(x_t, a_t, x_t_1)
            self.recent_rollout_states.append(self.mdp.state)

        if self.animate:
            self.grid.show_recording()
        #print self.svm.data

    def compare_policies(self, x, a):
        # Count timesteps where the current policy disagrees with the supervisor
        if self.super_pi.get_next(x) != a:
            self.mistakes += 1

    def get_states(self):
        return self.net.get_states()

    def get_reward(self):
        return np.sum(self.reward)

    def set_supervisor_pi(self, pi):
        self.super_pi = pi

    def get_loss(self):
        # Fraction of timesteps on which the policy disagreed with the supervisor
        return float(self.mistakes) / float(self.moves)

    def get_recent_rollout_states(self):
        N = len(self.recent_rollout_states)
        states = np.zeros([N, 2])
        for i in range(N):
            x = self.recent_rollout_states[i].toArray()
            states[i, :] = x
        return states

    def retrain(self):
        # Fit the net on the aggregated data and deploy it as the new policy
        self.net.fit()
        self.mdp.pi = NetPolicy(self.net)
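# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original code): assumes
# `grid` and `mdp` objects from the surrounding project are already
# constructed, with `mdp.pi` set to the supervisor's policy as the Dagger
# constructor above expects. Shows the outer DAgger loop: roll out the
# current policy, aggregate supervisor labels, then retrain and repeat.
# ---------------------------------------------------------------------------
def run_dagger(grid, mdp, iterations=10, rollouts_per_iter=5, moves=40):
    dagger = Dagger(grid, mdp, moves=moves)
    rewards = []
    for _ in range(iterations):
        for _ in range(rollouts_per_iter):
            dagger.rollout()            # execute mdp.pi, record supervisor labels
            rewards.append(dagger.get_reward())
        dagger.retrain()                # fit the net and swap in the learned policy
    return rewards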
class NSupervise():
    """Supervised-learning baseline: rollouts are executed by mdp.pi (initially
    the supervisor's policy) and the supervisor's action at each visited state
    is recorded as a training label; with net='UB' the action actually taken
    is recorded as well."""

    def __init__(self, grid, mdp, moves=40, net='Net'):
        self.grid = grid
        self.mdp = mdp
        self.net_name = net
        self.svm = LinearSVM(grid, mdp)
        self.net = Net(grid, mdp, net, T=moves)
        self.moves = moves
        #self.reward = np.zeros(40)
        self.super_pi = mdp.pi
        self.mdp.pi_noise = False
        self.reward = np.zeros(self.moves)
        self.animate = False
        self.train_loss = 0
        self.test_loss = 0
        self.record = True

    def rollout(self):
        self.grid.reset_mdp()
        self.reward = np.zeros(self.moves)
        for t in range(self.moves):
            a = self.super_pi.get_next(self.mdp.state)
            #print "action ", a

            # Get current state and action
            x_t = self.mdp.state
            a_t = self.mdp.pi.get_next(x_t)

            # Take next step
            a_taken = self.grid.step(self.mdp)
            print "action taken ", a_taken
            print "timestep ", t

            if self.record:
                if self.net_name == 'UB':
                    self.net.add_datum(x_t, a, a_taken)
                else:
                    self.net.add_datum(x_t, a)

            x_t_1 = self.mdp.state

            # Evaluate reward received
            self.reward[t] = self.grid.reward(x_t, a_t, x_t_1)

        if self.animate:
            self.grid.show_recording()
        #print self.svm.data

    def sample_policy(self):
        self.record = True
        self.net.clear_data()

    def get_states(self):
        return self.net.get_states()

    def get_weights(self):
        return self.net.get_weights()

    def get_reward(self):
        return np.sum(self.reward)

    def set_supervisor_pi(self, pi):
        self.super_pi = pi

    def train(self):
        # Fit the net, record train/test losses, and deploy the learned policy
        self.net.fit()
        stats = self.net.return_stats()
        self.train_loss = stats[0]
        self.test_loss = stats[1]
        self.mdp.pi_noise = False
        self.mdp.pi = NetPolicy(self.net)
        self.record = False

    def get_train_loss(self):
        return self.train_loss

    def get_test_loss(self):
        return self.test_loss
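# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original code): assumes
# `grid` and `mdp` come from the surrounding project with `mdp.pi` set to the
# supervisor's policy, matching the NSupervise constructor above. Plain
# supervised learning: collect supervisor-labeled rollouts, then fit once.
# ---------------------------------------------------------------------------
def run_supervise(grid, mdp, rollouts=20, moves=40):
    sup = NSupervise(grid, mdp, moves=moves)
    for _ in range(rollouts):
        sup.rollout()                   # record (state, supervisor action) pairs
    sup.train()                         # fit the net; mdp.pi becomes the learned policy
    return sup.get_train_loss(), sup.get_test_loss()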