# Imports required by the code below. The standard-library / third-party
# imports (json, numpy, tqdm) are certain; the repo-local module paths are
# assumptions -- adjust them to wherever BlockerTask, the agents, and the
# plotting helpers actually live in this repository.
import json

import numpy as np
from tqdm import tqdm

from environments import BlockerTask, TransportationTask       # assumed path
from agents import (DetSARSA_Model, DeterminantalQlearning,    # assumed path
                    ESARSA, Qlearning, SARSA_noBoltzStep)
from models import NN_model                                    # assumed path
from plotting import maze_record, makeGIF                      # assumed path
from utils import updateRewardList, computeMean                # assumed path
def detSarsaNNModelMean(path, env="BlockerTask", repeats=5, steps=40000):
    # Select the environment class once, then build a fresh instance of it
    # for every repeat so the runs are independent.
    env_cls = BlockerTask if env == "BlockerTask" else TransportationTask
    rec_reward = {0: [-1 for _ in range(repeats)]}
    for _ in tqdm(range(repeats)):
        env = env_cls()   # fresh environment per repeat
        m = NN_model()    # fresh network per repeat
        agent = DetSARSA_Model(env, m)
        rews = agent.run_no_rec(steps)
        rec_reward = updateRewardList(rec_reward, rews)
    rec_reward = computeMean(rec_reward)
    with open(path, 'w') as outfile:
        outfile.write(json.dumps(rec_reward))
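
# NOTE: updateRewardList and computeMean are repo-local helpers (imported at
# the top). Judging only from how they are used here -- rec_reward maps a step
# index to one recorded reward per repeat -- they plausibly behave like the
# illustrative stand-ins below. This is an assumption, not the repo's actual
# code; the sketch names avoid shadowing the real helpers.
def _update_reward_list_sketch(rec_reward, rews):
    # Append each (step, reward) pair from one run to that step's list.
    for step, r in rews:
        rec_reward.setdefault(step, []).append(r)
    return rec_reward

def _compute_mean_sketch(rec_reward):
    # Average each step's recorded rewards across repeats.
    return {step: sum(rs) / len(rs) for step, rs in rec_reward.items()}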
class BlockerEnv():
    """Gym-style wrapper around BlockerTask: flat numpy states, integer actions."""

    def __init__(self):
        self.ev = BlockerTask()
        self.action_dim = len(self.ev.all_actions)  # (dr_i, dc_i), i in 1,2,3
        self.state_dim = 6                          # (r_i, c_i),  i in 1,2,3
        self.render_count = 0
        self.reset()  # sets self.state to the tuple state step()/render() expect

    def reset(self):
        self.state = self.ev.reset_state()
        return self.convert_state(self.state)

    def render(self, dpi=40):
        maze_record(self.render_count, 'Blocking task', self.state, 4, 7,
                    self.ev.blockers_state, dpi=dpi)
        self.render_count += 1

    def close(self):
        return

    def convert_state(self, s):
        # Flatten the tuple-of-pairs state into a length-6 numpy array.
        ((r1, c1), (r2, c2), (r3, c3)) = s
        return np.array([r1, c1, r2, c2, r3, c3])

    def step(self, a):
        # Map the integer action index back onto the environment's tuple
        # action, i.e. ((a[0],a[1]), (a[2],a[3]), (a[4],a[5])).
        action = self.ev.all_actions[a]
        next_state, reward, done = self.ev.step(self.state, action)
        self.state = next_state
        # Convert the tuple state back to a numpy array for the caller.
        next_state = self.convert_state(next_state)
        return next_state, reward, done, None

    def output_GIF(self):
        makeGIF('../plots/temp-plots/temp-plots1',
                '../plots/ActorCriticBlockerTaskGIF')
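
# Usage sketch for BlockerEnv: a short random-action rollout through the
# gym-style API defined above. Illustrative only -- the function name and the
# assumption that reward is numeric and done is boolean are mine; rendering
# and GIF output are skipped so it runs headless.
def _demo_blocker_env(n_steps=10, seed=0):
    rng = np.random.default_rng(seed)
    env = BlockerEnv()
    state = env.reset()
    total = 0.0
    for _ in range(n_steps):
        a = int(rng.integers(env.action_dim))  # uniform random action index
        state, reward, done, _ = env.step(a)
        total += reward
        if done:
            state = env.reset()
    return total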
def sarsaNoBoltzStepMean(path, env="BlockerTask", repeats=5, steps=300000):
    env = BlockerTask() if env == "BlockerTask" else TransportationTask()
    rec_reward = {0: [-1 for _ in range(repeats)]}
    for _ in tqdm(range(repeats)):
        agent = SARSA_noBoltzStep(env, α=0.5, γ=0.9, ε=0.3)
        rews = agent.run(epLen=40, mxsteps=steps, rec_any=False)
        rec_reward = updateRewardList(rec_reward, rews)
    rec_reward = computeMean(rec_reward)
    with open(path, 'w') as outfile:
        outfile.write(json.dumps(rec_reward))
def QlearningMean(path, env="BlockerTask", repeats=5, steps=300000):
    env = BlockerTask() if env == "BlockerTask" else TransportationTask()
    rec_reward = {0: [-1 for _ in range(repeats)]}
    for _ in tqdm(range(repeats)):
        agent = Qlearning(env, 0.1)
        rews = agent.run(epLen=40, mxsteps=steps, rec_any=False)
        rec_reward = updateRewardList(rec_reward, rews)
    rec_reward = computeMean(rec_reward)
    with open(path, 'w') as outfile:
        outfile.write(json.dumps(rec_reward))
def myDetQlearningMean(path, env="BlockerTask", repeats=5, steps=40000):
    env = BlockerTask() if env == "BlockerTask" else TransportationTask()
    rec_reward = {0: [-1 for _ in range(repeats)]}
    for _ in tqdm(range(repeats)):
        agent = DeterminantalQlearning(env)
        rews = agent.run_no_rec(steps)
        rec_reward = updateRewardList(rec_reward, rews)
    rec_reward = computeMean(rec_reward)
    with open(path, 'w') as outfile:
        outfile.write(json.dumps(rec_reward))
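
# Example driver for the experiment runners above, showing how the mean
# learning curves would be generated and written to disk. The output paths
# are hypothetical placeholders -- point them at wherever this repo stores
# its result files.
def _run_all_means():
    detSarsaNNModelMean('../data/det_sarsa_nn_mean.json')    # hypothetical path
    sarsaNoBoltzStepMean('../data/sarsa_noboltz_mean.json')  # hypothetical path
    QlearningMean('../data/qlearning_mean.json')             # hypothetical path
    myDetQlearningMean('../data/det_qlearning_mean.json')    # hypothetical path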
                return t
            # update variables
            state, action, s_idx, a_idx, nF = next_state, next_action, ns_idx, na_idx, nF
        print(state)
        # print(self.Wa)
        return self.max_n_ep

    def run(self, max_steps):
        self.rec_reward = []
        self.total_reward = 0
        t, ep = 0, 0
        while t < max_steps:
            ep += 1
            print('time: ' + str(t), flush=True)
            dt = self.episode(t, max_steps)
            t += dt
        return self.rec_reward


if __name__ == '__main__':
    env = BlockerTask()
    agent = ESARSA(env)
    results = agent.run(4e5)
    # print(agent.a_idx( ((0,1), (0,0), (0,-1)) ))
    # steps = 300000
    # env = BlockerTask()
    # agent = ESARSA( env )
    # rews = agent.run( steps )