from datetime import datetime
from time import sleep

# Environment, Agent, Child and save_figure are assumed to be defined
# elsewhere in the project.


class Simulation(object):
    def __init__(self, location, shape, plot):
        self.location = location
        self.shape = shape
        self.plot = plot
        self.env = Environment(self.shape)
        self.agent = Agent()
        self.child = Child(self.env.state)

    def run(self, duration):
        """Run the simulation for `duration` seconds; return (reward, cells cleaned)."""
        start = datetime.now()
        # The child dirties the environment before the agent is switched on.
        self.env.state = self.child.run()
        self.agent.on()
        if self.plot:
            save_figure(self.env.x, self.env.y, self.env.state,
                        [self.agent.position[0]], [self.agent.position[1]])
        while self.agent.active:
            # Sense the environment, then act on it.
            self.agent.precepts(self.env.state)
            self.env.state = self.agent.effectors(self.env.state)
            if self.plot:
                save_figure(self.env.x, self.env.y, self.env.state,
                            [self.agent.position[0]], [self.agent.position[1]])
            else:
                sleep(0.5)
            # Once the time budget is spent, send the agent home and let it
            # take one more sense/act step.
            if (datetime.now() - start).seconds >= duration:
                self.agent.gohome()
                self.agent.precepts(self.env.state)
                self.env.state = self.agent.effectors(self.env.state)
                if self.plot:
                    save_figure(self.env.x, self.env.y, self.env.state,
                                [self.agent.position[0]], [self.agent.position[1]])
                else:
                    sleep(0.5)
        rwd, clnd = self.agent.reward, self.agent.cleaned
        self.agent.reset()
        return rwd, clnd
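# A minimal usage sketch of the class above. The location, shape, and
# duration values here are illustrative assumptions, not values taken
# from the project.
if __name__ == "__main__":
    sim = Simulation(location="living_room", shape=(10, 10), plot=False)
    reward, cleaned = sim.run(duration=60)
    print("reward:", reward, "cells cleaned:", cleaned)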
import numpy as np
# SummaryWriter is assumed to come from torch.utils.tensorboard (or the
# compatible tensorboardX package).
from torch.utils.tensorboard import SummaryWriter

# Objects
agent1 = Agent()
agent2 = Agent()

# Initializations
writer = SummaryWriter(comment="-q-iteration")
iter_no = 0

# Q-learning algorithm
profits = np.zeros((ITER_BREAK + 2, NUM_EPISODES + 2))
for ep in range(NUM_EPISODES):
    print(ep)
    # 1: initialise Qs
    env.reset()
    agent1.reset()
    agent2.reset()
    iter_no = 0
    s_next = 0
    while True:
        iter_no += 1
        # Exploration schedule: eps approaches 1 as iterations accumulate.
        eps = 1 - np.exp(-BETA * iter_no)
        # 2: agents choose actions simultaneously.
        action1 = agent1.act(eps)
        action2 = agent2.act(eps)
        # Encode the joint action as a single index in base nA.
        action = action1 * nA + action2
        # 3: outcomes are calculated.
        s = s_next
        s_next, reward_n, done, prob = env.step(action)
        # 4: Bellman updates.
        agent1.value_update(s, action1, reward_n[0], s_next)
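# For reference, a sketch of the one-step tabular Q-learning update that a
# call like agent1.value_update(s, action1, reward_n[0], s_next) typically
# performs. This is an illustrative assumption: the actual Agent class is
# not shown above, and alpha/gamma are hypothetical hyperparameters.
def q_learning_update(Q, s, a, r, s_next, alpha=0.1, gamma=0.95):
    # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
    Q[s, a] += alpha * (r + gamma * np.max(Q[s_next]) - Q[s, a])
    return Q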