def markovDecision(layout, circle, n_episodes=50):
    """Run a random-policy agent on a Snakes-and-Ladders MDP.

    Builds a ``SnakesAndLadder`` environment from the given board and
    trains/updates a ``RandomAgent`` over ``n_episodes`` episodes.

    Parameters
    ----------
    layout : board layout, passed through to ``SnakesAndLadder``.
    circle : circular-board flag, passed through to ``SnakesAndLadder``.
    n_episodes : int, optional
        Number of episodes to run (default 50, the value previously
        hard-coded inside the function).

    NOTE(review): despite the name, this returns nothing — presumably it
    should return expected costs / a policy; confirm against callers.
    """
    env = SnakesAndLadder(layout, circle)
    agent = RandomAgent(env.action_space)
    for _ in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.select_action(state)
            # Env follows a gym-like (state, reward, done) step contract.
            next_state, reward, done = env.step(action)
            agent.update(state, action, reward, next_state)
            state = next_state
# Iterated two-player game: a mostly-cooperating agent against a
# mostly-defecting one, played on `env` (defined elsewhere in this file).
possible_actions = [0, 1]  # Cooperate or Defect
cooperator = RandomAgent(possible_actions, p=0.9)
defector = RandomAgent(possible_actions, p=0.1)

# Stateless interactions (agents do not have memory).
s = None
n_iter = 1000
for i in range(n_iter):
    # A full episode:
    done = False
    while not done:
        # Agents decide.
        a0 = cooperator.act()
        a1 = defector.act()
        # World changes.
        new_s, (r0, r1), done, _ = env.step(([a0], [a1]))
        # Agents learn — each update sees (own action, opponent action)
        # and (own reward, opponent reward) from its own perspective.
        cooperator.update(s, (a0, a1), (r0, r1), new_s)
        defector.update(s, (a1, a0), (r1, r0), new_s)
        s = new_s
    # Rewards from the final step of the episode.
    print(r0, r1)
    # FIX: the original called env.reset() but discarded its return value,
    # so `s` carried the previous episode's terminal state into the next
    # episode's first update. reset() returns the initial state here
    # (see markovDecision: `state = env.reset()`).
    s = env.reset()