transitions.add((7, 'W', 7, 1))
mdp = MDP(states, acts, transitions)

# Define rewards: a per-state reward table, lifted onto (state, action) pairs.
Rs = {s: 0 for s in states}
Rs[1] = 0
Rs[2] = -10
Rs[3] = 1
Rs[4] = 0
Rs[5] = 1
Rs[6] = 1
Rs[7] = 2
# Assumes mdp.transitions yields (s, a, t) triples; the reward depends only on the source state.
R = {(s, a): Rs[s] for (s, a, t) in mdp.transitions}

V, policy = mdp.T_step_value_iteration(R)
print(V, policy)

# ## Abstracted MDP
# states = {1, 7}
# acts = {'N', 'S', 'E', 'W'}
# transitions = set()  # transitions are: (s, a, s', p)
# transitions.add((1, 'N', 1, 1))
# transitions.add((1, 'S', 1, 0.8))
# transitions.add((1, 'S', 7, 0.2))
# transitions.add((1, 'E', 1, 1))
# transitions.add((1, 'W', 1, 1))
# transitions.add((7, 'N', 7, 1))
# transitions.add((7, 'S', 7, 1))
# transitions.add((7, 'E', 7, 1))
# transitions.add((7, 'W', 7, 1))
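# For reference, a minimal self-contained sketch of the finite-horizon
# ("T-step") value iteration used above. This is only an illustrative
# assumption about what T_step_value_iteration computes, not the MDP class's
# actual implementation: here P is an explicit dict of transition
# probabilities keyed by (s, a, t) and R is keyed by (s, a).
def t_step_value_iteration_sketch(states, actions, P, R, T=10):
    V = {s: 0.0 for s in states}            # value-to-go with no steps left
    policy = {s: None for s in states}
    for _ in range(T):
        V_new = {}
        for s in states:
            best_a, best_q = None, float('-inf')
            for a in actions:
                # Bellman backup: immediate reward plus expected successor value.
                q = R.get((s, a), 0.0) + sum(
                    p * V[t] for (ss, aa, t), p in P.items() if ss == s and aa == a)
                if q > best_q:
                    best_a, best_q = a, q
            V_new[s] = best_q
            policy[s] = best_a
        V = V_new
    return V, policy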
alphabet = [0, 1, 2, 3]  # North, south, west, east
transitions = []
for s in states:
    for a in alphabet:
        for t in np.nonzero(gwg.prob[gwg.actlist[a]][s])[0]:
            p = gwg.prob[gwg.actlist[a]][s][t]
            transitions.append((s, alphabet.index(a), t, p))
mdp = MDP(states, set(alphabet), transitions)
# V, goodpolicy = mdp.max_reach_prob(goodtargets, epsilon=0.0001)
# V, badpolicy = mdp.max_reach_prob(badtargets, epsilon=0.0001)
randomness = 0

# Reward 1.0 for transitions that stay inside the "good" target set, 0.0 elsewhere.
R = {(s, a, next_s): 0.0 for s in mdp.states for a in mdp.available(s)
     for next_s in mdp.post(s, a)}
R.update({(s, a, next_s): 1.0 for s in mdp.states for a in mdp.available(s)
          for next_s in mdp.post(s, a) if next_s in goodtargets and s in goodtargets})
V, goodpolicy = mdp.T_step_value_iteration(R, 10)

# Same construction for the "bad" target set.
R = {(s, a, next_s): 0.0 for s in mdp.states for a in mdp.available(s)
     for next_s in mdp.post(s, a)}
R.update({(s, a, next_s): 1.0 for s in mdp.states for a in mdp.available(s)
          for next_s in mdp.post(s, a) if next_s in badtargets and s in badtargets})
V, badpolicy = mdp.T_step_value_iteration(R, 10)

good_MC = mdp.construct_MC(goodpolicy, 'Examples/7x5_good.txt')
bad_MC = mdp.construct_MC(badpolicy, 'Examples/7x5_bad.txt')

# Construct the product MDP over pairs (agent state, bad-chain state).
states = [(s1, s2) for s1 in gwg.states for s2 in gwg.states]
product_trans = []
for s1 in states:
    for s2 in states:
        for a in alphabet:
            p1 = gwg.prob[gwg.actlist[a]][s1[0]][s2[0]]
            p2 = bad_MC[(s1[1], s2[1])]
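# For reference, a minimal sketch of how a (possibly set-valued) policy
# induces a Markov chain over states, which is the role construct_MC plays
# above. This is only an illustrative assumption, not the MDP class's actual
# implementation: P is a dict of transition probabilities keyed by (s, a, t),
# policy maps each state to a set of actions played uniformly at random, and
# the result is keyed by (s, t), matching how bad_MC is indexed above.
def induce_markov_chain_sketch(P, policy):
    M = {}
    for (s, a, t), p in P.items():
        if a in policy[s]:
            # Average the action-conditioned kernels over the allowed actions.
            M[(s, t)] = M.get((s, t), 0.0) + p / len(policy[s])
    return M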
            # Randomness-weighted share of action a's transition probabilities for behaviour ab.
            w = (1.0 / (gwg.nstates - len(ab[s]))) * randomness
            # tempdict = dict([(s, a, t),0.0] for t in states)
            for t in np.nonzero(gwg.prob[gwg.actlist[a]][s])[0]:
                p = gwg.prob[gwg.actlist[a]][s][t]
                transdict[(s, agentbehaviours.index(ab), t)] += p * w
        for t in states:
            transitions.append((s, agentbehaviours.index(ab), t,
                                transdict[(s, agentbehaviours.index(ab), t)]))

mdp1 = MDP(states, alphabet=range(2), transitions=transitions)

# Reward 1.0 for transitions that enter the first target set from outside it.
R = {(s, a, next_s): 0.0 for s in mdp.states for a in mdp.available(s)
     for next_s in mdp.post(s, a)}
R.update({(s, a, next_s): 1.0 for s in mdp.states for a in mdp.available(s)
          for next_s in mdp.post(s, a) if next_s in targets[0] and s not in targets[0]})
V, policyT = mdp.T_step_value_iteration(R, T=20)

# Two agent behaviours: one allowing every action, and the T-step policy computed above.
policyT1 = {s: set(range(gwg.nactions)) for s in mdp.states}
agentbehaviours = [policyT1, policyT]

# For each behaviour, average the gridworld kernels uniformly over its allowed actions.
transitions = []
for ab in agentbehaviours:
    for s in states:
        transdict = {(s, agentbehaviours.index(ab), t): 0.0 for t in states}
        for a in ab[s]:
            # tempdict = dict([(s, a, t),0.0] for t in states)
            for t in np.nonzero(gwg.prob[gwg.actlist[a]][s])[0]:
                p = gwg.prob[gwg.actlist[a]][s][t]
                transdict[(s, agentbehaviours.index(ab), t)] += p * 1.0 / len(ab[s])
        for t in states: