def __init__(self, states, actions, observations, observation_function, transition_function, reward_function, belief_points, gamma):
    POMDP.__init__(self, states, actions, observations, observation_function, transition_function, reward_function, gamma)
    self.belief_points = belief_points
    self.t = 0
    self.compute_gamma_reward()
def __init__(self, states, actions, observations, observation_function, transition_function, reward_function, gamma, pruning_beliefs):
    POMDP.__init__(self, states, actions, observations, observation_function, transition_function, reward_function, gamma)
    self.compute_gamma_reward()
    self.t = 0
    self.pruning_beliefs = pruning_beliefs
def __init__(self, states, actions, observations, observation_function, transition_function, reward_function, gamma, initial_belief, c=0.5):
    POMDP.__init__(self, states, actions, observations, observation_function, transition_function, reward_function, gamma)
    self.tree = Tree()
    self.initial_belief = initial_belief
    self.c = c
    self.a_selected = None
def __init__(self, num_players, values, T_fail=0, decay_rate=1, discount_factor=.5, num_round=25, search_max=120, depth=2, win_value=0):
    self.decay = decay_rate
    self.discount = discount_factor
    self.N = num_players
    self.values = values
    self.depth = depth
    self.win_value = win_value
    alpha_max = search_max + num_round * (self.N + 1)
    beta_max = search_max + num_round * (self.N + 1)
    self.T = np.zeros([num_round - 1, alpha_max, beta_max, 2])
    self.V = np.zeros([alpha_max, beta_max, num_round])      ## Value function (maximum expected reward)
    self.Q = np.zeros([alpha_max, beta_max, num_round, 2])   ## Maximum expected reward of each action
    self.P = np.zeros([alpha_max, beta_max, num_round])      ## Policy function (best action)
    # At depth 2 the opponent models are plain POMDPs; at greater depth they are
    # recursively solved IPOMDPs one level shallower.
    if depth == 2:
        self.preQ1 = POMDP.loadPomdpQVals(num_players, [1.5, 2.5], T_fail, decay_rate, discount_factor, num_round, search_max, win_value)
        self.preQ0 = POMDP.loadPomdpQVals(num_players, [2.5, 1.5], T_fail, decay_rate, discount_factor, num_round, search_max, win_value)
    else:
        self.preQ1 = IPOMDP.loadPomdpQVals(num_players, [1.5, 2.5], T_fail, decay_rate, discount_factor, num_round, search_max, depth - 1, win_value)
        self.preQ0 = IPOMDP.loadPomdpQVals(num_players, [2.5, 1.5], T_fail, decay_rate, discount_factor, num_round, search_max, depth - 1, win_value)
    self.findAllValues()
def MinMDPTest():
    score = 0
    f = open('tests/MinMDP', 'r')
    contents = [x.strip() for x in f.readlines() if not x.isspace()]
    i = 0
    while i < len(contents):
        line = contents[i]
        if line.startswith('#'):
            print(line)
        elif line.startswith('Environment'):
            model_name = line.split()[1]
            model_file = 'examples/env/' + model_name + '.pomdp'
            print('Environment:', model_name)
            pomdp = POMDP(model_file)
            min_mdp = MinMDP(pomdp, .01)
        elif line.startswith('Belief'):
            pieces = [x for x in line.split() if x.find(':') == -1]
            belief = np.array([float(x) for x in pieces])
            print('Belief =', belief)
        elif line.startswith('Value'):
            value = float(line.split()[1])
            ans_value = min_mdp.getValue(belief)
            print("Value by MinMDP:", value, "Your answer:", ans_value)
        elif line.startswith('Action'):
            action = int(line.split()[1])
            ans_action = min_mdp.chooseAction(belief)
            print("Action by MinMDP:", action, "Your answer:", ans_action)
            if abs(ans_value - value) < .01 and action == ans_action:
                score += 1
                print("PASS")
            else:
                print("FAIL")
        elif line.startswith('Runs'):
            num_runs = int(line.split()[1])
            ans_total_reward = min_mdp.evaluate(num_runs)
        elif line.startswith('Reward'):
            total_reward = float(line.split()[1])
            print("Reward by MinMDP:", total_reward, "Your answer:", ans_total_reward)
        elif line.startswith('Error'):
            error = float(line.split()[1])
            if abs(total_reward - ans_total_reward) < error:
                score += 1
                print("PASS")
            else:
                print("FAIL")
        else:
            raise Exception("Unrecognized line: " + line)
        i += 1
    print("Total score out of 3:", score)
    return score
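# A hypothetical sketch of the format of 'tests/MinMDP', inferred only from the
# branches the parser above handles; the angle-bracketed fields are placeholders,
# not real grading values:
#
#   # comment line, echoed to stdout
#   Environment <name>              # loads examples/env/<name>.pomdp
#   Belief: <p0> <p1> ...           # tokens containing ':' are dropped
#   Value <expected value>
#   Action <expected action index>
#   Runs <number of evaluation runs>
#   Reward <expected average reward>
#   Error <allowed absolute error>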
def AEMS2Test():
    score = 0
    f = open('tests/AEMS2', 'r')
    contents = [x.strip() for x in f.readlines() if not x.isspace()]
    i = 0
    maxStart = time.time()
    print("start time", maxStart)
    while i < len(contents):
        line = contents[i]
        if line.startswith('#'):
            print(line)
        elif line.startswith('Environment'):
            model_name = line.split()[1]
            print('Environment:', model_name)
            model_file = 'examples/env/' + model_name + '.pomdp'
            pomdp = POMDP(model_file)
            qmdp = QMDP(pomdp, .01)
            m_mdp = MinMDP(pomdp, .01)
        elif line.startswith('Time'):
            time_limit = float(line.split()[1])
            print("Time limit (sec):", time_limit)
        elif line.startswith('Runs'):
            start = time.time()
            print("start time ", i, "=", start)
            num_runs = int(line.split()[1])
            sum_reward = 0
            for run in range(num_runs):
                solver = AEMS2(pomdp, m_mdp, qmdp, .01, time_limit)
                sum_reward += OnlineSolver.solve(solver)
            ans_total_reward = sum_reward / num_runs
            print(ans_total_reward)
        elif line.startswith('Reward'):
            total_reward = float(line.split()[1])
            print("Minimum required reward:", total_reward, "Your answer:", ans_total_reward)
            if total_reward <= ans_total_reward:
                score += 4
                print("PASS")
            else:
                print("FAIL")
            print("time: ", i, "=", time.time() - start)
        else:
            raise Exception("Unrecognized line: " + line)
        i += 1
    print("time:", time.time() - maxStart)
    print("Total score out of 8:", score)
    return score
good_MC = mdp.construct_MC(goodpolicy, 'Examples/7x5_good.txt')
bad_MC = mdp.construct_MC(badpolicy, 'Examples/7x5_bad.txt')

# Construct the product MDP of the gridworld and the bad-policy Markov chain:
# a joint transition has probability p1 * p2 and is kept only if it is nonzero.
states = [(s1, s2) for s1 in gwg.states for s2 in gwg.states]
product_trans = []
for s1 in states:
    for s2 in states:
        for a in alphabet:
            p1 = gwg.prob[gwg.actlist[a]][s1[0]][s2[0]]
            p2 = bad_MC[(s1[1], s2[1])]
            if p1 * p2 > 0:
                product_trans.append((s1, a, s2, p1 * p2))
product_mdp = MDP(states, set(alphabet), product_trans)
product_pomdp = POMDP(product_mdp, gwg)
product_mdp.write_to_file('Examples/7x5_productmdp_bad', (30, 4))
product_pomdp.write_to_file('Examples/7x5_productpomdp_bad', (30, 4))

# Construct the product MDP of the gridworld and the good-policy Markov chain.
states = [(s1, s2) for s1 in gwg.states for s2 in gwg.states]
product_trans2 = []
for s1 in states:
    for s2 in states:
        for a in alphabet:
            p1 = gwg.prob[gwg.actlist[a]][s1[0]][s2[0]]
            p2 = good_MC[(s1[1], s2[1])]
            if p1 * p2 > 0:
                product_trans2.append((s1, a, s2, p1 * p2))
product_mdp2 = MDP(states, set(alphabet), product_trans2)
import numpy as np
import matplotlib.pyplot as plt

# Model parameters for the two-state corridor example.
time_horizon = 20
pz = 0.7
pu3 = 0.8
lava_cost_f = -100
lava_cost_b = -50
door_cost = 100
u3_cost = -1
samples = 1000

planner = POMDP(pz, pu3, lava_cost_f, lava_cost_b, door_cost, u3_cost, samples, prune=True)
for T in range(time_horizon):
    obs = planner.observe()
    pred = planner.predict()
    # print(pred)
    book_lines = np.array([[door_cost, lava_cost_f], [lava_cost_b, door_cost]])
    plt.figure()
    plt.plot([0, 1], book_lines.T, 'r--')
    plt.plot([planner.prob_turn_start, planner.prob_turn_start], [-500, 500], 'g:')
    plt.plot([planner.prob_turn_end, planner.prob_turn_end], [-500, 500], 'g:')
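# A minimal, self-contained sketch (not part of the planner's API) of what the two
# dashed "book lines" above represent: each terminal action's expected one-step
# reward is linear in the belief, and the one-step value function is the pointwise
# maximum of the two lines. All names below are illustrative assumptions.
import numpy as np

door_cost, lava_cost_f, lava_cost_b = 100, -100, -50
p = np.linspace(0.0, 1.0, 101)                       # belief grid over the two states
r_forward = (1 - p) * door_cost + p * lava_cost_f    # line from (0, door_cost) to (1, lava_cost_f)
r_backward = (1 - p) * lava_cost_b + p * door_cost   # line from (0, lava_cost_b) to (1, door_cost)
one_step_value = np.maximum(r_forward, r_backward)   # upper envelope of the two action lines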
from pomdp import POMDP
from environment import Environment
from onlineSolver import OnlineSolver
from offlineSolver import OfflineSolver
from policyReader import PolicyReader
from aems import AEMS2
from mdpSolver import QMDP, MinMDP
import sys

if len(sys.argv) == 5:  # offline solver
    model_name = sys.argv[2]
    model_file = 'examples/env/' + model_name + '.pomdp'
    pomdp = POMDP(model_file)
    num_runs = int(sys.argv[3])
    precision = float(sys.argv[4])
    if sys.argv[1] == "QMDP":
        solver = QMDP(pomdp, precision)
    elif sys.argv[1] == "MinMDP":
        solver = MinMDP(pomdp, precision)
    else:
        raise Exception("Invalid offline solver: ", sys.argv[1])
    print("Average reward: ", solver.evaluate(num_runs))
elif len(sys.argv) == 8:  # online solver
    if sys.argv[1] != "AEMS2":
        raise Exception("Invalid online solver: ", sys.argv[1])
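# Hypothetical usage (the script's filename is not shown in this snippet, so
# "main.py" is an assumption, and <model> is any .pomdp file under examples/env/):
#
#   python main.py QMDP   <model> <num_runs> <precision>    # offline solver, 4 arguments
#   python main.py MinMDP <model> <num_runs> <precision>    # offline solver, 4 arguments
#
# The AEMS2 branch expects 7 arguments (len(sys.argv) == 8); its argument parsing
# is truncated above, so the exact argument order is not reproduced here.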
#!/usr/bin/env python3
from pomdp import POMDP
import matplotlib.pyplot as plt

# ------------------------------------------------------------------
# Summary:
# Example implementation of the partially observable Markov decision process
# (POMDP) class for a simplified two-state robot model.

if __name__ == "__main__":
    pomdp = POMDP()  # POMDP solver object
    pomdp.CreateValueMap()
    pomdp.Play()
    plt.show()