def modified_policy_iteration(mdp: MDP, gamma: float, epsilon: float, k: int = 5) -> Tuple[Dict, Dict]:
    # Start from an arbitrary deterministic policy and a zero value function.
    # (random.choice on a list replaces random.sample on a set, which is
    # unsupported in Python 3.11+.)
    random_a = random.choice(list(mdp.A))
    pi = {s: random_a for s in mdp.S}
    V = {s: 0. for s in mdp.S}
    while True:
        # Partial policy evaluation: k sweeps of the Bellman backup under pi.
        for i in range(k):
            for s in mdp.S:
                V[s] = mdp.R(s, pi[s]) + gamma * sum(
                    mdp.P(s_prime, s, pi[s]) * V[s_prime] for s_prime in mdp.S
                )
        # Greedy improvement sweep; delta tracks the largest value change.
        delta = 0.
        for s in mdp.S:
            V_old = V[s]
            V_new = {
                a: mdp.R(s, a) + gamma * sum(
                    mdp.P(s_prime, s, a) * V[s_prime] for s_prime in mdp.S
                )
                for a in mdp.A
            }
            pi[s] = max(V_new, key=V_new.get)
            V[s] = max(V_new.values())
            delta = max(delta, abs(V[s] - V_old))
        if delta <= epsilon:
            break
    return pi, V
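# Minimal usage sketch for modified_policy_iteration (an addition, not part of
# the original source). It assumes the callable interface used above, with
# mdp.S and mdp.A as collections and mdp.R(s, a) / mdp.P(s_prime, s, a) as
# functions, and fakes it with a SimpleNamespace. The _toy_mdp helper and its
# dynamics are hypothetical.
from types import SimpleNamespace

def _toy_mdp():
    S = ["s0", "s1"]
    A = ["stay", "go"]

    def P(s_prime, s, a):
        # "go" deterministically switches states; "stay" keeps the current one.
        if a == "go":
            return 1.0 if s_prime != s else 0.0
        return 1.0 if s_prime == s else 0.0

    def R(s, a):
        # Only occupying s1 yields reward.
        return 1.0 if s == "s1" else 0.0

    return SimpleNamespace(S=S, A=A, P=P, R=R)

# pi, V = modified_policy_iteration(_toy_mdp(), gamma=0.9, epsilon=1e-6)
# Expected: pi == {"s0": "go", "s1": "stay"} and V["s1"] > V["s0"].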
class Robot:
    def __init__(self):
        rospy.init_node("robot")
        self.config = read_config()
        self.astar_pub = rospy.Publisher("/results/path_list", AStarPath, queue_size=10)
        self.sim_complete_pub = rospy.Publisher("/map_node/sim_complete", Bool, queue_size=10)

        # Publish the A* path.
        self.path_array = []
        self.publish_astar()

        # Publish the MDP policy, then signal completion and shut down.
        print("running mdp")
        self.mdp = MDP()
        self.mdp.make_policy()
        rospy.sleep(1)
        self.sim_complete_pub.publish(True)
        rospy.sleep(1)
        # signal_shutdown expects a reason string, not the class object.
        rospy.signal_shutdown("robot finished")

    def publish_astar(self):
        obj = Astar()
        obj.astar_func(self.path_array)
        for i in range(len(self.path_array)):
            rospy.sleep(1)
            msg = AStarPath()
            msg.data = self.path_array[i]
            self.astar_pub.publish(msg)
def __init__(self, env, gamma=.99):
    grid = EnvMDP.to_grid_matrix(env)
    reward = {}
    states = set()
    self.rows = len(grid)
    self.cols = len(grid[0])
    self.grid = grid
    for x in range(self.cols):
        for y in range(self.rows):
            if grid[y][x] is not None:
                states.add((x, y))
                reward[(x, y)] = grid[y][x]
    self.states = states
    terminals = EnvMDP.to_position(env, letter=b'GH')
    actlist = list(range(env.action_space.n))
    transitions = EnvMDP.to_transitions(env)
    init = EnvMDP.to_position(env, letter=b'S')[0]
    MDP.__init__(self, init, actlist=actlist, terminals=terminals,
                 transitions=transitions, reward=reward, states=states,
                 gamma=gamma)
def recommend_pathway(user_jobs, job_graph, goal_state, min_likelihood_thr):
    """ Recommend a pathway, given the sequence of job titles.
        Only the user's first job is used to seed the MDP. """
    user_jobs_for_mdp = [user_jobs[0]]
    mdp = MDP(job_graph, user_jobs_for_mdp, goal_state,
              min_likelihood_thr=min_likelihood_thr)
    return mdp.solve_mdp()
def update():
    data = json.loads(request.data)
    size = data['size']
    state_rewards_list = data['state_rewards_list']
    state_rewards_dict = {tuple(k): v for k, v in state_rewards_list}
    blocked_states_list = [tuple(s) for s in data['blocked_states_list']]
    discount = data['discount']
    started = data['started']
    values = np.array(data['values'])
    policy = np.array(data['policy'])
    print(blocked_states_list)
    if started:
        mdp = MDP(state_rewards_dict, blocked_states_list, discount, size, values, policy)
    else:
        mdp = MDP(state_rewards_dict, blocked_states_list, discount, size)
    table = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy, mdp.blocked_states_list)
    return json.dumps({'table': table,
                       'values': mdp.values.tolist(),
                       'policy': mdp.policy.tolist()})
def random_vs_sarsa():
    plt.clf()
    # Sarsa
    sarsa = np.zeros(num_episodes + 1)
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1,
            king_moves=True, stochastic_wind=True)
    epsilon, alpha = 0.1, 0.5
    for seed in range(num_trials):
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        sarsa += np.array(episodes)
    sarsa = np.cumsum(sarsa) / num_trials
    # Random walk
    random_walk = np.zeros(num_episodes + 1)
    for seed in range(num_trials):
        episodes = a.random_walk(seed, num_episodes)
        random_walk += np.array(episodes)
    random_walk = np.cumsum(random_walk) / num_trials
    plt.clf()
    y = np.arange(num_episodes + 1)
    plt.plot(sarsa, y, label='sarsa')
    plt.plot(random_walk, y, label='random walk')
    plt.xlabel("Time step")
    plt.ylabel("Episodes")
    plt.title("Sarsa(0) agent with 8 moves, stochastic wind")
    plt.grid(True)
    plt.legend()
    # plt.show()
    plt.savefig("plots/random_walk.png")
def task_5():
    plt.clf()
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1,
            king_moves=False, stochastic_wind=False)
    epsilon, alpha = 0.1, 0.5
    episodes_avg_t = np.zeros((3, num_episodes + 1))
    for seed in range(num_trials):
        # Sarsa
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        episodes_avg_t[0, :] += np.cumsum(np.array(episodes))
        # Q-learning
        episodes = a.q_learning(seed, num_episodes, epsilon, alpha)
        episodes_avg_t[1, :] += np.cumsum(np.array(episodes))
        # Expected Sarsa
        episodes = a.expected_sarsa(seed, num_episodes, epsilon, alpha)
        episodes_avg_t[2, :] += np.cumsum(np.array(episodes))
    # Normalise over trials.
    episodes_avg_t /= num_trials
    # y axis
    y = np.arange(num_episodes + 1)
    plt.title("Comparison of various algos\n4 moves, no stochastic wind")
    plt.plot(episodes_avg_t[0], y, label='Sarsa')
    plt.plot(episodes_avg_t[1], y, label='Q-Learning')
    plt.plot(episodes_avg_t[2], y, label='Expected Sarsa')
    plt.xlabel("Time step")
    plt.ylabel("Episodes")
    plt.grid(True)
    plt.legend()
    # plt.show()
    plt.savefig("plots/t5.png")
def part_iii_evaluation(sim_filename):
    print(sim_filename)
    mdp = MDP("blank_2_actions_81_states_mdp.txt")
    results = []
    # Prior: a pseudocount of 0.1 for every (action, state, state') transition.
    transition_count = [[[0.1 for _ in range(81)] for _ in range(81)]
                        for _ in range(2)]
    for n in range(10):
        print("Big loop " + str(n))
        results.append([])
        for i in range(100):
            mdp, transition_count = adp_rl(mdp, Sim(MDP(sim_filename)), transition_count)
        value_fn, policy, iterations = plan(mdp, 0.99, 0.01)
        print("Value: " + str(value_fn))
        print("Policy: " + str(policy))
        # print("Reward: " + str(mdp.rewards))
        # print("Transitions: " + str(mdp.transitions))
        for i in range(100):
            reward = run_policy(Sim(MDP(sim_filename)), policy)
            results[n].append(reward)
        print("Average reward of policy: " + str(average(results[n])))
    for l in results:
        print(average(l))
def modelBasedRL(self, s0, defaultT, initialR, nEpisodes, nSteps, epsilon=0):
    '''Model-based Reinforcement Learning with epsilon-greedy exploration.
    This function uses policy iteration to update the policy at each step.

    Inputs:
    s0 -- initial state
    defaultT -- default transition function when a state-action pair has not been visited
    initialR -- initial estimate of the reward function
    nEpisodes -- # of episodes (one episode consists of a trajectory of nSteps that starts in s0)
    nSteps -- # of steps per episode
    epsilon -- probability with which an action is chosen at random

    Outputs:
    V -- final value function
    policy -- final policy
    cumu_reward_lst -- discounted cumulative reward per episode
    '''
    # Transition pseudocounts, per-episode returns, and the model estimate.
    count_triple = np.ones([self.mdp.nActions, self.mdp.nStates, self.mdp.nStates])
    cumu_reward_lst = np.zeros(nEpisodes)
    V = np.zeros(self.mdp.nStates)
    policy = np.zeros(self.mdp.nStates, int)
    mdp_tmp = MDP(defaultT, initialR, self.mdp.E, self.mdp.discount)
    for iterEp in range(nEpisodes):
        state = s0
        for iterSt in range(nSteps):
            # Epsilon-greedy action selection.
            if np.random.rand(1) < epsilon:
                action = np.random.randint(self.mdp.nActions)
            else:
                action = policy[state]
            [nextState, reward, done] = self.sampleRewardAndNextState(state, action)
            cumu_reward_lst[iterEp] += self.mdp.discount ** iterSt * reward
            # Update the empirical transition and reward model.
            count_triple[action, state, nextState] += 1
            count_double = np.sum(count_triple[action, state, :])
            mdp_tmp.T[action, state, :] = count_triple[action, state, :] / count_double
            mdp_tmp.R[action, state] = (reward + (count_double - 1) * mdp_tmp.R[action, state]) / count_double
            # Replan against the updated model.
            [policy, V, iterId] = mdp_tmp.policyIteration(policy)
            state = nextState
            if done:
                break
    return [V, policy, cumu_reward_lst]
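# Hedged usage sketch for modelBasedRL (an addition, not in the original
# source). The wrapping RL class and its RL(mdp, reward_sampler) constructor
# are assumed from the maze-generator snippet later in this section; the
# hyperparameter values are illustrative only:
#
#   defaultT = np.ones((mdp.nActions, mdp.nStates, mdp.nStates)) / mdp.nStates
#   initialR = np.zeros((mdp.nActions, mdp.nStates))
#   rl = RL(mdp, np.random.normal)
#   V, policy, rewards = rl.modelBasedRL(s0=0, defaultT=defaultT,
#                                        initialR=initialR, nEpisodes=200,
#                                        nSteps=100, epsilon=0.3)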
def __init__(self, grid, goalVals, discount=.99, tau=.01, epsilon=.001):
    MDP.__init__(self, discount=discount, tau=tau, epsilon=epsilon)
    self.goalVals = goalVals
    self.grid = grid
    self.setGridWorld()
    self.valueIteration()
    self.extractPolicy()
def __init__(self, hist_duration, mdp_step, time_step, action_size, batch_size,
             mean, std, hdg0, src_file, sim_time):
    self.mdp = MDP(hist_duration, mdp_step, time_step)
    self.action_size = action_size
    self.agent = PolicyLearner(self.mdp.size, action_size, batch_size)
    self.agent.load(src_file)
    self.wh = wind(mean, std, int(mdp_step / time_step))
    self.hdg0 = hdg0
    self.src = src_file
    self.sim_time = sim_time
def runMDP(self):
    mdp = MDP(self.config)
    print("TESTING")
    while mdp.renameThis():
        # print("Iterate")
        result_policy = mdp.iterate()
        # print("publish")
        # print("Will Continue?", mdp.renameThis())
        print(result_policy)
        self.policies = result_policy
    util.print_2d_map(self.grid)
def __init__(self):
    self._states = 1  # default value; adjusted later once the environment is known
    self._mdp = MDP(1)
    self._qvalues = np.zeros((self._states, 4))
    self._vvalues = np.zeros(self._states)
    self._policy = np.ones((self._states, 4)) / 4
    self._learningRate = 0.8
    self._epsilonDecay = -0.005
    self._epsilon = 1.0
    self._epsilonMin = 0.01
    self._epsilonMax = 1.0
    self._count = 0
def task_2():
    # Simple 4-move agent
    res = np.zeros(num_episodes + 1)
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1,
            king_moves=False, stochastic_wind=False)
    epsilon, alpha = 0.1, 0.5
    for seed in range(num_trials):
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        res += np.cumsum(np.array(episodes))
    res /= num_trials
    plot_one(res, 'Sarsa(0), 4 move agent', 't2.png')
    return res
def task_4():
    # Stochastic wind
    res = np.zeros(num_episodes + 1)
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1,
            king_moves=True, stochastic_wind=True)
    epsilon, alpha = 0.1, 0.5
    for seed in range(num_trials):
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        res += np.array(episodes)
    res = np.cumsum(res) / num_trials
    plot_one(res, 'Sarsa(0), 8 move agent, stochastic wind', 't4.png')
    return res
def __init__(self, desc=None, map_name="4x4", slip_chance=0.2):
    if desc is None and map_name is None:
        raise ValueError('Must provide either desc or map_name')
    elif desc is None:
        desc = self.MAPS[map_name]
    assert ''.join(desc).count('S') == 1, "this implementation supports having exactly one initial state"
    assert all(c in "SFHG" for c in ''.join(desc)), "all cells must be either of S, F, H or G"
    self.desc = desc = np.asarray(list(map(list, desc)), dtype='str')
    self.lastaction = None
    nrow, ncol = desc.shape
    states = [(i, j) for i in range(nrow) for j in range(ncol)]
    actions = ["left", "down", "right", "up"]
    # desc has str dtype, so compare against 'S' (b'S' never matches).
    initial_state = states[np.array(desc == 'S').ravel().argmax()]

    def move(row, col, movement):
        if movement == 'left':
            col = max(col - 1, 0)
        elif movement == 'down':
            row = min(row + 1, nrow - 1)
        elif movement == 'right':
            col = min(col + 1, ncol - 1)
        elif movement == 'up':
            row = max(row - 1, 0)
        else:
            raise ValueError("invalid action")
        return (row, col)

    transition_probs = {s: {} for s in states}
    rewards = {s: {} for s in states}
    for (row, col) in states:
        if desc[row, col] in "GH":
            continue
        for action_i in range(len(actions)):
            action = actions[action_i]
            transition_probs[(row, col)][action] = {}
            rewards[(row, col)][action] = {}
            # The agent may slip to either side of the intended direction.
            for movement_i in [(action_i - 1) % len(actions), action_i, (action_i + 1) % len(actions)]:
                movement = actions[movement_i]
                newrow, newcol = move(row, col, movement)
                prob = (1. - slip_chance) if movement == action else (slip_chance / 2.)
                if prob == 0:
                    continue
                if (newrow, newcol) not in transition_probs[row, col][action]:
                    transition_probs[row, col][action][newrow, newcol] = prob
                else:
                    transition_probs[row, col][action][newrow, newcol] += prob
                if desc[newrow, newcol] == 'G':
                    rewards[row, col][action][newrow, newcol] = 1.0
    MDP.__init__(self, transition_probs, rewards, initial_state)
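# Hedged sanity check for the nested transition dictionary built above (an
# added sketch, not part of the original class): for every non-terminal state
# and action, the outgoing probabilities should sum to 1, including the slip
# mass that the wall-clipping `move` helper folds back onto the same cell.
def check_frozen_lake_transitions(transition_probs, tol=1e-9):
    for s, actions in transition_probs.items():
        for a, dist in actions.items():
            total = sum(dist.values())
            assert abs(total - 1.0) < tol, "P(.|{}, {}) sums to {}".format(s, a, total)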
def graph_decay_score(scale, rand=False):
    """
    Function to generate a graph for the exponential decay score over a range of k
    :param scale: the limit to which k should vary
    :param rand: to use a random policy or not
    :return: None
    """
    fig = plt.figure()
    x = [i + 1 for i in range(scale)]
    y_decay = []
    for i in x:
        rs = MDP(path='data-mini', k=i)
        if rand:
            rs.initialise_mdp()
            y_decay.append(rs.evaluate_decay_score())
            continue
        rs.load('mdp-model_k=' + str(i) + '.pkl')
        y_decay.append(rs.evaluate_decay_score())
    plt.bar(x, y_decay, width=0.5, color=(0.2, 0.4, 0.6, 0.6))
    xlocs = [i + 1 for i in range(0, 10)]
    for i, v in enumerate(y_decay):
        plt.text(xlocs[i] - 0.46, v + 0.9, '%.2f' % v)
    plt.xticks(x)
    plt.yticks([i for i in range(0, 100, 10)])
    fig.suptitle('Avg Exponential Decay Score vs Number of items in each state')
    plt.xlabel('K')
    plt.ylabel('Score')
    plt.show()
def run(resolution, knn, lookahead, gamma, episodes, render=False):
    env = gym.make("MountainCar-v0")
    env_unwrapped = env.unwrapped
    discretizer = Discretizer(resolution, resolution, knn)
    print(f"Discretizing at resolution {resolution}, knn {knn}.")
    S, A, P, R = discretizer(env_unwrapped, zero_reward_if_done=True)
    mdp = MDP(S, A, P, R, 200, gamma)
    print(f"Running value iteration with lookahead {lookahead}.")
    V, pi, vi_iterations = value_iteration(mdp, lookahead=lookahead)
    steps = []
    for _ in range(episodes):
        observation = env.reset()
        for t in count(1):
            if render:
                env.render()
            # d_obs maps discrete states to interpolation weights; sample one.
            d_obs = discretizer.discretize_state(observation)
            action = pi[np.random.choice(list(d_obs.keys()), p=list(d_obs.values()))]
            observation, _, done, _ = env.step(action)
            if done:
                steps.append(t)
                break
    env.close()
    print(f"Average steps over {episodes} episodes: {np.mean(steps)}.")
    return vi_iterations, V, pi, steps
def __init__(self, initial, nrows=8, ncols=8, nagents=1, targets=[], obstacles=[],
             moveobstacles=[], regions=dict(), preferred_acts=set()):
    # Walls are the obstacles; the edges of the gridworld are included in the walls.
    # A region is a string, one of: ['pavement', 'gravel', 'grass', 'sand'].
    self.current = initial
    self.nrows = nrows
    self.ncols = ncols
    self.nagents = nagents
    self.nstates = nrows * ncols
    self.nactions = 5
    self.regions = regions
    self.actlist = ['N', 'S', 'W', 'E', 'R']
    self.targets = targets
    self.left_edge = []
    self.right_edge = []
    self.top_edge = []
    self.bottom_edge = []
    self.obstacles = obstacles
    self.moveobstacles = moveobstacles
    self.states = range(nrows * ncols)
    self.colorstates = set()
    for x in range(self.nstates):
        # Note that edges are not disjoint, so we cannot use elif.
        if x % self.ncols == 0:
            self.left_edge.append(x)
        if 0 <= x < self.ncols:
            self.top_edge.append(x)
        if x % self.ncols == self.ncols - 1:
            self.right_edge.append(x)
        if (self.nrows - 1) * self.ncols <= x <= self.nstates:
            self.bottom_edge.append(x)
    self.edges = self.left_edge + self.top_edge + self.right_edge + self.bottom_edge
    self.walls = self.edges + obstacles
    self.prob = {a: np.zeros((self.nstates, self.nstates)) for a in self.actlist}
    self.probOfSuccess = dict([])
    self.getProbRegions()
    for s in self.states:
        for a in self.actlist:
            self.getProbs(s, a)
    # Collect the nonzero entries of each per-action matrix as (s, a, t, p)
    # tuples. (self.prob[self.actlist[self.actlist.index(a)]] is just
    # self.prob[a].)
    transitions = set()
    for s in self.states:
        for a in self.actlist:
            for t in np.nonzero(self.prob[a][s])[0]:
                p = self.prob[a][s][t]
                transitions.add((s, a, t, p))
    self.mdp = MDP(self.states, self.actlist, transitions)
def main(userNum='59945701'):
    agent = MDP(path='data-mini', k=3)  # create an instance of the MDP class
    agent.initializeMDP()  # initialize states, actions, probabilities, and initial rewards
    rewardEvolution = agent.policyIteration()  # the algorithm that solves the MDP
    recommendation = agent.recommend(userNum)  # use the recommendation function
    evaluationRS = agent.evaluateRecommendationScore()  # evaluation score
    evaluationED = agent.evaluateDecayScore()  # another evaluation score
    return recommendation, evaluationRS, evaluationED, userNum, rewardEvolution
def Mazes_generator(self, batch_size):
    Mazes = []
    for MzIter in range(batch_size):
        [T, R, E] = maze_generator()
        mdp = MDP(T, R, E, self.rl.mdp.discount)
        rlSample = RL(mdp, np.random.normal)
        Mazes.append(rlSample)
    return Mazes
def async_value_iteration(mdp: MDP, gamma: float, num_iterations: int = 1000) -> Tuple[Dict, Dict]:
    Q = {(s, a): 0. for a in mdp.A for s in mdp.S}
    for _ in range(num_iterations):
        # Back up a single, randomly chosen state-action pair.
        # (random.choice on a list replaces random.sample on a set, which is
        # unsupported in Python 3.11+.)
        s = random.choice(list(mdp.S))
        a = random.choice(list(mdp.A))
        Q[(s, a)] = mdp.R(s, a) + gamma * sum(
            mdp.P(s_prime, s, a) * max(Q[(s_prime, a_prime)] for a_prime in mdp.A)
            for s_prime in mdp.S
        )
    # Extract the greedy policy from Q.
    pi = {}
    for s in mdp.S:
        values = {a: Q[(s, a)] for a in mdp.A}
        pi[s] = max(values, key=values.get)
    return pi, Q
def graph_recommendation_score(scale=4, m=10, with_comparison=False):
    """
    Function to generate a graph for the recommendation score over a range of m for a set of k
    :param scale: the limit to which k should vary
    :param m: a parameter in recommendation score computation
    :param with_comparison: plot a random policy's graph
    :return: None
    """
    fig = plt.figure()
    k = [i + 1 for i in range(1, scale)]
    x = [i + 1 for i in range(m)]
    for j in k:
        y_recommendation = []
        y_recommendation_rand = []
        rs = MDP(path='data-mini', k=j)
        rs.load('mdp-model_k=' + str(j) + '.pkl')
        for i in x:
            if with_comparison:
                rs.initialise_mdp()
                y_recommendation_rand.append(rs.evaluate_recommendation_score(m=i))
            y_recommendation.append(rs.evaluate_recommendation_score(m=i))
        plt.plot(x, y_recommendation,
                 color=(0.2 + (j - 2) * 0.4, 0.4, 0.6, 0.6),
                 label="MC model " + str(j))
        plt.scatter(x, y_recommendation, color=(0.2 + (j - 2) * 0.4, 0.4, 0.6, 0.6))
        if with_comparison:
            plt.plot(x, y_recommendation_rand, color=(0.2, 0.8, 0.6, 0.6),
                     label="Random model, For m=" + str(m))
            plt.scatter(x, y_recommendation_rand)
        plt.xticks(x)
        plt.yticks([i for i in range(20, 100, 10)])
        for x1, y in zip(x, y_recommendation):
            text = '%.2f' % y
            plt.text(x1, y, text)
        if with_comparison:
            for x1, y in zip(x, y_recommendation_rand):
                text = '%.2f' % y
                plt.text(x1, y, text)
    fig.suptitle('Recommendation Score vs Prediction List size')
    plt.xlabel('Prediction List size')
    plt.ylabel('Score')
    plt.legend()
    plt.show()
def beginSimulation(self):
    result_astar = astar(self.config)
    self.publishAStar(result_astar)
    mdp = MDP(self.config)  # TODO: input a grid
    print("TESTING")
    while mdp.renameThis():
        print("Iterate")
        result_policy = mdp.iterate()
        print("publish")
        self.resultsPolicyPub.publish(result_policy)
        print("Will Continue?", mdp.renameThis())
    self.simulationCompletePub.publish(True)
    rospy.sleep(10)
    rospy.signal_shutdown("Simulation has Completed")
def run_experiment3(grid, fleet, horizon):
    # Solve each MDP, run a simulation, and write its profile to CSV.
    mdp = MDP(fleet, grid, horizon, get_prices_func=deterministic_prices)
    profile_mdp_simulation(mdp, "out/experiment3_coordinated.csv")
    mdp = UncoordinatedMDP(fleet, grid, horizon, get_prices_func=deterministic_prices)
    profile_mdp_simulation(mdp, "out/experiment3_uncoordinated.csv")
def search_exe(self):
    Astar()
    # self.path_pub.publish(path)
    MDP()
    QL()
    self.finish_pub.publish(True)
    rospy.sleep(10)
    rospy.signal_shutdown("Finish Simulation")
def __init__(self, num_positions=500, num_orientations=10):
    # TODO: Interface with SLAM algorithm's published map
    # Initialize map.
    rospy.init_node("neato_mdp")  # May break if markov_model is also subscribed...?
    rospy.wait_for_service("static_map")
    static_map = rospy.ServiceProxy("static_map", GetMap)
    # Initialize MDP.
    self.mdp = MDP(num_positions=num_positions,
                   num_orientations=num_orientations,
                   map=static_map().map)
    self.state_idx = None  # Current state idx is unknown.
    self.curr_odom_pose = Pose()
    self.tf_helper = TFHelper()
    # Velocity publisher
    self.cmd_vel_publisher = rospy.Publisher("/cmd_vel", Twist, queue_size=10, latch=True)
    self.odom_subscriber = rospy.Subscriber('/odom', Odometry, self.set_odom)
    self.goal_state = None
    # Visualize robot.
    self.robot_state_pub = rospy.Publisher('/robot_state_marker', Marker, queue_size=10)
    self.robot_state_pose_pub = rospy.Publisher('/robot_state_pose', PoseArray, queue_size=10)
    self.goal_state_pub = rospy.Publisher('/goal_state_marker', Marker, queue_size=10)
    # pose_listener responds to selection of a new approximate robot location
    # (for instance using rviz).
    self.odom_pose = PoseStamped()
    self.odom_pose.header.stamp = rospy.Time(0)
    self.odom_pose.header.frame_id = 'odom'
    # rospy.Subscriber("initialpose", PoseWithCovarianceStamped, self.update_initial_pose)
    rospy.Subscriber("move_base_simple/goal", PoseStamped, self.update_goal_state)
def __init__(self, rows, cols, definitiveness, initstate, terminals, obstacles, gamma=.9):
    self.rows = rows
    self.cols = cols
    self.definitiveness = definitiveness
    self.initstate = initstate
    self.terminals = terminals
    self.obstacles = obstacles
    stateset = set()
    for y in range(1, self.cols + 1):
        for x in range(1, self.rows + 1):
            stateset.add((x, y))
    actionset = {'up', 'down', 'right', 'left'}
    MDP.__init__(self, stateset, actionset, gamma)
def policy_iteration_step():
    data = json.loads(request.data)
    size = data['size']
    state_rewards_list = data['state_rewards_list']
    state_rewards_dict = {tuple(k): v for k, v in state_rewards_list}
    blocked_states_list = [tuple(s) for s in data['blocked_states_list']]
    discount = data['discount']
    values = np.array(data['values'])
    policy = np.array(data['policy'])
    mdp = MDP(state_rewards_dict, blocked_states_list, discount, size, values, policy)
    mdp.values = mdp.evaluate_policy_values()
    table = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    return json.dumps({'table': table,
                       'values': mdp.values.tolist(),
                       'policy': mdp.policy.tolist()})
def run_experiment2(line_bound, horizon, output_file):
    # Increasing number of vehicles.
    ev_s0 = {}
    profit_increase_rate = {1: 1}
    pre_processing_time = {}
    processing_time = {}
    average_reward = {}
    error_reward = {}
    initial_time = time.time()
    grid = Grid.create_tree_grid(high=TREE_HIGH, branch_factor=TREE_BRANCHING_FACTOR,
                                 line_bound=line_bound)
    grid_initialization_time = time.time() - initial_time
    for num_vehicles in range(1, MAX_NUMBER_OF_CARS + 1):
        pos_vehicles = [i % (grid.n_nodes - 1) + 1 for i in range(num_vehicles)]
        fleet = init_ev_fleet(4, pos_vehicles, horizon)
        grid.save_to_dot_file_with_fleet(fleet, "grids/grid_experiment2_fleet{}.dot".format(num_vehicles))
        mdp = MDP(fleet, grid, horizon, get_prices_func=deterministic_prices)
        results = mdp.solve_get_stats()
        print(num_vehicles, results)
        pre_processing_time[num_vehicles] = results["Feasible actions computational time"] + grid_initialization_time
        processing_time[num_vehicles] = results["Optimization time"]
        ev_s0[num_vehicles] = results["Expected value initial state"]
        average_reward[num_vehicles] = results["average_reward"]
        error_reward[num_vehicles] = results["error"]
        if num_vehicles > 1:
            profit_increase_rate[num_vehicles] = (ev_s0[num_vehicles] - ev_s0[num_vehicles - 1]) / ev_s0[1]
    data_frame = pd.DataFrame.from_dict({
        "Expected value": ev_s0,
        "Profit increase rate": profit_increase_rate,
        "Processing time": processing_time,
        "Average reward": average_reward,
        "error_reward": error_reward,
        "Preprocessing time": pre_processing_time,
    })
    data_frame.to_csv(output_file)
def index():
    size = 10
    state_rewards_dict = {(6, 6): 1, (0, 0): 1}
    blocked_states_list = [(2, 3), (1, 3), (0, 3), (4, 8), (5, 8), (6, 8),
                           (5, 2), (6, 2), (7, 2), (8, 2), (8, 3), (8, 4)]
    discount = .9
    mdp = MDP(state_rewards_dict, blocked_states_list, discount, size=size)
    table = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy, mdp.blocked_states_list)
    state_rewards_list = [[list(k), v] for k, v in state_rewards_dict.items()]
    return render_template("index.html",
                           table=table,
                           size=size,
                           state_rewards_list=state_rewards_list,
                           blocked_states_list=[list(s) for s in blocked_states_list],
                           discount=discount,
                           values=mdp.values.tolist(),
                           policy=mdp.policy.tolist())
print print "=== {} ===".format(name) print "non-stationary value function:" print_value_function(V) print print "policy:" print_policy(policy) print print "============================================================" print print # PROBLEM 1 # load MDP debug mdp = MDP() mdp.load_from_file('MDP_debug.txt') # run finite horizon value iteration H = 10 (V, policy) = MDPOptimization.finite_horizon_value_iteration(mdp, H) print_helper(V, policy, "MDP Debug") # PROBLEM 2 # load custom MDP mdp = MDP() mdp.load_from_file('MDP_custom.txt') # run finite horizon value iteration H = 10
def create_mdp_three_directions(width, height):
    """ Create the grid world MDP without a reward function defined. Each
        action can move the agent in one of three directions: the intended
        direction with probability 0.8 and each perpendicular direction with
        probability 0.1, with blocked moves folded back into the remaining
        outcomes at the grid boundary.

        Parameters:
            width -- The width of the grid world.
            height -- The height of the grid world.

        Returns:
            The MDP object with states, actions, and transitions, but no reward.
    """
    mdp = MDP()

    # Create the states.
    for x in range(width):
        for y in range(height):
            mdp.S |= {(x, y)}

    # Create the actions.
    mdp.A = {"n", "s", "e", "w"}

    # Initialize all transition probabilities to zero.
    for s in mdp.S:
        for a in mdp.A:
            for sp in mdp.S:
                mdp.P[(s, a, sp)] = 0.0

    for sx, sy in mdp.S:
        # North.
        if sy > 0:
            mdp.P[((sx, sy), "n", (sx, sy - 1))] = 0.8
            if sx == 0:
                mdp.P[((sx, sy), "n", (sx + 1, sy))] = 0.2
            elif sx == width - 1:
                mdp.P[((sx, sy), "n", (sx - 1, sy))] = 0.2
            else:
                mdp.P[((sx, sy), "n", (sx - 1, sy))] = 0.1
                mdp.P[((sx, sy), "n", (sx + 1, sy))] = 0.1
        else:
            if sx == 0:
                mdp.P[((sx, sy), "n", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "n", (sx + 1, sy))] = 0.1
            elif sx == width - 1:
                mdp.P[((sx, sy), "n", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "n", (sx - 1, sy))] = 0.1
            else:
                mdp.P[((sx, sy), "n", (sx, sy))] = 0.8
                mdp.P[((sx, sy), "n", (sx - 1, sy))] = 0.1
                mdp.P[((sx, sy), "n", (sx + 1, sy))] = 0.1

        # South.
        if sy < height - 1:
            mdp.P[((sx, sy), "s", (sx, sy + 1))] = 0.8
            if sx == 0:
                mdp.P[((sx, sy), "s", (sx + 1, sy))] = 0.2
            elif sx == width - 1:
                mdp.P[((sx, sy), "s", (sx - 1, sy))] = 0.2
            else:
                mdp.P[((sx, sy), "s", (sx - 1, sy))] = 0.1
                mdp.P[((sx, sy), "s", (sx + 1, sy))] = 0.1
        else:
            if sx == 0:
                mdp.P[((sx, sy), "s", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "s", (sx + 1, sy))] = 0.1
            elif sx == width - 1:
                mdp.P[((sx, sy), "s", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "s", (sx - 1, sy))] = 0.1
            else:
                mdp.P[((sx, sy), "s", (sx, sy))] = 0.8
                mdp.P[((sx, sy), "s", (sx - 1, sy))] = 0.1
                mdp.P[((sx, sy), "s", (sx + 1, sy))] = 0.1

        # West.
        if sx > 0:
            mdp.P[((sx, sy), "w", (sx - 1, sy))] = 0.8
            if sy == 0:
                mdp.P[((sx, sy), "w", (sx, sy + 1))] = 0.2
            elif sy == height - 1:
                mdp.P[((sx, sy), "w", (sx, sy - 1))] = 0.2
            else:
                mdp.P[((sx, sy), "w", (sx, sy - 1))] = 0.1
                mdp.P[((sx, sy), "w", (sx, sy + 1))] = 0.1
        else:
            if sy == 0:
                mdp.P[((sx, sy), "w", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "w", (sx, sy + 1))] = 0.1
            elif sy == height - 1:
                mdp.P[((sx, sy), "w", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "w", (sx, sy - 1))] = 0.1
            else:
                mdp.P[((sx, sy), "w", (sx, sy))] = 0.8
                mdp.P[((sx, sy), "w", (sx, sy - 1))] = 0.1
                mdp.P[((sx, sy), "w", (sx, sy + 1))] = 0.1

        # East.
        if sx < width - 1:
            mdp.P[((sx, sy), "e", (sx + 1, sy))] = 0.8
            if sy == 0:
                mdp.P[((sx, sy), "e", (sx, sy + 1))] = 0.2
            elif sy == height - 1:
                mdp.P[((sx, sy), "e", (sx, sy - 1))] = 0.2
            else:
                mdp.P[((sx, sy), "e", (sx, sy - 1))] = 0.1
                mdp.P[((sx, sy), "e", (sx, sy + 1))] = 0.1
        else:
            if sy == 0:
                mdp.P[((sx, sy), "e", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "e", (sx, sy + 1))] = 0.1
            elif sy == height - 1:
                mdp.P[((sx, sy), "e", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "e", (sx, sy - 1))] = 0.1
            else:
                mdp.P[((sx, sy), "e", (sx, sy))] = 0.8
                mdp.P[((sx, sy), "e", (sx, sy - 1))] = 0.1
                mdp.P[((sx, sy), "e", (sx, sy + 1))] = 0.1

    return mdp
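# A hedged verification sketch (an addition, not in the original file): every
# (state, action) row of the dense transition table built above should sum to
# exactly 1, for interior cells, edges, and corners alike.
def verify_three_direction_mdp(width=3, height=3, tol=1e-9):
    mdp = create_mdp_three_directions(width, height)
    for s in mdp.S:
        for a in mdp.A:
            total = sum(mdp.P[(s, a, sp)] for sp in mdp.S)
            assert abs(total - 1.0) < tol, "P(.|{}, {}) sums to {}".format(s, a, total)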
def print_parking_helper(V, policy, mdp, name):
    print("============================================================")
    print()
    print("=== {} ===".format(name))
    print("value function:")
    print_parking_value_function(V, mdp)
    print()
    print("policy:")
    print_parking_policy(policy, mdp)
    print()
    print("============================================================")
    print()
    print()


### PROBLEM 2
mdp = MDP()

# Load MDP1.
mdp.load_from_file('MDP1.txt')
epsilon = 0.000001

# Run infinite-horizon value iteration and policy iteration.
beta = 0.1
(V, policy) = InfiniteHorizonPolicyOptimization.value_iteration(mdp, beta, epsilon)
print_helper(V, policy, "MDP1 value iteration, beta={}, epsilon={}".format(beta, epsilon))
(V, policy) = InfiniteHorizonPolicyOptimization.policy_iteration(mdp, beta)
print_helper(V, policy, "MDP1 policy iteration, beta={}".format(beta))

# Run infinite-horizon value iteration and policy iteration.
beta = 0.9