def part_iii_evaluation(sim_filename):
    print(sim_filename)
    mdp = MDP("blank_2_actions_81_states_mdp.txt")
    results = []
    # prior: seed every (action, state, next_state) transition with a small pseudo-count (0.1)
    # so that no transition starts with zero probability
    transition_count = [[[0.1 for _ in range(81)] for _ in range(81)] for _ in range(2)]
    for n in range(10):
        print("Big loop " + str(n))
        results.append([])
        for i in range(100):
            mdp, transition_count = adp_rl(mdp, Sim(MDP(sim_filename)), transition_count)
        value_fn, policy, iterations = plan(mdp, 0.99, 0.01)
        print("Value: " + str(value_fn))
        print("Policy: " + str(policy))
        # print("Reward: " + str(mdp.rewards))
        # print("Transitions: " + str(mdp.transitions))
        for i in range(100):
            reward = run_policy(Sim(MDP(sim_filename)), policy)
            results[n].append(reward)
        print("Average reward of policy: " + str(average(results[n])))
    for l in results:
        print(average(l))
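# Hedged illustration (not part of the original code): the transition_count tensor above acts as a
# Dirichlet-style pseudo-count prior. A model-based learner such as adp_rl would typically turn
# those counts into transition probabilities by normalising over next states; this sketch assumes
# the layout is [action][state][next_state].
import numpy as np

def counts_to_transition_model(transition_count):
    """Normalise pseudo-counts of shape (actions, states, states) into P(next_state | state, action)."""
    counts = np.asarray(transition_count, dtype=float)
    return counts / counts.sum(axis=2, keepdims=True)

# Example: with a uniform 0.1 prior, every row starts as a uniform distribution over the 81 states.
# model = counts_to_transition_model([[[0.1] * 81] * 81] * 2)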
def update():
    data = json.loads(request.data)
    size = data['size']
    state_rewards_list = data['state_rewards_list']
    state_rewards_dict = {tuple(k): v for k, v in state_rewards_list}
    blocked_states_list = [tuple(s) for s in data['blocked_states_list']]
    discount = data['discount']
    started = data['started']
    values = np.array(data['values'])
    policy = np.array(data['policy'])
    print(blocked_states_list)
    if started:
        mdp = MDP(state_rewards_dict, blocked_states_list, discount, size, values, policy)
    else:
        mdp = MDP(state_rewards_dict, blocked_states_list, discount, size)
    table = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy, mdp.blocked_states_list)
    return json.dumps({'table': table, 'values': mdp.values.tolist(), 'policy': mdp.policy.tolist()})
def value_iteration():
    size = 5
    state_rewards_dict = {(3, 3): 1, (0, 0): 2}
    blocked_states_list = [(2, 3), (2, 4), (2, 2)]
    discount = .9
    mdp = MDP(state_rewards_dict, blocked_states_list, discount, size=size)
    table1 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    mdp.values = mdp.evaluate_values()
    table2 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    mdp.values = mdp.evaluate_values()
    table3 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    mdp.values = mdp.evaluate_values()
    table4 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    mdp.value_iteration()
    table5 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    value_table6 = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy, mdp.blocked_states_list)
    policy_table6 = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy, mdp.blocked_states_list,
                                    show_policy=True)

    size = 10
    state_rewards_dict = {(6, 6): 1, (0, 0): 1}
    blocked_states_list = [(2, 3), (1, 3), (0, 3), (4, 8), (5, 8), (6, 8),
                           (5, 2), (6, 2), (7, 2), (8, 2), (8, 3), (8, 4)]
    discount = .9
    mdp = MDP(state_rewards_dict, blocked_states_list, discount, size=size)
    table = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy, mdp.blocked_states_list)
    state_rewards_list = [[list(k), v] for k, v in state_rewards_dict.items()]
    return render_template("value_iteration.html",
                           table1=table1, table2=table2, table3=table3, table4=table4, table5=table5,
                           value_table6=value_table6, policy_table6=policy_table6,
                           table=table, size=size, state_rewards_list=state_rewards_list,
                           blocked_states_list=[list(s) for s in blocked_states_list],
                           discount=discount, values=mdp.values.tolist(), policy=mdp.policy.tolist())
def recommend_pathway(user_jobs, job_graph, goal_state, min_likelihood_thr):
    """Recommend a pathway, given the sequence of job titles."""
    user_jobs_for_mdp = [user_jobs[0]]
    mdp = MDP(job_graph, user_jobs_for_mdp, goal_state, min_likelihood_thr=min_likelihood_thr)
    return mdp.solve_mdp()
def graph_decay_score(scale, rand=False):
    """
    Generate a graph of the exponential decay score over a range of k.

    :param scale: the limit to which k should vary
    :param rand: whether to use a random policy
    :return: None
    """
    fig = plt.figure()
    x = [i + 1 for i in range(scale)]
    y_decay = []
    for i in x:
        rs = MDP(path='data-mini', k=i)
        if rand:
            rs.initialise_mdp()
            y_decay.append(rs.evaluate_decay_score())
            continue
        rs.load('mdp-model_k=' + str(i) + '.pkl')
        y_decay.append(rs.evaluate_decay_score())
    plt.bar(x, y_decay, width=0.5, color=(0.2, 0.4, 0.6, 0.6))
    xlocs = [i + 1 for i in range(0, 10)]
    for i, v in enumerate(y_decay):
        plt.text(xlocs[i] - 0.46, v + 0.9, '%.2f' % v)
    plt.xticks(x)
    plt.yticks([i for i in range(0, 100, 10)])
    fig.suptitle('Avg Exponential Decay Score vs Number of items in each state')
    plt.xlabel('K')
    plt.ylabel('Score')
    plt.show()
def run(resolution, knn, lookahead, gamma, episodes, render=False):
    env = gym.make("MountainCar-v0")
    env_unwrapped = env.unwrapped
    discretizer = Discretizer(resolution, resolution, knn)
    print(f"Discretizing at resolution {resolution}, knn {knn}.")
    S, A, P, R = discretizer(env_unwrapped, zero_reward_if_done=True)
    mdp = MDP(S, A, P, R, 200, gamma)
    print(f"Running value iteration with lookahead {lookahead}.")
    V, pi, vi_iterations = value_iteration(mdp, lookahead=lookahead)
    steps = []
    for _ in range(episodes):
        observation = env.reset()
        for t in count(1):
            if render:
                env.render()
            d_obs = discretizer.discretize_state(observation)
            action = pi[np.random.choice(list(d_obs.keys()), p=list(d_obs.values()))]
            observation, _, done, _ = env.step(action)
            if done:
                steps.append(t)
                break
    env.close()
    print(f"Average steps over {episodes} episodes: {np.mean(steps)}.")
    return vi_iterations, V, pi, steps
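# Hedged usage sketch: the parameter values below (a 40x40 discretization, 4-nearest-neighbour
# interpolation, one-step lookahead, gamma 0.99, 100 evaluation episodes) are illustrative
# assumptions, not values taken from the original experiments.
if __name__ == "__main__":
    vi_iterations, V, pi, steps = run(resolution=40, knn=4, lookahead=1,
                                      gamma=0.99, episodes=100, render=False)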
def random_vs_sarsa():
    plt.clf()
    # sarsa
    sarsa = np.zeros(num_episodes + 1)
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1, king_moves=True, stochastic_wind=True)
    epsilon, alpha = 0.1, 0.5
    for seed in range(num_trials):
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        sarsa += np.array(episodes)
    sarsa = np.cumsum(sarsa) / num_trials
    # random walk
    random_walk = np.zeros(num_episodes + 1)
    for seed in range(num_trials):
        episodes = a.random_walk(seed, num_episodes)
        random_walk += np.array(episodes)
    random_walk = np.cumsum(random_walk) / num_trials
    plt.clf()
    y = np.arange(num_episodes + 1)
    plt.plot(sarsa, y, label='sarsa')
    plt.plot(random_walk, y, label='random walk')
    plt.xlabel("Time step")
    plt.ylabel("Episodes")
    plt.title("Sarsa(0) agent with 8 moves, stochastic wind")
    plt.grid(True)
    plt.legend()
    # plt.show()
    plt.savefig("plots/random_walk.png")
def task_5():
    plt.clf()
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1, king_moves=False, stochastic_wind=False)
    epsilon, alpha = 0.1, 0.5
    episodes_avg_t = np.zeros((3, num_episodes + 1))
    for seed in range(num_trials):
        # sarsa
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        episodes_avg_t[0, :] += np.cumsum(np.array(episodes))
        # q-learning
        episodes = a.q_learning(seed, num_episodes, epsilon, alpha)
        episodes_avg_t[1, :] += np.cumsum(np.array(episodes))
        # expected sarsa
        episodes = a.expected_sarsa(seed, num_episodes, epsilon, alpha)
        episodes_avg_t[2, :] += np.cumsum(np.array(episodes))
    # normalise
    episodes_avg_t /= num_trials
    # y axis
    y = np.arange(num_episodes + 1)
    plt.title("Comparison of various algos\n4 moves, no stochastic wind")
    plt.plot(episodes_avg_t[0], y, label='Sarsa')
    plt.plot(episodes_avg_t[1], y, label='Q-Learning')
    plt.plot(episodes_avg_t[2], y, label='Expected Sarsa')
    plt.xlabel("Time step")
    plt.ylabel("Episodes")
    plt.grid(True)
    plt.legend()
    # plt.show()
    plt.savefig("plots/t5.png")
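# Hedged sketch (not taken from the MDP class above): the Expected Sarsa variant compared in
# task_5 replaces Sarsa(0)'s sampled next-action value with an expectation under the
# epsilon-greedy policy. This standalone helper shows that target computation.
import numpy as np

def expected_sarsa_target(q_next_row, reward, gamma, epsilon):
    """Return r + gamma * E_pi[Q(s', .)] for an epsilon-greedy policy over q_next_row."""
    n_actions = len(q_next_row)
    probs = np.full(n_actions, epsilon / n_actions)      # exploration mass spread uniformly
    probs[np.argmax(q_next_row)] += 1.0 - epsilon        # remaining mass on the greedy action
    return reward + gamma * float(np.dot(probs, q_next_row))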
def __init__(self, initial, nrows=8, ncols=8, nagents=1, targets=[], obstacles=[],
             moveobstacles=[], regions=dict(), preferred_acts=set()):
    # walls are the obstacles; the edges of the gridworld are included in the walls.
    # region is a string and can be one of: ['pavement', 'gravel', 'grass', 'sand']
    self.current = initial
    self.nrows = nrows
    self.ncols = ncols
    self.nagents = nagents
    self.nstates = nrows * ncols
    self.nactions = 5
    self.regions = regions
    self.actlist = ['N', 'S', 'W', 'E', 'R']
    self.targets = targets
    self.left_edge = []
    self.right_edge = []
    self.top_edge = []
    self.bottom_edge = []
    self.obstacles = obstacles
    self.moveobstacles = moveobstacles
    self.states = range(nrows * ncols)
    self.colorstates = set()
    for x in range(self.nstates):
        # note that edges are not disjoint, so we cannot use elif
        if x % self.ncols == 0:
            self.left_edge.append(x)
        if 0 <= x < self.ncols:
            self.top_edge.append(x)
        if x % self.ncols == self.ncols - 1:
            self.right_edge.append(x)
        if (self.nrows - 1) * self.ncols <= x <= self.nstates:
            self.bottom_edge.append(x)
    self.edges = self.left_edge + self.top_edge + self.right_edge + self.bottom_edge
    self.walls = self.edges + obstacles
    self.prob = {a: np.zeros((self.nstates, self.nstates)) for a in self.actlist}
    self.probOfSuccess = dict([])
    self.getProbRegions()
    for s in self.states:
        for a in self.actlist:
            self.getProbs(s, a)
    transitions = set()
    for s in self.states:
        for a in self.actlist:
            for t in np.nonzero(self.prob[a][s])[0]:
                p = self.prob[a][s][t]
                transitions.add((s, a, t, p))
    self.mdp = MDP(self.states, self.actlist, transitions)
def Mazes_generator(self, batch_size):
    Mazes = []
    for MzIter in range(batch_size):
        [T, R, E] = maze_generator()
        mdp = MDP(T, R, E, self.rl.mdp.discount)
        rlSample = RL(mdp, np.random.normal)
        Mazes.append(rlSample)
    return Mazes
def experiment_low_price_prob(fleet, grid, horizon, price_transition, step_size=0.01):
    results = {}
    simulation_results = {}
    for p in np.arange(0.0, 1 + step_size, step_size):
        mdp = MDP(fleet, grid, horizon, get_prices_func=get_prices,
                  price_transition_func=price_transition(p))
        policy, expected_val = mdp.value_iteration()
        results[p] = expected_val[0][0]
        mdp = MDP(fleet, grid, horizon, get_prices_func=get_prices,
                  price_transition_func=get_history_dependent_price_transition_func(p))
        new_results = mdp.run_simulations(policy=policy, initial_state=0, repetitions=5000)
        simulation_results[p] = new_results["average_reward"]
    return results, simulation_results
def run_experiment3(grid, fleet, horizon):
    # Solve the coordinated and uncoordinated MDPs, run a simulation of each,
    # and write the simulation profiles to CSV files.
    mdp = MDP(fleet, grid, horizon, get_prices_func=deterministic_prices)
    profile_mdp_simulation(mdp, "out/experiment3_coordinated.csv")
    mdp = UncoordinatedMDP(fleet, grid, horizon, get_prices_func=deterministic_prices)
    profile_mdp_simulation(mdp, "out/experiment3_uncoordinated.csv")
def search_exe(self):
    Astar()
    # self.path_pub.publish(path)
    MDP()
    QL()
    self.finish_pub.publish(True)
    rospy.sleep(10)
    rospy.signal_shutdown("Finish Simulation")
def modelBasedRL(self, s0, defaultT, initialR, nEpisodes, nSteps, epsilon=0):
    '''Model-based Reinforcement Learning with epsilon-greedy exploration.
    This function should use value iteration, policy iteration or modified
    policy iteration to update the policy at each step.

    Inputs:
    s0 -- initial state
    defaultT -- default transition function when a state-action pair has not been visited
    initialR -- initial estimate of the reward function
    nEpisodes -- # of episodes (one episode consists of a trajectory of nSteps that starts in s0)
    nSteps -- # of steps per episode
    epsilon -- probability with which an action is chosen at random

    Outputs:
    V -- final value function
    policy -- final policy
    '''
    count_triple = np.ones([self.mdp.nActions, self.mdp.nStates, self.mdp.nStates])
    cumu_reward_lst = np.zeros(nEpisodes)
    V = np.zeros(self.mdp.nStates)
    policy = np.zeros(self.mdp.nStates, int)
    mdp_tmp = MDP(defaultT, initialR, self.mdp.E, self.mdp.discount)
    for iterEp in range(nEpisodes):
        state = s0
        for iterSt in range(nSteps):
            # epsilon-greedy action selection
            if np.random.rand(1) < epsilon:
                action = np.random.randint(self.mdp.nActions)
            else:
                action = policy[state]
            [nextState, reward, done] = self.sampleRewardAndNextState(state, action)
            cumu_reward_lst[iterEp] += self.mdp.discount ** iterSt * reward
            # update the transition and reward estimates from the observed counts
            count_triple[action, state, nextState] += 1
            count_double = np.sum(count_triple[action, state, :])
            mdp_tmp.T[action, state, :] = count_triple[action, state, :] / count_double
            mdp_tmp.R[action, state] = (reward + (count_double - 1) * mdp_tmp.R[action, state]) / count_double
            # re-plan with the updated model
            [policy, V, iterId] = mdp_tmp.policyIteration(policy)
            state = nextState
            if done:
                break
    return [V, policy, cumu_reward_lst]
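# Hedged numeric check (illustration, not part of the original class): the reward update above is
# an incremental mean, R_new = (r + (n - 1) * R_old) / n, where n is the visit count of the
# (action, state) pair after the increment.
def _incremental_mean(old_mean, new_sample, n):
    """Running mean after folding in new_sample as the n-th observation."""
    return (new_sample + (n - 1) * old_mean) / n

assert abs(_incremental_mean(2.0, 5.0, 4) - 2.75) < 1e-12  # (5 + 3 * 2) / 4 = 2.75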
def main(userNum='59945701'):
    agent = MDP(path='data-mini', k=3)  # Create an instance of the MDP class
    agent.initializeMDP()  # Initialize states, actions, probabilities and initial rewards
    rewardEvolution = agent.policyIteration()  # The algorithm that solves the MDP
    recommendation = agent.recommend(userNum)  # Use the recommendation function
    evaluationRS = agent.evaluateRecommendationScore()  # Evaluation score
    evaluationED = agent.evaluateDecayScore()  # Another evaluation score
    return recommendation, evaluationRS, evaluationED, userNum, rewardEvolution
def graph_recommendation_score(scale=4, m=10, with_comparison=False):
    """
    Generate a graph of the recommendation score over a range of m for a set of k.

    :param scale: the limit to which k should vary
    :param m: a parameter in the recommendation score computation
    :param with_comparison: also plot a random policy's graph
    :return: None
    """
    fig = plt.figure()
    k = [i + 1 for i in range(1, scale)]
    x = [i + 1 for i in range(m)]
    for j in k:
        y_recommendation = []
        y_recommendation_rand = []
        rs = MDP(path='data-mini', k=j)
        rs.load('mdp-model_k=' + str(j) + '.pkl')
        for i in x:
            if with_comparison:
                rs.initialise_mdp()
                y_recommendation_rand.append(rs.evaluate_recommendation_score(m=i))
            y_recommendation.append(rs.evaluate_recommendation_score(m=i))
        plt.plot(x, y_recommendation, color=(0.2 + (j - 2) * 0.4, 0.4, 0.6, 0.6),
                 label="MC model " + str(j))
        plt.scatter(x, y_recommendation, color=(0.2 + (j - 2) * 0.4, 0.4, 0.6, 0.6))
        if with_comparison:
            plt.plot(x, y_recommendation_rand, color=(0.2, 0.8, 0.6, 0.6),
                     label="Random model, For m=" + str(m))
            plt.scatter(x, y_recommendation_rand)
    plt.xticks(x)
    plt.yticks([i for i in range(20, 100, 10)])
    for x1, y in zip(x, y_recommendation):
        text = '%.2f' % y
        plt.text(x1, y, text)
    if with_comparison:
        for x1, y in zip(x, y_recommendation_rand):
            text = '%.2f' % y
            plt.text(x1, y, text)
    fig.suptitle('Recommendation Score vs Prediction List size')
    plt.xlabel('Prediction List size')
    plt.ylabel('Score')
    plt.legend()
    plt.show()
def __init__(self, hist_duration, mdp_step, time_step, action_size, batch_size,
             mean, std, hdg0, src_file, sim_time):
    self.mdp = MDP(hist_duration, mdp_step, time_step)
    self.action_size = action_size
    self.agent = PolicyLearner(self.mdp.size, action_size, batch_size)
    self.agent.load(src_file)
    self.wh = wind(mean, std, int(mdp_step / time_step))
    self.hdg0 = hdg0
    self.src = src_file
    self.sim_time = sim_time
def runMDP(self):
    mdp = MDP(self.config)
    print("TESTING")
    while mdp.renameThis():
        # print("Iterate")
        result_policy = mdp.iterate()
        # print("publish")
        # print("Will Continue?", mdp.renameThis())
        print(result_policy)
        self.policies = result_policy
        util.print_2d_map(self.grid)
def task_2():
    # simple 4-move agent
    res = np.zeros(num_episodes + 1)
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1, king_moves=False, stochastic_wind=False)
    epsilon, alpha = 0.1, 0.5
    for seed in range(num_trials):
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        res += np.cumsum(np.array(episodes))
    res /= num_trials
    plot_one(res, 'Sarsa(0), 4 move agent', 't2.png')
    return res
def task_4():
    # stochastic wind
    res = np.zeros(num_episodes + 1)
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1, king_moves=True, stochastic_wind=True)
    epsilon, alpha = 0.1, 0.5
    for seed in range(num_trials):
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        res += np.array(episodes)
    res = np.cumsum(res) / num_trials
    plot_one(res, 'Sarsa(0), 8 move agent, stochastic wind', 't4.png')
    return res
def __init__(self):
    self._states = 1  # default value; adjusted later once the environment is known
    self._mdp = MDP(1)
    self._qvalues = np.zeros((self._states, 4))
    self._vvalues = np.zeros(self._states)
    self._policy = np.ones((self._states, 4)) / 4
    self._learningRate = 0.8
    self._epsilonDecay = -0.005
    self._epsilon = 1.0
    self._epsilonMin = 0.01
    self._epsilonMax = 1.0
    self._count = 0
def part_ii_evaluation():
    random_results_1 = []
    safe_results_1 = []
    range_results_1 = []
    random_results_2 = []
    safe_results_2 = []
    range_results_2 = []
    for i in range(1000):
        print(i)
        random_results_1.append(
            random_policy(Sim(MDP("parking_mdp_linear_rewards_n_10.txt"))))
        random_results_2.append(
            random_policy(Sim(MDP("parking_mdp_quad_rewards_n_10.txt"))))
        safe_results_1.append(
            safe_policy(Sim(MDP("parking_mdp_linear_rewards_n_10.txt")), 0.5))
        safe_results_2.append(
            safe_policy(Sim(MDP("parking_mdp_quad_rewards_n_10.txt")), 0.5))
        range_results_1.append(
            range_policy(Sim(MDP("parking_mdp_linear_rewards_n_10.txt")), 2, 8))
        range_results_2.append(
            range_policy(Sim(MDP("parking_mdp_quad_rewards_n_10.txt")), 2, 6))
    print(average(random_results_1))
    print(average(safe_results_1))
    print(average(range_results_1))
    print(average(random_results_2))
    print(average(safe_results_2))
    print(average(range_results_2))
def policy_iteration_example():
    size = 5
    state_rewards_dict = {(3, 3): 1, (0, 0): 1}
    blocked_states_list = [(2, 3), (2, 4), (2, 2)]
    discount = .9
    np.random.seed(443209)
    policy = np.random.randint(0, 4, size=size**2)
    mdp = MDP(state_rewards_dict, blocked_states_list, discount, size=size, policy=policy)
    value_table1 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    policy_table1 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list, show_policy=True)
    mdp.values = mdp.evaluate_policy_values()
    intermediate_table = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    mdp.policy_evaluation()
    value_table2 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    mdp.policy_improvement()
    policy_table2 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list, show_policy=True)
    mdp.policy_evaluation()
    value_table3 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    mdp.policy_improvement()
    policy_table3 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list, show_policy=True)
    mdp.policy_iteration()
    value_table4 = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy, mdp.blocked_states_list)
    policy_table4 = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy, mdp.blocked_states_list,
                                    show_policy=True)
    state_rewards_list = [[list(k), v] for k, v in state_rewards_dict.items()]
    return render_template("policy_iteration_example.html",
                           value_table1=value_table1, policy_table1=policy_table1,
                           intermediate_table=intermediate_table,
                           value_table2=value_table2, policy_table2=policy_table2,
                           value_table3=value_table3, policy_table3=policy_table3,
                           value_table4=value_table4, policy_table4=policy_table4,
                           size=size, state_rewards_list=state_rewards_list,
                           blocked_states_list=[list(s) for s in blocked_states_list],
                           discount=discount, values=mdp.values.tolist(), policy=mdp.policy.tolist())
def generate_model(self):
    """
    Generate and save the various models.

    :return: None
    """
    # Generate models whose n-gram values change from 1...k
    for i in range(1, self.k + 1):
        # Initialise the MDP
        mm = MDP(path=self.path, alpha=self.alpha, k=i, discount_factor=self.df,
                 verbose=self.verbose, save_path=self.save_path)
        mm.initialise_mdp()
        # Run the policy iteration and save the model
        mm.policy_iteration(max_iteration=1000)
def create_environment_dynamic(n_lines, m_columns, n_walls, sr_plus, sr_less, s_initial, discount):
    World.x = m_columns
    World.y = n_lines
    amb = empty([n_lines, m_columns], dtype=object)
    s = sr_plus.split(sep=".")
    amb[int(s[0]), int(s[1])] = "+1"
    World.specials.append((int(s[0]), int(s[1]), "green", 1))
    s = sr_less.split(sep=".")
    amb[int(s[0]), int(s[1])] = "-1"
    World.specials.append((int(s[0]), int(s[1]), "red", -1))
    s = s_initial.split(sep=".")
    amb[int(s[0]), int(s[1])] = "I"
    World.player = (int(s[0]), int(s[1]))
    World.initial_position = (int(s[0]), int(s[1]))
    while n_walls > 0:
        line = randint(0, n_lines - 1)
        col = randint(0, m_columns - 1)
        if amb[line, col] is None:
            amb[line, col] = "X"
            World.walls.append((line, col))
            n_walls -= 1
    for i in range(n_lines):
        for j in range(m_columns):
            if amb[i, j] is None or amb[i, j] == 'I':
                amb[i, j] = " "
    states = create_states_dynamic(amb, -0.04)
    print(amb)
    up = create_up(states)
    down = create_down(states)
    right = create_right(states)
    left = create_left(states)
    actions = [up, down, left, right]
    World.render_grid()
    World.create_player()
    # World.start_game()
    return MDP(states, actions, discount)
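# Hedged usage sketch (the values are illustrative assumptions): positions are passed as
# "row.column" strings, which create_environment_dynamic() splits on ".". The call is left
# commented out because it renders the World grid as a side effect.
# mdp = create_environment_dynamic(n_lines=4, m_columns=4, n_walls=2,
#                                  sr_plus="0.3", sr_less="1.3", s_initial="3.0", discount=0.9)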
def get_mdp(request):
    data = json.loads(request.data)
    size = data['size']
    state_rewards_list = data['state_rewards_list']
    state_rewards_dict = {tuple(k): v for k, v in state_rewards_list}
    blocked_states_list = [tuple(s) for s in data['blocked_states_list']]
    discount = data['discount']
    values = np.array(data['values'])
    policy = np.array(data['policy'])
    mdp = MDP(state_rewards_dict, blocked_states_list, discount, size, values, policy)
    return mdp
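# Hedged example of the JSON payload get_mdp() expects; the shape is inferred from the parsing
# above, and the concrete numbers are illustrative assumptions for a 5x5 grid.
example_payload = {
    "size": 5,
    "state_rewards_list": [[[3, 3], 1], [[0, 0], 2]],  # becomes {(3, 3): 1, (0, 0): 2}
    "blocked_states_list": [[2, 2], [2, 3], [2, 4]],   # becomes [(2, 2), (2, 3), (2, 4)]
    "discount": 0.9,
    "values": [0.0] * 25,                              # one value per state
    "policy": [0] * 25,                                # one action index per state
}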
def __init__(self):
    self._mdp = MDP()
    self.policy3 = np.ones((16, 4)) / 4
    self._evaluation = Evaluation(mdp=self._mdp, policy=self.policy3)
    self._epsilon = 1.0
    self._epsilonMin = 0.01
    self._epsilonMax = 1.0
    self._epsilonDecay = -0.005
    self.policy2 = np.ones((16, 4)) / 4
    self.count = 0
    print("Rewards:")
    self.print_rewards(4, 4)
    print()
    # initial policy
    print("initial policy:")
    self.print_policy(4, 4)
    self.steps = 0
def create_abstractMDP(mdp, aggregation):
    states = set(aggregation.keys())
    abstrans = dict()
    abstrans.update({(s, a): set() for s in aggregation.keys() for a in mdp.alphabet})
    for absstate in states:
        for s in aggregation[absstate]:
            for a in mdp.available(s):
                for t in mdp.post(s, a):
                    for s2 in aggregation.keys():
                        if t in aggregation[s2]:
                            abstrans[absstate, a].add(s2)
    abstransprobs = set()
    for (s, a) in abstrans.keys():
        for t in abstrans[(s, a)]:
            abstransprobs.add((s, a, t, 1.0 / len(abstrans[(s, a)])))
    absmdp = MDP(states, mdp.alphabet, abstransprobs)
    return absmdp
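# Hedged example (an assumption inferred from the loops above): `aggregation` maps each abstract
# state to the collection of concrete states it groups, and every reachable abstract successor of
# an (abstract state, action) pair receives uniform probability.
example_aggregation = {
    "low": {0, 1, 2},    # concrete states grouped into abstract state "low"
    "high": {3, 4, 5},   # concrete states grouped into abstract state "high"
}
# abstract_mdp = create_abstractMDP(concrete_mdp, example_aggregation)  # concrete_mdp assumed to exist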
def __init__(self, num_positions=500, num_orientations=10):
    # TODO: Interface with SLAM algorithm's published map
    # Initialize map.
    rospy.init_node("neato_mdp")  # May break if markov_model is also subscribed...?
    rospy.wait_for_service("static_map")
    static_map = rospy.ServiceProxy("static_map", GetMap)
    # Initialize MDP
    self.mdp = MDP(num_positions=num_positions, num_orientations=num_orientations, map=static_map().map)
    self.state_idx = None  # Current state idx is unknown.
    self.curr_odom_pose = Pose()
    self.tf_helper = TFHelper()
    # Velocity publisher
    self.cmd_vel_publisher = rospy.Publisher("/cmd_vel", Twist, queue_size=10, latch=True)
    self.odom_subscriber = rospy.Subscriber('/odom', Odometry, self.set_odom)
    self.goal_state = None
    # Visualize robot
    self.robot_state_pub = rospy.Publisher('/robot_state_marker', Marker, queue_size=10)
    self.robot_state_pose_pub = rospy.Publisher('/robot_state_pose', PoseArray, queue_size=10)
    self.goal_state_pub = rospy.Publisher('/goal_state_marker', Marker, queue_size=10)
    # pose_listener responds to selection of a new approximate robot location (for instance using rviz)
    self.odom_pose = PoseStamped()
    self.odom_pose.header.stamp = rospy.Time(0)
    self.odom_pose.header.frame_id = 'odom'
    # rospy.Subscriber("initialpose", PoseWithCovarianceStamped, self.update_initial_pose)
    rospy.Subscriber("move_base_simple/goal", PoseStamped, self.update_goal_state)
def policy_iteration_step():
    data = json.loads(request.data)
    size = data['size']
    state_rewards_list = data['state_rewards_list']
    state_rewards_dict = {tuple(k): v for k, v in state_rewards_list}
    blocked_states_list = [tuple(s) for s in data['blocked_states_list']]
    discount = data['discount']
    values = np.array(data['values'])
    policy = np.array(data['policy'])
    mdp = MDP(state_rewards_dict, blocked_states_list, discount, size, values, policy)
    mdp.values = mdp.evaluate_policy_values()
    table = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    return json.dumps({'table': table, 'values': mdp.values.tolist(), 'policy': mdp.policy.tolist()})