def modified_policy_iteration(mdp: MDP,
                              gamma: float,
                              epsilon: float,
                              k: int = 5) -> Tuple[Dict, Dict]:
    random_a = random.choice(list(mdp.A))
    pi = {s: random_a for s in mdp.S}
    V = {s: 0. for s in mdp.S}
    while True:
        for i in range(k):
            for s in mdp.S:
                V[s] = mdp.R(s, pi[s]) + gamma * sum([
                    mdp.P(s_prime, s, pi[s]) * V[s_prime] for s_prime in mdp.S
                ])
        delta = 0.
        for s in mdp.S:
            V_old = V[s]
            V_new = {
                a: mdp.R(s, a) + gamma *
                sum([mdp.P(s_prime, s, a) * V[s_prime] for s_prime in mdp.S])
                for a in mdp.A
            }
            pi[s] = max(V_new, key=V_new.get)
            V[s] = max(V_new.values())
            delta = max(delta, abs(V[s] - V_old))
        if delta <= epsilon:
            break
    return pi, V
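A minimal usage sketch on a hypothetical two-state problem. The ToyMDP class below is illustrative, not part of the example; it only assumes the interface the function relies on (iterable S and A, callables R(s, a) and P(s_prime, s, a)) plus the random and typing imports omitted above.

class ToyMDP:
    """Hypothetical two-state, two-action chain: action 1 drifts toward state 1, which pays reward 1."""
    S = [0, 1]
    A = [0, 1]

    def R(self, s, a):
        return 1.0 if s == 1 else 0.0

    def P(self, s_prime, s, a):
        # taking action a lands in state a with probability 0.9, in the other state with 0.1
        return 0.9 if s_prime == a else 0.1

pi, V = modified_policy_iteration(ToyMDP(), gamma=0.9, epsilon=1e-6, k=5)
print(pi)  # expected greedy policy: {0: 1, 1: 1}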
Example #2
class Robot():
    def __init__(self):
        rospy.init_node("robot")
        self.config = read_config()
        self.astar_pub = rospy.Publisher("/results/path_list",
                                         AStarPath,
                                         queue_size=10)

        self.sim_complete_pub = rospy.Publisher("/map_node/sim_complete",
                                                Bool,
                                                queue_size=10)

        # publish A* path
        self.path_array = []
        self.publish_astar()

        # publish MDP
        print "running mdp"
        self.mdp = MDP()
        self.mdp.make_policy()

        rospy.sleep(1)
        self.sim_complete_pub.publish(True)
        rospy.sleep(1)
        rospy.signal_shutdown("robot finished")

    def publish_astar(self):
        obj = Astar()
        obj.astar_func(self.path_array)
        for i in range(len(self.path_array)):
            rospy.sleep(1)
            msg = AStarPath()
            msg.data = self.path_array[i]
            self.astar_pub.publish(msg)
Example #3
    def __init__(self, env, gamma=.99):
        grid = EnvMDP.to_grid_matrix(env)
        reward = {}
        states = set()
        self.rows = len(grid)
        self.cols = len(grid[0])
        self.grid = grid

        for x in range(self.cols):
            for y in range(self.rows):
                if grid[y][x] is not None:
                    states.add((x, y))
                    reward[(x, y)] = grid[y][x]

        self.states = states

        terminals = EnvMDP.to_position(env, letter=b'GH')
        actlist = list(range(env.action_space.n))
        transitions = EnvMDP.to_transitions(env)
        init = EnvMDP.to_position(env, letter=b'S')[0]

        MDP.__init__(self,
                     init,
                     actlist=actlist,
                     terminals=terminals,
                     transitions=transitions,
                     reward=reward,
                     states=states,
                     gamma=gamma)
Example #4
def recommend_pathway(user_jobs, job_graph, goal_state, min_likelihood_thr):
    """
    Recommend a pathway, given the sequence of job titles.
    """
    user_jobs_for_mdp = [user_jobs[0]]
    mdp = MDP(job_graph, user_jobs_for_mdp, goal_state, min_likelihood_thr=min_likelihood_thr)
    return mdp.solve_mdp()
Example #5
def update():
    
    data = json.loads(request.data)
    size = data['size']
    state_rewards_list = data['state_rewards_list']
    state_rewards_dict = {tuple(k):v for k,v in state_rewards_list}
    blocked_states_list = [tuple(s) for s in data['blocked_states_list']]
    discount = data['discount']
    started = data['started']
        
    values = np.array(data['values'])
    policy = np.array(data['policy'])
    
    print(blocked_states_list)
    if started:
        mdp = MDP(state_rewards_dict, blocked_states_list,
                     discount, size, values, policy)
    else:
        mdp = MDP(state_rewards_dict, blocked_states_list,
                     discount, size)
    table = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy,
                            mdp.blocked_states_list)
    
    return json.dumps({'table': table, 'values': mdp.values.tolist(),
            'policy': mdp.policy.tolist()})
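For orientation, a hypothetical request body this handler would parse. Only the key names come from the code above; the grid size and array shapes are assumptions (a size x size grid).

payload = {
    "size": 5,
    "state_rewards_list": [[[4, 4], 1.0]],    # [state, reward] pairs; states are cast to tuples
    "blocked_states_list": [[1, 2], [2, 2]],  # states that are cast to blocked tuples
    "discount": 0.9,
    "started": False,                         # when True, the values/policy below seed the MDP
    "values": [[0.0] * 5 for _ in range(5)],
    "policy": [[0] * 5 for _ in range(5)],
}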
def random_vs_sarsa():
    plt.clf()
    # sarsa
    sarsa = np.zeros(num_episodes+1)
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1, king_moves=True, stochastic_wind=True)
    epsilon, alpha = 0.1, 0.5
    for seed in range(num_trials):        
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        sarsa += np.array(episodes)
    sarsa = np.cumsum(sarsa)/num_trials
    # random walk
    random_walk = np.zeros(num_episodes+1)
    for seed in range(num_trials):        
        episodes = a.random_walk(seed, num_episodes)
        random_walk += np.array(episodes)        
    random_walk = np.cumsum(random_walk)/num_trials

    plt.clf()
    y = np.arange(num_episodes+1)
    plt.plot(sarsa, y, label='sarsa')
    plt.plot(random_walk, y, label='random walk')    
    plt.xlabel("Time step")
    plt.ylabel("Episodes")
    plt.title("Sarsa(0) agent with 8 moves, stochastic wind")
    plt.grid(True)
    plt.legend()
    # plt.show()
    plt.savefig("plots/random_walk.png")
def task_5():
    plt.clf()
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1, king_moves=False, stochastic_wind=False)
    epsilon, alpha = 0.1, 0.5
    episodes_avg_t = np.zeros((3, num_episodes+1))

    for seed in range(num_trials):
        # sarsa
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        episodes_avg_t[0, :] += np.cumsum(np.array(episodes))
        # q-learning
        episodes = a.q_learning(seed, num_episodes, epsilon, alpha)
        episodes_avg_t[1, :] += np.cumsum(np.array(episodes))
        # expected sarsa
        episodes = a.expected_sarsa(seed, num_episodes, epsilon, alpha)
        episodes_avg_t[2, :] += np.cumsum(np.array(episodes))
    
    # normalise
    episodes_avg_t /= num_trials
    # y axis
    y = np.arange(num_episodes+1)
    plt.title("Comparison of various algos\n4 moves, no stochastic wind")
    plt.plot(episodes_avg_t[0], y, label='Sarsa')
    plt.plot(episodes_avg_t[1], y, label='Q-Learning')
    plt.plot(episodes_avg_t[2], y, label='Expected Sarsa')
    plt.xlabel("Time step")
    plt.ylabel("Episodes")
    plt.grid(True)
    plt.legend()
    # plt.show()
    plt.savefig("plots/t5.png")
Example #8
def part_iii_evaluation(sim_filename):
    print(sim_filename)
    mdp = MDP("blank_2_actions_81_states_mdp.txt")
    results = []
    # prior: a small pseudo-count (0.1) for every transition
    transition_count = [[[0.1 for _ in range(81)] for _ in range(81)]
                        for _ in range(2)]

    for n in range(10):
        print "Big loop " + str(n)
        results.append([])
        for i in range(100):
            mdp, transition_count = adp_rl(mdp, Sim(MDP(sim_filename)),
                                           transition_count)
        value_fn, policy, iterations = plan(mdp, 0.99, 0.01)
        print "Value: " + str(value_fn)
        print "Policy: " + str(policy)
        #print "Reward: " + str(mdp.rewards)
        #print "Transitions: " + str(mdp.transitions)
        for i in range(100):
            reward = run_policy(Sim(MDP(sim_filename)), policy)
            results[n].append(reward)

        print "Average reward of policy: " + str(average(results[n]))

    for l in results:
        print(average(l))
Example #9
    def modelBasedRL(self,
                     s0,
                     defaultT,
                     initialR,
                     nEpisodes,
                     nSteps,
                     epsilon=0):
        '''Model-based Reinforcement Learning with epsilon greedy 
        exploration.  This function should use value iteration,
        policy iteration or modified policy iteration to update the policy at each step

        Inputs:
        s0 -- initial state
        defaultT -- default transition function when a state-action pair has not been visited
        initialR -- initial estimate of the reward function
        nEpisodes -- # of episodes (one episode consists of a trajectory of nSteps that starts in s0)
        nSteps -- # of steps per episode
        epsilon -- probability with which an action is chosen at random

        Outputs: 
        V -- final value function
        policy -- final policy
        '''

        # initialise the model estimates: transition pseudo-counts of 1 per
        # (a, s, s') triple, per-episode discounted returns, value function and policy
        count_triple = np.ones(
            [self.mdp.nActions, self.mdp.nStates, self.mdp.nStates])
        cumu_reward_lst = np.zeros(nEpisodes)
        V = np.zeros(self.mdp.nStates)
        policy = np.zeros(self.mdp.nStates, int)
        mdp_tmp = MDP(defaultT, initialR, self.mdp.E, self.mdp.discount)

        for iterEp in range(nEpisodes):
            state = s0
            for iterSt in range(nSteps):
                action = 0
                if np.random.rand(1) < epsilon:
                    action = np.random.randint(self.mdp.nActions)
                else:
                    action = policy[state]
                [nextState, reward,
                 done] = self.sampleRewardAndNextState(state, action)
                cumu_reward_lst[iterEp] += self.mdp.discount**iterSt * reward
                count_triple[action, state, nextState] += 1
                count_double = np.sum(count_triple[action, state, :])
                mdp_tmp.T[action,
                          state, :] = count_triple[action,
                                                   state, :] / count_double
                mdp_tmp.R[action,
                          state] = (reward + (count_double - 1) *
                                    mdp_tmp.R[action, state]) / count_double
                [policy, V, iterId] = mdp_tmp.policyIteration(policy)
                state = nextState
                if (done):
                    break
        return [V, policy, cumu_reward_lst]
Example #10
    def __init__(self, grid, goalVals, discount=.99, tau=.01, epsilon=.001):

        MDP.__init__(self, discount=discount, tau=tau, epsilon=epsilon)

        self.goalVals = goalVals
        self.grid = grid

        self.setGridWorld()
        self.valueIteration()
        self.extractPolicy()
Example #12
 def __init__(self, hist_duration, mdp_step, time_step, action_size, batch_size, mean, std, hdg0, src_file,
              sim_time):
     self.mdp = MDP(hist_duration, mdp_step, time_step)
     self.action_size = action_size
     self.agent = PolicyLearner(self.mdp.size, action_size, batch_size)
     self.agent.load(src_file)
     self.wh = wind(mean, std, int(mdp_step / time_step))
     self.hdg0 = hdg0
     self.src = src_file
     self.sim_time = sim_time
Example #13
 def runMDP(self):
     mdp = MDP(self.config)
     print "TESTING"
     while mdp.renameThis():
         #print "Iterate"
         result_policy = mdp.iterate()
         #print "publish"
         #print "Will Continue?", mdp.renameThis()
     print(result_policy)
     self.policies = result_policy
     util.print_2d_map(self.grid)
 def __init__(self):
     self._states = 1  # default value, updated later once the environment is known
     self._mdp = MDP(1)
     self._qvalues = np.zeros((self._states, 4))
     self._vvalues = np.zeros(self._states)
     self._policy = np.ones((self._states, 4)) / 4
     self._learningRate = 0.8
     self._epsilonDecay = -0.005
     self._epsilon = 1.0
     self._epsilonMin = 0.01
     self._epsilonMax = 1.0
     self._count = 0
def task_2():    
    # simple 4 move
    res = np.zeros(num_episodes+1)
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1, king_moves=False, stochastic_wind=False)
    epsilon, alpha = 0.1, 0.5    
    for seed in range(num_trials):        
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        res += np.cumsum(np.array(episodes))
    res /= num_trials
    plot_one(res, 'Sarsa(0), 4 move agent', 't2.png')

    return res
def task_4():
    # stochastic wind
    res = np.zeros(num_episodes+1)
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1, king_moves=True, stochastic_wind=True)
    epsilon, alpha = 0.1, 0.5
    for seed in range(num_trials):        
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        res += np.array(episodes)
    res = np.cumsum(res)/num_trials
    plot_one(res, 'Sarsa(0), 8 move agent, stochastic wind', 't4.png')

    return res
Example #17
    def __init__(self, desc=None, map_name="4x4", slip_chance=0.2):
        if desc is None and map_name is None:
            raise ValueError('Must provide either desc or map_name')
        elif desc is None:
            desc = self.MAPS[map_name]
        assert ''.join(desc).count('S') == 1, "this implementation supports having exactly one initial state"
        assert all(c in "SFHG" for c in ''.join(desc)), "all cells must be either of S, F, H or G"

        self.desc = desc = np.asarray(list(map(list,desc)),dtype='str')
        self.lastaction = None

        nrow, ncol = desc.shape
        states = [(i, j) for i in range(nrow) for j in range(ncol)]
        actions = ["left","down","right","up"]

        initial_state = states[np.array(desc == 'S').ravel().argmax()]

        def move(row, col, movement):
            if movement== 'left':
                col = max(col-1,0)
            elif movement== 'down':
                row = min(row+1,nrow-1)
            elif movement== 'right':
                col = min(col+1,ncol-1)
            elif movement== 'up':
                row = max(row-1,0)
            else:
                raise("invalid action")
            return (row, col)

        transition_probs = {s : {} for s in states}
        rewards = {s : {} for s in states}
        for (row,col) in states:
            if desc[row, col]  in "GH": continue
            for action_i in range(len(actions)):
                action = actions[action_i]
                transition_probs[(row, col)][action] = {}
                rewards[(row, col)][action] = {}
                for movement_i in [(action_i - 1) % len(actions), action_i, (action_i + 1) % len(actions)]:
                    movement = actions[movement_i]
                    newrow, newcol = move(row, col, movement)
                    prob = (1. - slip_chance) if movement == action else (slip_chance / 2.)
                    if prob == 0: continue
                    if (newrow, newcol) not in transition_probs[row,col][action]:
                        transition_probs[row,col][action][newrow, newcol] = prob
                    else:
                        transition_probs[row, col][action][newrow, newcol] += prob
                    if desc[newrow, newcol] == 'G':
                        rewards[row,col][action][newrow, newcol] = 1.0

        MDP.__init__(self, transition_probs, rewards, initial_state)
def graph_decay_score(scale, rand=False):
    """
    Function to generate a graph for the exponential decay score over a range of k
    :param scale: the limit to which k should vary
    :param rand: to use a random policy or not
    :return: None
    """

    fig = plt.figure()
    x = [i + 1 for i in range(scale)]
    y_decay = []
    for i in x:
        rs = MDP(path='data-mini', k=i)

        if rand:
            rs.initialise_mdp()
            y_decay.append(rs.evaluate_decay_score())
            continue

        rs.load('mdp-model_k=' + str(i) + '.pkl')
        y_decay.append(rs.evaluate_decay_score())

    plt.bar(x, y_decay, width=0.5, color=(0.2, 0.4, 0.6, 0.6))
    xlocs = [i + 1 for i in range(0, 10)]
    for i, v in enumerate(y_decay):
        plt.text(xlocs[i] - 0.46, v + 0.9, '%.2f' % v)

    plt.xticks(x)
    plt.yticks([i for i in range(0, 100, 10)])

    fig.suptitle(
        'Avg Exponential Decay Score vs Number of items in each state')
    plt.xlabel('K')
    plt.ylabel('Score')
    plt.show()
Example #19
def run(resolution, knn, lookahead, gamma, episodes, render=False):
    env = gym.make("MountainCar-v0")
    env_unwrapped = env.unwrapped
    discretizer = Discretizer(resolution, resolution, knn)
    print(f"Discretizing at resoluton {resolution}, knn {knn}.")
    S, A, P, R = discretizer(env_unwrapped, zero_reward_if_done=True)
    mdp = MDP(S, A, P, R, 200, gamma)
    print(f"Running value iteration with lookahead {lookahead}.")
    V, pi, vi_iterations = value_iteration(mdp, lookahead=lookahead)
    steps = []
    for _ in range(episodes):
        observation = env.reset()
        for t in count(1):
            if render:
                env.render()
            d_obs = discretizer.discretize_state(observation)
            action = pi[np.random.choice(list(d_obs.keys()),
                                         p=list(d_obs.values()))]
            observation, _, done, _ = env.step(action)
            if done:
                steps.append(t)
                break
    env.close()
    print(f"Average steps over {episodes} episodes: {np.mean(steps)}.")
    return vi_iterations, V, pi, steps
    def __init__(self,
                 initial,
                 nrows=8,
                 ncols=8,
                 nagents=1,
                 targets=[],
                 obstacles=[],
                 moveobstacles=[],
                 regions=dict(),
                 preferred_acts=set()):
        # walls are the obstacles; the edges of the gridworld are also included in the walls.
        # region is a string and can be one of: ['pavement','gravel', 'grass', 'sand']
        self.current = initial
        self.nrows = nrows
        self.ncols = ncols
        self.nagents = nagents
        self.nstates = nrows * ncols
        self.nactions = 5
        self.regions = regions
        self.actlist = ['N', 'S', 'W', 'E', 'R']
        self.targets = targets
        self.left_edge = []
        self.right_edge = []
        self.top_edge = []
        self.bottom_edge = []
        self.obstacles = obstacles
        self.moveobstacles = moveobstacles
        self.states = range(nrows * ncols)
        self.colorstates = set()
        for x in range(self.nstates):
            # note that edges are not disjoint, so we cannot use elif
            if x % self.ncols == 0:
                self.left_edge.append(x)
            if 0 <= x < self.ncols:
                self.top_edge.append(x)
            if x % self.ncols == self.ncols - 1:
                self.right_edge.append(x)
            if (self.nrows - 1) * self.ncols <= x <= self.nstates:
                self.bottom_edge.append(x)
        self.edges = self.left_edge + self.top_edge + self.right_edge + self.bottom_edge
        self.walls = self.edges + obstacles
        self.prob = {
            a: np.zeros((self.nstates, self.nstates))
            for a in self.actlist
        }

        self.probOfSuccess = dict([])
        self.getProbRegions()
        for s in self.states:
            for a in self.actlist:
                self.getProbs(s, a)
        transitions = set()
        for s in self.states:
            for a in self.actlist:
                for t in np.nonzero(self.prob[a][s])[0]:
                    p = self.prob[a][s][t]
                    transitions.add((s, a, t, p))

        self.mdp = MDP(self.states, self.actlist, transitions)
def main(userNum = '59945701'):

    agent = MDP(path = 'data-mini', k = 3) #Create instance for MDP class
    agent.initializeMDP()                  #Initialize States, Actions, Probabilites and initial Rewards
    rewardEvolution = agent.policyIteration()                # The algorithm that solves MDP 
    recommendation = agent.recommend(userNum)  #Use recommendation function 
    evaluationRS = agent.evaluateRecommendationScore() #Evaluation score
    evaluationED = agent.evaluateDecayScore()           #Another evaluation score
    return recommendation, evaluationRS, evaluationED, userNum, rewardEvolution
Example #22
 def Mazes_generator(self,batch_size):
     Mazes = []
     for MzIter in range(batch_size):
         [T,R,E] = maze_generator()
         mdp = MDP(T,R,E,self.rl.mdp.discount)
         rlSample = RL(mdp,np.random.normal)
         Mazes.append(rlSample)
     return Mazes
def async_value_iteration(mdp: MDP,
                          gamma: float,
                          num_iterations: int = 1000) -> Tuple[Dict, Dict]:
    Q = {(s, a): 0. for a in mdp.A for s in mdp.S}
    for i in range(num_iterations):
        s = random.choice(list(mdp.S))
        a = random.choice(list(mdp.A))
        Q[(s, a)] = mdp.R(s, a) + gamma * sum([
            mdp.P(s_prime, s, a) *
            max([Q[(s_prime, a_prime)] for a_prime in mdp.A])
            for s_prime in mdp.S
        ])
    pi = {}
    for s in mdp.S:
        values = {a: Q[(s, a)] for a in mdp.A}
        pi[s] = max(values, key=values.get)
    return pi, Q
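The same hypothetical ToyMDP sketched after the first example satisfies this interface too (iterable S and A, callables R and P), so a quick smoke test looks like:

pi, Q = async_value_iteration(ToyMDP(), gamma=0.9, num_iterations=5000)
print(pi)  # with enough random backups this should also settle on {0: 1, 1: 1}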
def graph_recommendation_score(scale=4, m=10, with_comparison=False):
    """
    Function to generate a graph for the recommendation score over a range of m for a set of k
    :param scale: the limit to which k should vary
    :param m: a parameter in recommendation score computation
    :param with_comparison: plot a random policy's graph
    :return: None
    """

    fig = plt.figure()
    k = [i + 1 for i in range(1, scale)]
    x = [i + 1 for i in range(m)]
    for j in k:
        y_recommendation = []
        y_recommendation_rand = []
        rs = MDP(path='data-mini', k=j)
        rs.load('mdp-model_k=' + str(j) + '.pkl')
        for i in x:
            if with_comparison:
                rs.initialise_mdp()
                y_recommendation_rand.append(
                    rs.evaluate_recommendation_score(m=i))
            y_recommendation.append(rs.evaluate_recommendation_score(m=i))

        plt.plot(x,
                 y_recommendation,
                 color=(0.2 + (j - 2) * 0.4, 0.4, 0.6, 0.6),
                 label="MC model " + str(j))
        plt.scatter(x,
                    y_recommendation,
                    color=(0.2 + (j - 2) * 0.4, 0.4, 0.6, 0.6))

        if with_comparison:
            plt.plot(x,
                     y_recommendation_rand,
                     color=(0.2, 0.8, 0.6, 0.6),
                     label="Random model, For m=" + str(m))
            plt.scatter(x, y_recommendation_rand)

        plt.xticks(x)
        plt.yticks([i for i in range(20, 100, 10)])

        for x1, y in zip(x, y_recommendation):
            text = '%.2f' % y
            plt.text(x1, y, text)

        if with_comparison:
            for x1, y in zip(x, y_recommendation_rand):
                text = '%.2f' % y
                plt.text(x1, y, text)

    fig.suptitle('Recommendation Score vs Prediction List size')
    plt.xlabel('Prediction List size')
    plt.ylabel('Score')
    plt.legend()
    plt.show()
Example #25
    def beginSimulation(self):
        result_astar = astar(self.config)
        self.publishAStar(result_astar)

        mdp = MDP(self.config)

        # INPUT A GRID TODO
        print("TESTING")
        while mdp.renameThis():
            print("Iterate")
            result_policy = mdp.iterate()
            print("publish")
            self.resultsPolicyPub.publish(result_policy)
            print("Will Continue?", mdp.renameThis())

        self.simulationCompletePub.publish(True)
        rospy.sleep(10)
        rospy.signal_shutdown("Simulation has Completed")
Example #26
def run_experiment3(grid, fleet, horizon):
    # solve mdp and run a simulation
    # returns profile of the simulation

    mdp = MDP(fleet, grid, horizon, get_prices_func=deterministic_prices)
    profile_mdp_simulation(mdp, "out/experiment3_coordinated.csv")

    mdp = UncoordinatedMDP(fleet, grid, horizon, get_prices_func=deterministic_prices)
    profile_mdp_simulation(mdp, "out/experiment3_uncoordinated.csv")
Example #27
    def search_exe(self):

        Astar()
        #self.path_pub.publish(path)
        MDP()
        QL()
        self.finish_pub.publish(True)
        rospy.sleep(10)
        rospy.signal_shutdown("Finish Simulation")
Example #28
    def __init__(self, num_positions=500, num_orientations=10):
        # TODO: Interface with SLAM algorithm's published map
        # Initialize map.
        rospy.init_node(
            "neato_mdp")  # May break if markov_model is also subscribed...?
        rospy.wait_for_service("static_map")
        static_map = rospy.ServiceProxy("static_map", GetMap)
        # Initialize MDP
        self.mdp = MDP(num_positions=num_positions,
                       num_orientations=num_orientations,
                       map=static_map().map)
        self.state_idx = None  # Current state idx is unknown.
        self.curr_odom_pose = Pose()
        self.tf_helper = TFHelper()
        # Velocity publisher
        self.cmd_vel_publisher = rospy.Publisher("/cmd_vel",
                                                 Twist,
                                                 queue_size=10,
                                                 latch=True)
        self.odom_subscriber = rospy.Subscriber('/odom', Odometry,
                                                self.set_odom)
        self.goal_state = None
        # Visualize robot
        self.robot_state_pub = rospy.Publisher('/robot_state_marker',
                                               Marker,
                                               queue_size=10)
        self.robot_state_pose_pub = rospy.Publisher('/robot_state_pose',
                                                    PoseArray,
                                                    queue_size=10)
        self.goal_state_pub = rospy.Publisher('/goal_state_marker',
                                              Marker,
                                              queue_size=10)
        # # pose_listener responds to selection of a new approximate robot location (for instance using rviz)
        #
        self.odom_pose = PoseStamped()
        self.odom_pose.header.stamp = rospy.Time(0)
        self.odom_pose.header.frame_id = 'odom'
        #
        rospy.Subscriber("initialpose", PoseWithCovarianceStamped,
                         self.update_initial_pose)

        rospy.Subscriber("move_base_simple/goal", PoseStamped,
                         self.update_goal_state)
Example #29
 def __init__(self,
              rows,
              cols,
              definitiveness,
              initstate,
              terminals,
              obstacles,
              gamma=.9):
     self.rows = rows
     self.cols = cols
     self.definitiveness = definitiveness
     self.initstate = initstate
     self.terminals = terminals
     self.obstacles = obstacles
     stateset = set()
     for y in range(1, self.cols + 1):
         for x in range(1, self.rows + 1):
             stateset.add((x, y))
     actionset = {'up', 'down', 'right', 'left'}
     MDP.__init__(self, stateset, actionset, gamma)
Example #30
def policy_iteration_step():
        
    data = json.loads(request.data)
    size = data['size']
    state_rewards_list = data['state_rewards_list']
    state_rewards_dict = {tuple(k):v for k,v in state_rewards_list}
    blocked_states_list = [tuple(s) for s in data['blocked_states_list']]
    discount = data['discount']
    
    values = np.array(data['values'])
    policy = np.array(data['policy'])
    
    mdp = MDP(state_rewards_dict, blocked_states_list,
                     discount, size, values, policy)
    mdp.values = mdp.evaluate_policy_values()
    
    table = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    
    return json.dumps({'table': table, 'values': mdp.values.tolist(),
            'policy': mdp.policy.tolist()})
Example #31
def run_experiment2(line_bound, horizon, output_file):
    # increasing number of vehicles
    ev_s0 = {}
    profit_increase_rate = {1: 1}
    pre_processing_time = {}
    processing_time = {}
    average_reward = {}
    error_reward = {}

    initial_time = time.time()
    grid = Grid.create_tree_grid(high=TREE_HIGH, branch_factor=TREE_BRANCHING_FACTOR, line_bound=line_bound)
    grid_initialization_time = time.time() - initial_time

    for num_vehicles in range(1, MAX_NUMBER_OF_CARS + 1):
        pos_vehicles = [i % (grid.n_nodes - 1) + 1 for i in range(num_vehicles)]
        fleet = init_ev_fleet(4, pos_vehicles, horizon)
        grid.save_to_dot_file_with_fleet(fleet, "grids/grid_experiment2_fleet{}.dot".format(num_vehicles))

        mdp = MDP(fleet, grid, horizon, get_prices_func=deterministic_prices)
        results = mdp.solve_get_stats()

        print(num_vehicles, results)

        pre_processing_time[num_vehicles] = results["Feasible actions computational time"] + grid_initialization_time
        processing_time[num_vehicles] = results["Optimization time"]
        ev_s0[num_vehicles] = results["Expected value initial state"]
        average_reward[num_vehicles] = results["average_reward"]
        error_reward[num_vehicles] = results["error"]
        if num_vehicles > 1:
            profit_increase_rate[num_vehicles] = (ev_s0[num_vehicles] - ev_s0[num_vehicles - 1]) / ev_s0[1]

        data_frame = pd.DataFrame.from_dict(
            {"Expected value": ev_s0,
             "Profit increase rate": profit_increase_rate,
             "Processing time": processing_time,
             "Average reward": average_reward,
             "error_reward": error_reward,
             "Preprocessing time": pre_processing_time
             }
        )
        data_frame.to_csv(output_file)
Example #32
def index():
    
    
    size = 10
    state_rewards_dict = {(6,6):1, (0,0):1}
    blocked_states_list = [(2, 3), (1, 3), (0, 3), (4, 8), (5, 8), (6, 8),
                           (5, 2), (6, 2), (7, 2), (8, 2), (8, 3), (8, 4)]
    discount=.9
    mdp = MDP(state_rewards_dict, blocked_states_list,
                     discount, size=size)
        
    table = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy, mdp.blocked_states_list)
    state_rewards_list = [[list(k),v] for k,v in state_rewards_dict.items()]
    return render_template("index.html",
                table=table,
                size=size,
                state_rewards_list=state_rewards_list,
                blocked_states_list=[list(s) for s in blocked_states_list],
                discount=discount,
                values=mdp.values.tolist(),
                policy=mdp.policy.tolist())
Example #33
def print_helper(V, policy, name):
    print("============================================================")
    print()
    print("=== {} ===".format(name))
    print("non-stationary value function:")
    print_value_function(V)
    print()
    print("policy:")
    print_policy(policy)
    print()
    print("============================================================")
    print()
    print()

# PROBLEM 1

# load MDP debug
mdp = MDP()
mdp.load_from_file('MDP_debug.txt')

# run finite horizon value iteration
H = 10
(V, policy) = MDPOptimization.finite_horizon_value_iteration(mdp, H)
print_helper(V, policy, "MDP Debug")

# PROBLEM 2

# load custom MDP
mdp = MDP()
mdp.load_from_file('MDP_custom.txt')

# run finite horizon value iteration
H = 10
Example #34
def create_mdp_three_directions(width, height):
    """ Create the grid world MDP without a reward function defined. This has 3 directions
        of movement possible.

        Parameters:
            width -- The width of the grid world.
            height -- The height of the grid world.

        Returns:
            The MDP object with states, actions, and transitions, but no reward.
    """

    mdp = MDP()

    # Create the states.
    for x in range(width):
        for y in range(height):
            mdp.S |= {(x, y)}

    # Create the actions.
    mdp.A = {"n", "s", "e", "w"}

    # Create the transition probabilities.
    for s in mdp.S:
        for a in mdp.A:
            for sp in mdp.S:
                mdp.P[(s, a, sp)] = 0.0

    for sx, sy in mdp.S:
        if sy > 0:
            mdp.P[((sx, sy), "n", (sx, sy - 1))] = 0.8
            if sx == 0:
                mdp.P[((sx, sy), "n", (sx + 1, sy))] = 0.2
            elif sx == width - 1:
                mdp.P[((sx, sy), "n", (sx - 1, sy))] = 0.2
            else:
                mdp.P[((sx, sy), "n", (sx - 1, sy))] = 0.1
                mdp.P[((sx, sy), "n", (sx + 1, sy))] = 0.1
        else:
            if sx == 0:
                mdp.P[((sx, sy), "n", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "n", (sx + 1, sy))] = 0.1
            elif sx == width - 1:
                mdp.P[((sx, sy), "n", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "n", (sx - 1, sy))] = 0.1
            else:
                mdp.P[((sx, sy), "n", (sx, sy))] = 0.8
                mdp.P[((sx, sy), "n", (sx - 1, sy))] = 0.1
                mdp.P[((sx, sy), "n", (sx + 1, sy))] = 0.1

        if sy < height - 1:
            mdp.P[((sx, sy), "s", (sx, sy + 1))] = 0.8
            if sx == 0:
                mdp.P[((sx, sy), "s", (sx + 1, sy))] = 0.2
            elif sx == width - 1:
                mdp.P[((sx, sy), "s", (sx - 1, sy))] = 0.2
            else:
                mdp.P[((sx, sy), "s", (sx - 1, sy))] = 0.1
                mdp.P[((sx, sy), "s", (sx + 1, sy))] = 0.1
        else:
            if sx == 0:
                mdp.P[((sx, sy), "s", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "s", (sx + 1, sy))] = 0.1
            elif sx == width - 1:
                mdp.P[((sx, sy), "s", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "s", (sx - 1, sy))] = 0.1
            else:
                mdp.P[((sx, sy), "s", (sx, sy))] = 0.8
                mdp.P[((sx, sy), "s", (sx - 1, sy))] = 0.1
                mdp.P[((sx, sy), "s", (sx + 1, sy))] = 0.1

        if sx > 0:
            mdp.P[((sx, sy), "w", (sx - 1, sy))] = 0.8
            if sy == 0:
                mdp.P[((sx, sy), "w", (sx, sy + 1))] = 0.2
            elif sy == height - 1:
                mdp.P[((sx, sy), "w", (sx, sy - 1))] = 0.2
            else:
                mdp.P[((sx, sy), "w", (sx, sy - 1))] = 0.1
                mdp.P[((sx, sy), "w", (sx, sy + 1))] = 0.1
        else:
            if sy == 0:
                mdp.P[((sx, sy), "w", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "w", (sx, sy + 1))] = 0.1
            elif sy == height - 1:
                mdp.P[((sx, sy), "w", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "w", (sx, sy - 1))] = 0.1
            else:
                mdp.P[((sx, sy), "w", (sx, sy))] = 0.8
                mdp.P[((sx, sy), "w", (sx, sy - 1))] = 0.1
                mdp.P[((sx, sy), "w", (sx, sy + 1))] = 0.1

        if sx < width - 1:
            mdp.P[((sx, sy), "e", (sx + 1, sy))] = 0.8
            if sy == 0:
                mdp.P[((sx, sy), "e", (sx, sy + 1))] = 0.2
            elif sy == height - 1:
                mdp.P[((sx, sy), "e", (sx, sy - 1))] = 0.2
            else:
                mdp.P[((sx, sy), "e", (sx, sy - 1))] = 0.1
                mdp.P[((sx, sy), "e", (sx, sy + 1))] = 0.1
        else:
            if sy == 0:
                mdp.P[((sx, sy), "e", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "e", (sx, sy + 1))] = 0.1
            elif sy == height - 1:
                mdp.P[((sx, sy), "e", (sx, sy))] = 0.9
                mdp.P[((sx, sy), "e", (sx, sy - 1))] = 0.1
            else:
                mdp.P[((sx, sy), "e", (sx, sy))] = 0.8
                mdp.P[((sx, sy), "e", (sx, sy - 1))] = 0.1
                mdp.P[((sx, sy), "e", (sx, sy + 1))] = 0.1

    return mdp
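A small sanity-check sketch, relying only on the mdp.S, mdp.A and mdp.P fields populated above, that confirms every (state, action) row of the transition table sums to 1:

mdp = create_mdp_three_directions(4, 3)
for s in mdp.S:
    for a in mdp.A:
        total = sum(mdp.P[(s, a, sp)] for sp in mdp.S)
        assert abs(total - 1.0) < 1e-9, (s, a, total)
print("all transition rows sum to 1")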
Example #35
def print_parking_helper(V, policy, mdp, name):
    print "============================================================"
    print
    print "=== {} ===".format(name)
    print "value function:"
    print_parking_value_function(V, mdp)
    print
    print "policy:"
    print_parking_policy(policy, mdp)
    print
    print "============================================================"
    print
    print

### PROBLEM 2
mdp = MDP()

# load MDP1
mdp.load_from_file('MDP1.txt')

epsilon = 0.000001

# run infinite horizon value iteration and policy iteration
beta = 0.1
(V, policy) = InfiniteHorizonPolicyOptimization.value_iteration(mdp, beta, epsilon)
print_helper(V, policy, "MDP1 value iteration, beta={}, epsilon={}".format(beta, epsilon))
(V, policy) = InfiniteHorizonPolicyOptimization.policy_iteration(mdp, beta)
print_helper(V, policy, "MDP1 policy iteration, beta={}".format(beta))

# run infinite horizon value iteration and policy iteration
beta = 0.9