Code example #1
File: CS533-4.py  Project: sharpau/CS533-4
def part_iii_evaluation(sim_filename):
    print(sim_filename)
    mdp = MDP("blank_2_actions_81_states_mdp.txt")
    results = []
    # prior: pseudocount of 0.1 for every transition (Dirichlet-style smoothing)
    transition_count = [[[0.1 for _ in range(81)] for _ in range(81)]
                        for _ in range(2)]

    for n in range(10):
        print "Big loop " + str(n)
        results.append([])
        for i in range(100):
            mdp, transition_count = adp_rl(mdp, Sim(MDP(sim_filename)),
                                           transition_count)
        value_fn, policy, iterations = plan(mdp, 0.99, 0.01)
        print "Value: " + str(value_fn)
        print "Policy: " + str(policy)
        #print "Reward: " + str(mdp.rewards)
        #print "Transitions: " + str(mdp.transitions)
        for i in range(100):
            reward = run_policy(Sim(MDP(sim_filename)), policy)
            results[n].append(reward)

        print "Average reward of policy: " + str(average(results[n]))

    for l in results:
        print(average(l))
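
The helpers adp_rl, plan and run_policy come from the rest of CS533-4.py and are not shown here. As a minimal sketch of the model-estimation step that adp_rl presumably performs with the pseudocount table above (the function name and exact behaviour are assumptions, not project code):

import numpy as np

def estimate_transition_probs(transition_count):
    # Sketch only: normalize the (action, state, next_state) pseudocounts into a
    # transition model; the 0.1 prior acts as Dirichlet-style smoothing.
    counts = np.asarray(transition_count, dtype=float)   # shape (2, 81, 81)
    return counts / counts.sum(axis=2, keepdims=True)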
Code example #2
File: app.py  Project: bmontambault/mdp
def update():
    
    data = json.loads(request.data)
    size = data['size']
    state_rewards_list = data['state_rewards_list']
    state_rewards_dict = {tuple(k):v for k,v in state_rewards_list}
    blocked_states_list = [tuple(s) for s in data['blocked_states_list']]
    discount = data['discount']
    started = data['started']
        
    values = np.array(data['values'])
    policy = np.array(data['policy'])
    
    print(blocked_states_list)
    if started:
        mdp = MDP(state_rewards_dict, blocked_states_list,
                     discount, size, values, policy)
    else:
        mdp = MDP(state_rewards_dict, blocked_states_list,
                     discount, size)
    table = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy,
                            mdp.blocked_states_list)
    
    return json.dumps({'table': table, 'values': mdp.values.tolist(),
            'policy': mdp.policy.tolist()})
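
For orientation, here is a sketch of the JSON payload this handler expects, inferred from the keys it reads; the route path and port are assumptions, since the @app.route decorator is not part of the excerpt:

import json
import requests  # hypothetical client for exercising the handler above

payload = {
    "size": 3,
    "state_rewards_list": [[[2, 2], 1.0]],  # [state, reward] pairs; states become tuples server-side
    "blocked_states_list": [[1, 1]],
    "discount": 0.9,
    "started": False,                       # False: the MDP re-initialises values/policy itself
    "values": [0.0] * 9,
    "policy": [0] * 9,
}
resp = requests.post("http://localhost:5000/update", data=json.dumps(payload))  # assumed route
print(resp.json()["values"])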
Code example #3
File: app.py  Project: bmontambault/mdp
def value_iteration():
    
    size = 5
    state_rewards_dict = {(3,3):1, (0,0):2}
    blocked_states_list = [(2,3), (2,4), (2,2)]
    discount=.9
    mdp = MDP(state_rewards_dict, blocked_states_list,
                     discount, size=size)
        
    table1 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    
    mdp.values = mdp.evaluate_values()
    table2 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    
    mdp.values = mdp.evaluate_values()
    table3 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    
    
    mdp.values = mdp.evaluate_values()
    table4 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    
    
    mdp.value_iteration()
    table5 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    value_table6 = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy, mdp.blocked_states_list)
    policy_table6 = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy, mdp.blocked_states_list,
                                    show_policy=True)
            
    size = 10
    state_rewards_dict = {(6,6):1, (0,0):1}
    blocked_states_list = [(2, 3), (1, 3), (0, 3), (4, 8), (5, 8), (6, 8),
                           (5, 2), (6, 2), (7, 2), (8, 2), (8, 3), (8, 4)]
    discount=.9
    mdp = MDP(state_rewards_dict, blocked_states_list,
                     discount, size=size)
    
    table = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy, mdp.blocked_states_list)
    state_rewards_list = [[list(k),v] for k,v in state_rewards_dict.items()]
    return render_template("value_iteration.html",
                table1=table1,
                table2=table2,
                table3=table3,
                table4=table4,
                table5=table5,
                value_table6=value_table6,
                policy_table6=policy_table6,
                table=table,
                size=size,
                state_rewards_list=state_rewards_list,
                blocked_states_list=[list(s) for s in blocked_states_list],
                discount=discount,
                values=mdp.values.tolist(),
                policy=mdp.policy.tolist())
Code example #4
def recommend_pathway(user_jobs, job_graph, goal_state, min_likelihood_thr):
    """
    Recommend a pathway, given the sequence of job titles.
    """
    user_jobs_for_mdp = [user_jobs[0]]
    mdp = MDP(job_graph, user_jobs_for_mdp, goal_state, min_likelihood_thr=min_likelihood_thr)
    return mdp.solve_mdp()
Code example #5
def graph_decay_score(scale, rand=False):
    """
    Function to generate a graph for the exponential decay score over a range of k
    :param scale: the limit to which k should vary
    :param rand: to use a random policy or not
    :return: None
    """

    fig = plt.figure()
    x = [i + 1 for i in range(scale)]
    y_decay = []
    for i in x:
        rs = MDP(path='data-mini', k=i)

        if rand:
            rs.initialise_mdp()
            y_decay.append(rs.evaluate_decay_score())
            continue

        rs.load('mdp-model_k=' + str(i) + '.pkl')
        y_decay.append(rs.evaluate_decay_score())

    plt.bar(x, y_decay, width=0.5, color=(0.2, 0.4, 0.6, 0.6))
    xlocs = [i + 1 for i in range(scale)]  # one label position per bar
    for i, v in enumerate(y_decay):
        plt.text(xlocs[i] - 0.46, v + 0.9, '%.2f' % v)

    plt.xticks(x)
    plt.yticks([i for i in range(0, 100, 10)])

    fig.suptitle(
        'Avg Exponential Decay Score vs Number of items in each state')
    plt.xlabel('K')
    plt.ylabel('Score')
    plt.show()
Code example #6
def run(resolution, knn, lookahead, gamma, episodes, render=False):
    env = gym.make("MountainCar-v0")
    env_unwrapped = env.unwrapped
    discretizer = Discretizer(resolution, resolution, knn)
    print(f"Discretizing at resoluton {resolution}, knn {knn}.")
    S, A, P, R = discretizer(env_unwrapped, zero_reward_if_done=True)
    mdp = MDP(S, A, P, R, 200, gamma)
    print(f"Running value iteration with lookahead {lookahead}.")
    V, pi, vi_iterations = value_iteration(mdp, lookahead=lookahead)
    steps = []
    for _ in range(episodes):
        observation = env.reset()
        for t in count(1):
            if render:
                env.render()
            d_obs = discretizer.discretize_state(observation)
            action = pi[np.random.choice(list(d_obs.keys()),
                                         p=list(d_obs.values()))]
            observation, _, done, _ = env.step(action)
            if done:
                steps.append(t)
                break
    env.close()
    print(f"Average steps over {episodes} episodes: {np.mean(steps)}.")
    return vi_iterations, V, pi, steps
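
A possible invocation of run(); the parameter values below are illustrative assumptions, not project defaults. The episode cap of 200 passed to MDP above matches MountainCar-v0's default time limit.

# Illustrative call with assumed settings; adjust resolution/knn/lookahead as needed.
vi_iterations, V, pi, steps = run(resolution=40, knn=4, lookahead=1,
                                  gamma=0.99, episodes=10, render=False)
print(f"Value iteration sweeps: {vi_iterations}, best episode: {min(steps)} steps")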
Code example #7
def random_vs_sarsa():
    plt.clf()
    # sarsa
    sarsa = np.zeros(num_episodes+1)
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1, king_moves=True, stochastic_wind=True)
    epsilon, alpha = 0.1, 0.5
    for seed in range(num_trials):        
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        sarsa += np.array(episodes)
    sarsa = np.cumsum(sarsa)/num_trials
    # random walk
    random_walk = np.zeros(num_episodes+1)
    for seed in range(num_trials):        
        episodes = a.random_walk(seed, num_episodes)
        random_walk += np.array(episodes)        
    random_walk = np.cumsum(random_walk)/num_trials

    plt.clf()
    y = np.arange(num_episodes+1)
    plt.plot(sarsa, y, label='sarsa')
    plt.plot(random_walk, y, label='random walk')    
    plt.xlabel("Time step")
    plt.ylabel("Episodes")
    plt.title("Sarsa(0) agent with 8 moves, stochastic wind")
    plt.grid(True)
    plt.legend()
    # plt.show()
    plt.savefig("plots/random_walk.png")
Code example #8
def task_5():
    plt.clf()
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1, king_moves=False, stochastic_wind=False)
    epsilon, alpha = 0.1, 0.5
    episodes_avg_t = np.zeros((3, num_episodes+1))

    for seed in range(num_trials):
        # sarsa
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        episodes_avg_t[0, :] += np.cumsum(np.array(episodes))
        # q-learning
        episodes = a.q_learning(seed, num_episodes, epsilon, alpha)
        episodes_avg_t[1, :] += np.cumsum(np.array(episodes))
        # expected sarsa
        episodes = a.expected_sarsa(seed, num_episodes, epsilon, alpha)
        episodes_avg_t[2, :] += np.cumsum(np.array(episodes))
    
    # normalise
    episodes_avg_t /= num_trials
    # y axis
    y = np.arange(num_episodes+1)
    plt.title("Comparison of various algos\n4 moves, no stochastic wind")
    plt.plot(episodes_avg_t[0], y, label='Sarsa')
    plt.plot(episodes_avg_t[1], y, label='Q-Learning')
    plt.plot(episodes_avg_t[2], y, label='Expected Sarsa')
    plt.xlabel("Time step")
    plt.ylabel("Episodes")
    plt.grid(True)
    plt.legend()
    # plt.show()
    plt.savefig("plots/t5.png")
Code example #9
    def __init__(self,
                 initial,
                 nrows=8,
                 ncols=8,
                 nagents=1,
                 targets=[],
                 obstacles=[],
                 moveobstacles=[],
                 regions=dict(),
                 preferred_acts=set()):
        # walls are the obstacles. The edges of the gridworld will be included into the walls.
        # region is a string and can be one of: ['pavement','gravel', 'grass', 'sand']
        self.current = initial
        self.nrows = nrows
        self.ncols = ncols
        self.nagents = nagents
        self.nstates = nrows * ncols
        self.nactions = 5
        self.regions = regions
        self.actlist = ['N', 'S', 'W', 'E', 'R']
        self.targets = targets
        self.left_edge = []
        self.right_edge = []
        self.top_edge = []
        self.bottom_edge = []
        self.obstacles = obstacles
        self.moveobstacles = moveobstacles
        self.states = range(nrows * ncols)
        self.colorstates = set()
        for x in range(self.nstates):
            # note that edges are not disjoint, so we cannot use elif
            if x % self.ncols == 0:
                self.left_edge.append(x)
            if 0 <= x < self.ncols:
                self.top_edge.append(x)
            if x % self.ncols == self.ncols - 1:
                self.right_edge.append(x)
            if (self.nrows - 1) * self.ncols <= x < self.nstates:
                self.bottom_edge.append(x)
        self.edges = self.left_edge + self.top_edge + self.right_edge + self.bottom_edge
        self.walls = self.edges + obstacles
        self.prob = {
            a: np.zeros((self.nstates, self.nstates))
            for a in self.actlist
        }

        self.probOfSuccess = dict([])
        self.getProbRegions()
        for s in self.states:
            for a in self.actlist:
                self.getProbs(s, a)
        transitions = set()
        for s in self.states:
            for a in self.actlist:
                # self.prob[a][s] is the next-state distribution for (s, a)
                for t in np.nonzero(self.prob[a][s])[0]:
                    p = self.prob[a][s][t]
                    transitions.add((s, a, t, p))

        self.mdp = MDP(self.states, self.actlist, transitions)
Code example #10
    def Mazes_generator(self, batch_size):
        Mazes = []
        for MzIter in range(batch_size):
            [T, R, E] = maze_generator()
            mdp = MDP(T, R, E, self.rl.mdp.discount)
            rlSample = RL(mdp, np.random.normal)
            Mazes.append(rlSample)
        return Mazes
Code example #11
def experiment_low_price_prob(fleet, grid, horizon, price_transition, step_size=0.01):
    results = {}
    simulation_results = {}
    for p in np.arange(0.0, 1 + step_size, step_size):
        mdp = MDP(fleet, grid, horizon,
                  get_prices_func=get_prices,
                  price_transition_func=price_transition(p))
        policy, expected_val = mdp.value_iteration()
        results[p] = expected_val[0][0]

        mdp = MDP(fleet, grid, horizon,
                  get_prices_func=get_prices,
                  price_transition_func=get_history_dependent_price_transition_func(p))
        new_results = mdp.run_simulations(policy=policy, initial_state=0, repetitions=5000)
        simulation_results[p] = new_results["average_reward"]

    return results, simulation_results
Code example #12
def run_experiment3(grid, fleet, horizon):
    # solve mdp and run a simulation
    # returns profile of the simulation

    mdp = MDP(fleet, grid, horizon, get_prices_func=deterministic_prices)
    profile_mdp_simulation(mdp, "out/experiment3_coordinated.csv")

    mdp = UncoordinatedMDP(fleet, grid, horizon, get_prices_func=deterministic_prices)
    profile_mdp_simulation(mdp, "out/experiment3_uncoordinated.csv")
Code example #13
    def search_exe(self):

        Astar()
        #self.path_pub.publish(path)
        MDP()
        QL()
        self.finish_pub.publish(True)
        rospy.sleep(10)
        rospy.signal_shutdown("Finish Simulation")
Code example #14
    def modelBasedRL(self,
                     s0,
                     defaultT,
                     initialR,
                     nEpisodes,
                     nSteps,
                     epsilon=0):
        '''Model-based Reinforcement Learning with epsilon-greedy
        exploration. This function should use value iteration,
        policy iteration or modified policy iteration to update the policy at each step.

        Inputs:
        s0 -- initial state
        defaultT -- default transition function when a state-action pair has not been visited
        initialR -- initial estimate of the reward function
        nEpisodes -- # of episodes (one episode consists of a trajectory of nSteps that starts in s0)
        nSteps -- # of steps per episode
        epsilon -- probability with which an action is chosen at random

        Outputs:
        V -- final value function
        policy -- final policy
        cumu_reward_lst -- cumulative discounted reward per episode
        '''

        # initialise transition counts (pseudocount prior of 1), per-episode
        # cumulative rewards, value function, policy, and the estimated model
        count_triple = np.ones(
            [self.mdp.nActions, self.mdp.nStates, self.mdp.nStates])
        cumu_reward_lst = np.zeros(nEpisodes)
        V = np.zeros(self.mdp.nStates)
        policy = np.zeros(self.mdp.nStates, int)
        mdp_tmp = MDP(defaultT, initialR, self.mdp.E, self.mdp.discount)

        for iterEp in range(nEpisodes):
            state = s0
            for iterSt in range(nSteps):
                action = 0
                if np.random.rand(1) < epsilon:
                    action = np.random.randint(self.mdp.nActions)
                else:
                    action = policy[state]
                [nextState, reward,
                 done] = self.sampleRewardAndNextState(state, action)
                cumu_reward_lst[iterEp] += self.mdp.discount**iterSt * reward
                count_triple[action, state, nextState] += 1
                count_double = np.sum(count_triple[action, state, :])
                mdp_tmp.T[action,
                          state, :] = count_triple[action,
                                                   state, :] / count_double
                mdp_tmp.R[action,
                          state] = (reward + (count_double - 1) *
                                    mdp_tmp.R[action, state]) / count_double
                [policy, V, iterId] = mdp_tmp.policyIteration(policy)
                state = nextState
                if (done):
                    break
        return [V, policy, cumu_reward_lst]
Code example #15
def main(userNum = '59945701'):

    agent = MDP(path='data-mini', k=3)                  # create an instance of the MDP class
    agent.initializeMDP()                               # initialize states, actions, probabilities and initial rewards
    rewardEvolution = agent.policyIteration()           # the algorithm that solves the MDP
    recommendation = agent.recommend(userNum)           # use the recommendation function
    evaluationRS = agent.evaluateRecommendationScore()  # evaluation score
    evaluationED = agent.evaluateDecayScore()           # another evaluation score
    return recommendation, evaluationRS, evaluationED, userNum, rewardEvolution
Code example #16
def graph_recommendation_score(scale=4, m=10, with_comparison=False):
    """
    Function to generate a graph for the recommendation score over a range of m for a set of k
    :param scale: the limit to which k should vary
    :param m: a parameter in recommendation score computation
    :param with_comparison: plot a random policy's graph
    :return: None
    """

    fig = plt.figure()
    k = [i + 1 for i in range(1, scale)]
    x = [i + 1 for i in range(m)]
    for j in k:
        y_recommendation = []
        y_recommendation_rand = []
        rs = MDP(path='data-mini', k=j)
        rs.load('mdp-model_k=' + str(j) + '.pkl')
        for i in x:
            if with_comparison:
                rs.initialise_mdp()
                y_recommendation_rand.append(
                    rs.evaluate_recommendation_score(m=i))
            y_recommendation.append(rs.evaluate_recommendation_score(m=i))

        plt.plot(x,
                 y_recommendation,
                 color=(0.2 + (j - 2) * 0.4, 0.4, 0.6, 0.6),
                 label="MC model " + str(j))
        plt.scatter(x,
                    y_recommendation,
                    color=(0.2 + (j - 2) * 0.4, 0.4, 0.6, 0.6))

        if with_comparison:
            plt.plot(x,
                     y_recommendation_rand,
                     color=(0.2, 0.8, 0.6, 0.6),
                     label="Random model, For m=" + str(m))
            plt.scatter(x, y_recommendation_rand)

        plt.xticks(x)
        plt.yticks([i for i in range(20, 100, 10)])

        for x1, y in zip(x, y_recommendation):
            text = '%.2f' % y
            plt.text(x1, y, text)

        if with_comparison:
            for x1, y in zip(x, y_recommendation_rand):
                text = '%.2f' % y
                plt.text(x1, y, text)

    fig.suptitle('Recommendation Score vs Prediction List size')
    plt.xlabel('Prediction List size')
    plt.ylabel('Score')
    plt.legend()
    plt.show()
Code example #17
File: visualization.py  Project: tristan-ka/IBOAT_RL
    def __init__(self, hist_duration, mdp_step, time_step, action_size, batch_size, mean, std, hdg0, src_file,
                 sim_time):
        self.mdp = MDP(hist_duration, mdp_step, time_step)
        self.action_size = action_size
        self.agent = PolicyLearner(self.mdp.size, action_size, batch_size)
        self.agent.load(src_file)
        self.wh = wind(mean, std, int(mdp_step / time_step))
        self.hdg0 = hdg0
        self.src = src_file
        self.sim_time = sim_time
Code example #18
    def runMDP(self):
        mdp = MDP(self.config)
        print("TESTING")
        result_policy = None
        while mdp.renameThis():
            # print("Iterate")
            result_policy = mdp.iterate()
            # print("publish")
            # print("Will Continue?", mdp.renameThis())
        print(result_policy)
        self.policies = result_policy
        util.print_2d_map(self.grid)
Code example #19
def task_2():    
    # simple 4 move
    res = np.zeros(num_episodes+1)
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1, king_moves=False, stochastic_wind=False)
    epsilon, alpha = 0.1, 0.5    
    for seed in range(num_trials):        
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        res += np.cumsum(np.array(episodes))
    res /= num_trials
    plot_one(res, 'Sarsa(0), 4 move agent', 't2.png')

    return res
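
plot_one is another helper from the same project that is not shown; a minimal sketch consistent with how random_vs_sarsa() above draws its curves might look as follows (the exact signature and behaviour are assumptions):

import numpy as np
import matplotlib.pyplot as plt

def plot_one(res, title, filename):
    # Sketch of the missing helper: cumulative time steps on x, episode index on y.
    plt.clf()
    plt.plot(res, np.arange(len(res)))
    plt.xlabel("Time step")
    plt.ylabel("Episodes")
    plt.title(title)
    plt.grid(True)
    plt.savefig("plots/" + filename)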
Code example #20
def task_4():
    # stochastic wind
    res = np.zeros(num_episodes+1)
    a = MDP((rows, cols), (3, 0), (3, 7), wind, -1, 1, king_moves=True, stochastic_wind=True)
    epsilon, alpha = 0.1, 0.5
    for seed in range(num_trials):        
        episodes = a.sarsa(seed, num_episodes, epsilon, alpha)
        res += np.array(episodes)
    res = np.cumsum(res)/num_trials
    plot_one(res, 'Sarsa(0), 8 move agent, stochastic wind', 't4.png')

    return res
Code example #21
    def __init__(self):
        self._states = 1  # default value; updated later once the environment is known
        self._mdp = MDP(1)
        self._qvalues = np.zeros((self._states, 4))
        self._vvalues = np.zeros(self._states)
        self._policy = np.ones((self._states, 4)) / 4
        self._learningRate = 0.8
        self._epsilonDecay = -0.005
        self._epsilon = 1.0
        self._epsilonMin = 0.01
        self._epsilonMax = 1.0
        self._count = 0
Code example #22
File: CS533-4.py  Project: sharpau/CS533-4
def part_ii_evaluation():
    random_results_1 = []
    safe_results_1 = []
    range_results_1 = []
    random_results_2 = []
    safe_results_2 = []
    range_results_2 = []

    for i in range(1000):
        print(i)
        random_results_1.append(
            random_policy(Sim(MDP("parking_mdp_linear_rewards_n_10.txt"))))
        random_results_2.append(
            random_policy(Sim(MDP("parking_mdp_quad_rewards_n_10.txt"))))

        safe_results_1.append(
            safe_policy(Sim(MDP("parking_mdp_linear_rewards_n_10.txt")), 0.5))
        safe_results_2.append(
            safe_policy(Sim(MDP("parking_mdp_quad_rewards_n_10.txt")), 0.5))

        range_results_1.append(
            range_policy(Sim(MDP("parking_mdp_linear_rewards_n_10.txt")), 2,
                         8))
        range_results_2.append(
            range_policy(Sim(MDP("parking_mdp_quad_rewards_n_10.txt")), 2, 6))

    print(average(random_results_1))
    print(average(safe_results_1))
    print(average(range_results_1))
    print(average(random_results_2))
    print(average(safe_results_2))
    print(average(range_results_2))
Code example #23
File: app.py  Project: bmontambault/mdp
def policy_iteration_example():
    
    
    size = 5
    state_rewards_dict = {(3,3):1, (0,0):1}
    blocked_states_list = [(2,3), (2,4), (2,2)]
    discount=.9
    
    np.random.seed(443209)
    policy = np.random.randint(0,4, size=size**2)
    mdp = MDP(state_rewards_dict, blocked_states_list,
                     discount, size=size, policy=policy)        

    value_table1 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    policy_table1 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list, show_policy=True)
    
    mdp.values = mdp.evaluate_policy_values()
    intermediate_table = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    
    mdp.policy_evaluation()
    value_table2 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    
    mdp.policy_improvement()
    policy_table2 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list, show_policy=True)
    
    mdp.policy_evaluation()
    value_table3 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    
    mdp.policy_improvement()
    policy_table3 = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list, show_policy=True)
    
    mdp.policy_iteration()
    value_table4 = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy, mdp.blocked_states_list)
    policy_table4 = make_grid_world(mdp.states, mdp.get_total_rewards(), mdp.policy, mdp.blocked_states_list, show_policy=True)
    
    state_rewards_list = [[list(k),v] for k,v in state_rewards_dict.items()]
    return render_template("policy_iteration_example.html",
                value_table1=value_table1,
                policy_table1=policy_table1,
                intermediate_table=intermediate_table,
                value_table2=value_table2,
                policy_table2=policy_table2,
                value_table3=value_table3,
                policy_table3=policy_table3,
                value_table4=value_table4,
                policy_table4=policy_table4,
                size=size,
                state_rewards_list=state_rewards_list,
                blocked_states_list=[list(s) for s in blocked_states_list],
                discount=discount,
                values=mdp.values.tolist(),
                policy=mdp.policy.tolist())
Code example #24
    def generate_model(self):
        """
        Method to generate and save the various models.
        :return: None
        """

        # Generate models whose n-gram values change from 1...k
        for i in range(1, self.k+1):
            # Initialise the MDP
            mm = MDP(path=self.path, alpha=self.alpha, k=i,
                     discount_factor=self.df, verbose=self.verbose, save_path=self.save_path)
            mm.initialise_mdp()
            # Run the policy iteration and save the model
            mm.policy_iteration(max_iteration=1000)
Code example #25
def create_environment_dynamic(n_lines, m_columns, n_walls, sr_plus, sr_less,
                               s_initial, discount):
    World.x = m_columns
    World.y = n_lines

    amb = empty([n_lines, m_columns], dtype=object)
    s = sr_plus.split(sep=".")
    amb[int(s[0]), int(s[1])] = "+1"

    World.specials.append((int(s[0]), int(s[1]), "green", 1))

    s = sr_less.split(sep=".")
    amb[int(s[0]), int(s[1])] = "-1"

    World.specials.append((int(s[0]), int(s[1]), "red", -1))

    s = s_initial.split(sep=".")
    amb[int(s[0]), int(s[1])] = "I"

    World.player = (int(s[0]), int(s[1]))
    World.initial_position = (int(s[0]), int(s[1]))

    while n_walls > 0:
        line = randint(0, n_lines - 1)
        col = randint(0, m_columns - 1)
        if amb[line, col] is None:
            amb[line, col] = "X"

            World.walls.append((line, col))

            n_walls -= 1
    for i in range(n_lines):
        for j in range(m_columns):
            if amb[i, j] is None or amb[i, j] == 'I':
                amb[i, j] = " "

    states = create_states_dynamic(amb, -0.04)
    print(amb)
    up = create_up(states)
    down = create_down(states)
    right = create_right(states)
    left = create_left(states)
    actions = [up, down, left, right]

    World.render_grid()
    World.create_player()
    # World.start_game()

    return MDP(states, actions, discount)
Code example #26
File: app.py  Project: bmontambault/mdp
def get_mdp(request):
    
    data = json.loads(request.data)
    size = data['size']
    state_rewards_list = data['state_rewards_list']
    state_rewards_dict = {tuple(k):v for k,v in state_rewards_list}
    blocked_states_list = [tuple(s) for s in data['blocked_states_list']]
    discount = data['discount']
    
    values = np.array(data['values'])
    policy = np.array(data['policy'])
    
    mdp = MDP(state_rewards_dict, blocked_states_list,
                     discount, size, values, policy)
    return mdp
Code example #27
    def __init__(self):
        self._mdp = MDP()
        self.policy3 = np.ones((16, 4)) / 4
        self._evaluation = Evaluation(mdp=self._mdp, policy=self.policy3)
        self._epsilon = 1.0
        self._epsilonMin = 0.01
        self._epsilonMax = 1.0
        self._epsilonDecay = -0.005
        self.policy2 = np.ones((16, 4)) / 4
        self.count = 0
        print("Rewards:")
        self.print_rewards(4, 4)
        print()

        # initial policy
        print("initial policy:")
        self.print_policy(4, 4)
        self.steps = 0
Code example #28
def create_abstractMDP(mdp, aggregation):

    states = set(aggregation.keys())
    abstrans = dict()
    abstrans.update({(s, a): set()
                     for s in aggregation.keys() for a in mdp.alphabet})
    for absstate in states:
        for s in aggregation[absstate]:
            for a in mdp.available(s):
                for t in mdp.post(s, a):
                    for s2 in aggregation.keys():
                        if t in aggregation[s2]:
                            abstrans[absstate, a].add(s2)
    abstransprobs = set()
    for (s, a) in abstrans.keys():
        for t in abstrans[(s, a)]:
            abstransprobs.add((s, a, t, 1.0 / len(abstrans[(s, a)])))
    absmdp = MDP(states, mdp.alphabet, abstransprobs)
    return absmdp
Code example #29
    def __init__(self, num_positions=500, num_orientations=10):
        # TODO: Interface with SLAM algorithm's published map
        # Initialize map.
        rospy.init_node(
            "neato_mdp")  # May break if markov_model is also subscribed...?
        rospy.wait_for_service("static_map")
        static_map = rospy.ServiceProxy("static_map", GetMap)
        # Initialize MDP
        self.mdp = MDP(num_positions=num_positions,
                       num_orientations=num_orientations,
                       map=static_map().map)
        self.state_idx = None  # Current state idx is unknown.
        self.curr_odom_pose = Pose()
        self.tf_helper = TFHelper()
        # Velocity publisher
        self.cmd_vel_publisher = rospy.Publisher("/cmd_vel",
                                                 Twist,
                                                 queue_size=10,
                                                 latch=True)
        self.odom_subscriber = rospy.Subscriber('/odom', Odometry,
                                                self.set_odom)
        self.goal_state = None
        # Visualize robot
        self.robot_state_pub = rospy.Publisher('/robot_state_marker',
                                               Marker,
                                               queue_size=10)
        self.robot_state_pose_pub = rospy.Publisher('/robot_state_pose',
                                                    PoseArray,
                                                    queue_size=10)
        self.goal_state_pub = rospy.Publisher('/goal_state_marker',
                                              Marker,
                                              queue_size=10)
        # # pose_listener responds to selection of a new approximate robot location (for instance using rviz)
        #
        self.odom_pose = PoseStamped()
        self.odom_pose.header.stamp = rospy.Time(0)
        self.odom_pose.header.frame_id = 'odom'
        #
        rospy.Subscriber("initialpose", PoseWithCovarianceStamped,
                         self.update_initial_pose)

        rospy.Subscriber("move_base_simple/goal", PoseStamped,
                         self.update_goal_state)
Code example #30
File: app.py  Project: bmontambault/mdp
def policy_iteration_step():
        
    data = json.loads(request.data)
    size = data['size']
    state_rewards_list = data['state_rewards_list']
    state_rewards_dict = {tuple(k):v for k,v in state_rewards_list}
    blocked_states_list = [tuple(s) for s in data['blocked_states_list']]
    discount = data['discount']
    
    values = np.array(data['values'])
    policy = np.array(data['policy'])
    
    mdp = MDP(state_rewards_dict, blocked_states_list,
                     discount, size, values, policy)
    mdp.values = mdp.evaluate_policy_values()
    
    table = make_grid_world(mdp.states, mdp.values, mdp.policy, mdp.blocked_states_list)
    
    return json.dumps({'table': table, 'values': mdp.values.tolist(),
            'policy': mdp.policy.tolist()})