Example #1
    def __init__(self):
        """Setup ROS things"""
        
        rospy.init_node('robot')
       

        self.simComplete_publisher = rospy.Publisher(
            "/map_node/sim_complete",
            Bool,
            queue_size=10
        )
        
        rospy.sleep(1)

        #call Astar here
        #Astar()
        
        #call MDP here
        Mdp()

        rospy.sleep(1)

        self.simComplete_publisher.publish(True)

        rospy.sleep(1)

        rospy.signal_shutdown("shutting down")
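
A minimal sketch of a node that could consume this flag, assuming only what the example above shows (the topic /map_node/sim_complete carries a std_msgs Bool); the node and callback names are illustrative:

import rospy
from std_msgs.msg import Bool

def on_sim_complete(msg):
    # msg.data becomes True once the node above reports completion
    rospy.loginfo("sim_complete received: %s" % msg.data)

rospy.init_node('sim_complete_listener')  # hypothetical node name
rospy.Subscriber("/map_node/sim_complete", Bool, on_sim_complete)
rospy.spin()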
Example #2
    def __init__(self, original_mdp, product_sta, product_lab, policy_tra, probs_vect, progs_vect, exp_times_vect):
        Mdp.__init__(self)
        self.original_mdp=original_mdp
        self.n_state_vars=original_mdp.n_state_vars+1
        self.state_vars=list(original_mdp.state_vars)
        self.state_vars.append("_da")
        self.state_vars_range=dict(original_mdp.state_vars_range) #ranges for the state vars
        self.initial_state=dict(original_mdp.initial_state) #dict indexed by the state vars
        self.n_props=original_mdp.n_props #number of propositional labels
        self.props=list(original_mdp.props)
        self.props_def=dict(original_mdp.props_def) #dict of MdpPropDef instances. keys are the propositional labels names
        self.n_actions=original_mdp.n_actions #number of actions
        self.actions=list(original_mdp.actions) #list of action names
        self.transitions=[] #list of MdpTransitionDef instances: won't be filled for the policy, as we will work with the flat representations
        self.reward_names=list(original_mdp.reward_names)
        
        #read sta product file to get flat state descriptions and number of dfa states
        self.n_aut_states=0
        self.n_flat_states=0
        self.flat_state_defs={}
        self.read_prod_state_file(product_sta)
        self.state_vars_range["_da"]=[0, self.n_aut_states]
        
        #read lab product file to get initial and accepting states
        self.initial_flat_state=-1
        self.acc_flat_states=set()
        self.set_init_and_acc_states(product_lab)
        
        
        self.current_flat_state = None
        self.current_state_def = None
        self.set_current_state(self.initial_flat_state)

        
        self.flat_state_policy={} #self.flat_state_policy[flat_state]=action to execute in flat_state
        self.flat_state_sucs={} #self.flat_state_sucs[flat_state]=list of possible flat state successors, e.g., [20,25]
        self.flat_state_suc_probs={} #self.flat_state_suc_probs[flat_state]=list of probs associated to the corresponding flat_state_sucs, e.g., [0.7,0.3]
        self.transitions=[] #not being set, for efficiency. The flat representations above are easier to build and use for execution; only needed for exporting the policy
        self.set_policy_flat(policy_tra)
        
        self.guarantees_probs=self.read_vect(probs_vect)
        self.guarantees_progs=self.read_vect(progs_vect)
        self.guarantees_times=self.read_vect(exp_times_vect)
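
A hedged sketch of how the flat structures built above could drive execution of the policy; the helper name execute_step and the sampling with random.random() are assumptions, while the attribute and method names are taken from the constructor above:

import random

def execute_step(policy_mdp):
    # look up the action prescribed for the current flat state ...
    state = policy_mdp.current_flat_state
    if state not in policy_mdp.flat_state_policy:
        return None  # no action defined, e.g., an accepting state was reached
    action = policy_mdp.flat_state_policy[state]
    # ... and sample a successor according to the stored transition probabilities
    successors = policy_mdp.flat_state_sucs[state]
    probs = policy_mdp.flat_state_suc_probs[state]
    r, acc = random.random(), 0.0
    for suc, p in zip(successors, probs):
        acc += p
        if r <= acc:
            policy_mdp.set_current_state(suc)
            break
    return action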
Example #3
def main():
    mdp = Mdp()
    policy_value = PolicyValue(mdp)
    policy_value.iterate_policy(mdp)
    print 'value:'
    for i in xrange(1, 6):
        print '%d:%f\t' % (i, policy_value.v[i]),
    print ''
    for i in xrange(1, 6):
        print policy_value.pi[i]
Example #4
def main():
    mdp = Mdp()
    policy_value = PolicyValue(mdp)
    policy_value.iterate_value(mdp)
    print "value:"
    for i in xrange(1, 6):
        print "%d:%f\t" % (i, policy_value.v[i]),
    print ""

    print "policy:"
    for i in xrange(1, 6):
        print "%d->%s\t" % (i, policy_value.pi[i]),
    print ""
Example #5
def compute_random_pi_state_value():
    value = [0.0 for r in xrange(9)]
    # estimate the mean by simulating a large number of episodes (law of large numbers)
    num = 100000
    
    for k in xrange(1, num + 1):
        for i in xrange(1,6):
            mdp = Mdp()
            s = i
            is_terminal = False
            gamma = 1.0
            v = 0.0
            while not is_terminal:
                a = random_pi()
                is_terminal,s,r = mdp.transform(s,a)
                v += gamma * r
                gamma *= mdp.gamma
            
            value[i] = (value[i] * (k-1) + v) / k
        if k % 10000 == 0:
            print value[1:9]
    print value[1:9]
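
The update of value[i] above is the standard incremental mean, m_k = (m_{k-1} * (k - 1) + x_k) / k; a tiny self-check (illustrative):

xs = [2.0, 4.0, 9.0, 1.0]
m = 0.0
for k, x in enumerate(xs, start=1):
    m = (m * (k - 1) + x) / k          # incremental mean after k samples
assert abs(m - sum(xs) / len(xs)) < 1e-12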
Example #6
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 29 10:56:37 2017

@author: Administrator
"""

from mdp import Mdp

mdp = Mdp()
states = mdp.get_states()
actions = mdp.get_actions()
gamma = mdp.get_gamma()


def mc(gamma, state_sample, action_sample, reward_sample):
    vfunc = dict()
    nfunc = dict()
    for state in states:
        vfunc[state] = 0.0
        nfunc[state] = 0.0

    for i in xrange(len(state_sample)):
        G = 0.0
        for step in xrange(len(state_sample[i]) - 1, -1, -1):
            G *= gamma
            G += reward_sample[i][step]
        for step in xrange(len(state_sample[i])):
            s = state_sample[i][step]
            vfunc[s] += G
            nfunc[s] += 1.0
            # drop this step's reward and undo one discount so that G becomes
            # the return observed from the next step onwards
            G -= reward_sample[i][step]
            G /= gamma

    # average the accumulated returns per state (every-visit Monte Carlo)
    for s in states:
        if nfunc[s] > 0:
            vfunc[s] /= nfunc[s]
    return vfunc
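
A hedged usage sketch for mc(); gen_randompi_sample is an assumed helper standing in for the project's sampling routine (not shown here) that returns, per episode, parallel lists of states, actions and rewards:

# state_sample[i][t], action_sample[i][t], reward_sample[i][t] describe step t of episode i
state_sample, action_sample, reward_sample = gen_randompi_sample(1000)  # assumed helper
vfunc = mc(gamma, state_sample, action_sample, reward_sample)
for s in states:
    print s, vfunc[s]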
Example #7
    def __init__(self,
                 width,
                 height,
                 hit=False,
                 walls=[],
                 action_list=[],
                 nb_actions=4,
                 gamma=0.9,
                 timeout=50,
                 start_states=[0],
                 terminal_states=[]):
        # width, height : int numbers defining the maze attributes
        # walls : list of the states that represent walls in our maze environment
        # action_list : list of possible actions
        # nb_actions : used when action_list is empty, by default there are 4 of them (go north, south, east or west)
        # gamma : the discount factor of our mdp
        # timeout : defines the length of an episode (max timestep) --see done() function
        # start_states : list that defines the states where the agent can be at the beginning of an episode
        # terminal_states : list that defines the states corresponding to the end of an episode
        #                  (agent reaches a terminal state) --cf. done() function
        self.width = width
        self.height = height
        self.cells = np.zeros((width, height), int)
        self.walls = walls
        self.size = width * height
        state = 0
        cell = 0

        self.terminal_states = terminal_states
        self.state_width = []
        self.state_height = []
        # ##################### State Space ######################
        for i in range(width):
            for j in range(height):
                if cell not in walls:  # or self.cells[i][j] in self.terminal_states):
                    self.cells[i][j] = state
                    state = state + 1
                    self.state_width.append(i)
                    self.state_height.append(j)
                else:
                    self.cells[i][j] = -1
                cell = cell + 1

        self.nb_states = state

        # ##################### Action Space ######################
        self.action_space = SimpleActionSpace(action_list=action_list,
                                              nactions=nb_actions)

        # ##################### Distribution Over Initial States ######################

        start_distribution = np.zeros(
            self.nb_states)  # distribution over initial states

        # supposed to be uniform
        for state in start_states:
            start_distribution[state] = 1.0 / len(start_states)

        # ##################### Transition Matrix ######################

        # a "well" state is added that only the terminal states can get into
        transition_matrix = np.empty(
            (self.nb_states + 1, self.action_space.size, self.nb_states + 1))

        # Init the transition matrix
        transition_matrix[:, N, :] = np.zeros(
            (self.nb_states + 1, self.nb_states + 1))
        transition_matrix[:, S, :] = np.zeros(
            (self.nb_states + 1, self.nb_states + 1))
        transition_matrix[:, E, :] = np.zeros(
            (self.nb_states + 1, self.nb_states + 1))
        transition_matrix[:, W, :] = np.zeros(
            (self.nb_states + 1, self.nb_states + 1))

        for i in range(self.width):
            for j in range(self.height):
                state = self.cells[i][j]
                if not state == -1:

                    # Transition Matrix when going north (no state change if topmost cells or if the cell to the north is a wall)
                    if j == 0 or self.cells[i][j - 1] == -1:
                        transition_matrix[state][N][state] = 1.0
                    else:  # it goes up
                        transition_matrix[state][N][self.cells[i][j - 1]] = 1.0

                    # Transition Matrix when going south (no state change if bottommost cells or if the cell to the south is a wall)
                    if j == self.height - 1 or self.cells[i][j + 1] == -1:
                        transition_matrix[state][S][state] = 1.0
                    else:  # it goes down
                        transition_matrix[state][S][self.cells[i][j + 1]] = 1.0

                    # Transition Matrix when going east (no state change if rightmost cells or if the cell to the east is a wall)
                    if i == self.width - 1 or self.cells[i + 1][j] == -1:
                        transition_matrix[state][E][state] = 1.0
                    else:  # it goes right (east)
                        transition_matrix[state][E][self.cells[i + 1][j]] = 1.0

                    # Transition Matrix when going west (no state change if leftmost cells or if the cell to the west is a wall)
                    if i == 0 or self.cells[i - 1][j] == -1:
                        transition_matrix[state][W][state] = 1.0
                    else:  # it goes left (west)
                        transition_matrix[state][W][self.cells[i - 1][j]] = 1.0

        # Transition Matrix of terminal states
        well = self.nb_states  # all the final states' transitions go there
        for s in self.terminal_states:
            transition_matrix[s, :, :] = 0
            transition_matrix[s, :, well] = 1

        if hit:
            reward_matrix = self.reward_hit_walls()
        else:
            reward_matrix = self.simple_reward()

        plotter = MazePlotter(self)  # renders the environment

        self.mdp = Mdp(self.nb_states,
                       self.action_space,
                       start_distribution,
                       transition_matrix,
                       reward_matrix,
                       plotter,
                       gamma=gamma,
                       terminal_states=terminal_states,
                       timeout=timeout)
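
A hedged usage sketch of this constructor; the enclosing class name (written here as Maze) is an assumption, since only its __init__ is shown above, and the wall and terminal indices are illustrative:

maze = Maze(width=4, height=4, walls=[5, 9], start_states=[0], terminal_states=[13])
print(maze.nb_states)  # 14 non-wall cells, numbered 0..13
print(maze.cells)      # -1 marks wall cells, other entries are state indices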
Example #8
    'hold0',  # index 6
    'hold1',  # index 7
    'hold2'
]  # index 8
trading_rule = trading_rules[5]  # 222

# type of reinforcement learning method
transaction_cost = 0
rl1 = rlm.Rl_linear(transaction_cost, epsilon, r_t, N, M, method_type,
                    alpha_linear, gamma, random_init)
mean = 0.00
sigma = 0.01
rl2 = rlm.Rl_full_matrix(transaction_cost, epsilon, r_t, N, M, method_type,
                         alpha_grid, gamma, random_init, mean, sigma)
no_trade_reward = 0
mdp = Mdp(rl2, r_t, L, transaction_cost, no_trade_reward, trading_rule)

# computes return
actions = []
equity_lines = []
start = max(N, L, M)
end = T_max - L

for iter in range(iterations_nb):
    state = mdp.reset(start)
    for t in range(start, end):
        # exploration exploitation
        if np.random.rand() < epsilon:
            action_t = np.random.randint(-1, 2)
        else:
            # exploit: follow the learned method's greedy action
            action_t = mdp.rl_method.next_action()
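
The if/else above is epsilon-greedy action selection; a generic, self-contained version of that rule (illustrative, not part of the original script):

import numpy as np

def epsilon_greedy(q_values, epsilon):
    # with probability epsilon explore a random action index, otherwise exploit the best one
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_values))
    return int(np.argmax(q_values))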
Example #9
from mdp import Mdp
from parser_mdp import Parser
import glob

files = glob.glob('DeterministicGoalState/*') + glob.glob('RandomGoalState/*')

for file in files[11:12]:
    navigationFile = open(file)

    navigationFileReaded = navigationFile.read()

    navigationFileParsed = Parser(navigationFileReaded)

    states = navigationFileParsed.get_states()

    policy_lao, time_lao = Mdp(states).lao_star()

    print("LAO, " + file.split('\\')[0] + ', ' + file.split('\\')[1] + ", " +
          str(round(time_lao, 2)))

    policy_iteration, time_iteration = Mdp(states).value_iteration()

    print("ITER, " + file.split('\\')[0] + ', ' + file.split('\\')[1] + ", " +
          str(round(time_iteration, 2)))
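
Note that file.split('\\') assumes Windows-style separators in the paths returned by glob; a portable alternative (illustrative, with a hypothetical file name) uses os.path:

import os

path = os.path.join('DeterministicGoalState', 'navigation_1.net')  # hypothetical file name
directory, name = os.path.split(os.path.normpath(path))
print(directory)  # the containing folder
print(name)       # the file name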