def experiment(test_game, num_experiments):
    """
    Main experiment method that runs the Value Iteration experiments and prints results
    works by learning a model x number of times.

    the average number of moves per policy is then created and averaged per experiment

    prints and returns the average number of episodes to reach the goal along with the learned policy.
    """

    average_number_of_moves_with_policy = []
    for x in range(num_experiments):
        # Learn Policy
        vi = ValueIteration(test_game)
        policy_and_num_iterations = vi.value_iteration()
        policy = policy_and_num_iterations[0]
        print(policy)

        avg_num_steps = 0
        for _ in range(100):
            num_steps = vi.execute_policy(policy)
            avg_num_steps += num_steps

        avg_num_steps /= 100.0

        average_number_of_moves_with_policy.append(avg_num_steps)

    total_average_num_steps = sum(average_number_of_moves_with_policy) / num_experiments
    print("Total Average Number of Steps: {}".format(total_average_num_steps))

    return total_average_num_steps
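
A minimal usage sketch for the function above. FrozenLakeGame is a hypothetical placeholder for whatever game/environment object this project's ValueIteration expects.

# Hedged usage sketch: FrozenLakeGame is a placeholder name, not part of the original code
if __name__ == "__main__":
    test_game = FrozenLakeGame()
    experiment(test_game, num_experiments=10)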
Example #2
def main():

    # Read in the results file; each line is one comma-separated query
    questions = []
    with open("results.txt", "r") as results:
        for line in results:
            line = line.rstrip("\r\n")
            questions.append(line.split(","))

    windows = []
    # Run the query described by each line of the file
    for index, q in enumerate(questions, start=1):
        window = tk.Tk()
        grid = Grid('gridConf.txt')

        if(q[3] == "MDP"):
            valueIteration = ValueIteration(grid)
            grid = valueIteration.runValueIteration()
        elif(q[3] == "RL"):
            qValueLearning = QValueLearning(grid)
            grid = qValueLearning.runQValueLearning()

        gridPolicies = grid.get_policies_()
        terminal_states = grid.terminal
        boulder_states = grid.boulder

        answer = ""

        if(q[4] == "stateValue"):
            answer = grid.gridStates[int(q[1])][int(q[0])].get_max()
        elif(q[4] == "bestPolicy"):
            answer = grid.gridStates[int(q[1])][int(q[0])].getPolicy(0.0)[1]
        elif(q[4] == "bestQValue" and q[3] == "RL"):
            answer = grid.gridStates[int(q[1])][int(q[0])].getPolicy(0.0)[0]

        # index comes from enumerate() above, so duplicate query lines are numbered correctly
        answer = "Question " + str(index) + ": " + ",".join(q) + ": " + str(answer)

        if(q[3] == "MDP"):
            draw_board(window, gridPolicies, [row[:-1] for row in terminal_states], boulder_states,
                max_reward(terminal_states), max_punishment(terminal_states), q[2], 'value-iteration', answer)
        elif(q[3] == "RL"):
            draw_board(window, gridPolicies, [row[:-1] for row in terminal_states], boulder_states,
               max_reward(terminal_states), max_punishment(terminal_states),  q[2], 'q-learning', answer)

        windows.append(window)

    #display all queries
    for window in windows:
        window.mainloop()
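
For reference, the indexing above implies that each results.txt line holds five comma-separated fields: an x (column) coordinate, a y (row) coordinate, a third field that is passed straight to draw_board, the solver to use (MDP or RL), and the query type (stateValue, bestPolicy, or bestQValue). Hypothetical example lines, with the third field left as a placeholder since its meaning is not visible in this snippet:

# results.txt (hypothetical contents; <param> stands for the field passed to draw_board)
0,2,<param>,MDP,stateValue
3,1,<param>,RL,bestQValue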
Example #3
    def __init__(self, params, env_map):
        self.world_params = params['world']
        self.robot_params = params['robot']
        self.sensor_params = params['sensor']
        self.num_particles = params['sim']['num_particles']
        self.goal_pos = params['sim']['goal_pos']
        self.goal_radius = params['sim']['goal_radius']
        self.map_belief = env_map
        self.state_belief = np.zeros(2)

        # Time step counter and adaptive-resampling weight averages (augmented MCL)
        self.timestep = 0
        self.w_slow = None
        self.w_fast = None

        l,w = self.map_belief.shape        

        # Discretized Actions
        self.actions = [{'dturn':dt, 'dmove':dm} for dt in [-np.pi/4, 0, np.pi/4] for dm in [2,-2]]

        # Value map from value iteration algorithm
        trans_sim = lambda state, act : calc_move(self.robot_params, state, act, self.map_belief)
        self.vi = ValueIteration(env_map, trans_sim, self.actions, self.reward)
        self.vi.solve()


        # Particle filter
        # Each row represents one particle of (pos_x, pos_y, orient)
        self.particles = self.gen_rand_particles(self.num_particles)
Example #4

from GridWorld import GridWorld
from GridWorld import GridWorldAdditive
from ValueIteration import ValueIteration

# Run Value Iteration in different Grid World environments
if __name__ == "__main__":
    gamma = 0.9
    print("Grid world Value Iteration with discounted rewards gamma = %.2f\n" % gamma)
    terminals = {(0, 3): +1, (1, 3): -1}
    gw = GridWorld((3, 4), 0.8, [(1, 1)], terminals)
    vi = ValueIteration()
    values = vi.valueIteration(gw, gamma)
    gw.printValues(values)
    qvalues = vi.getQValues(gw, values, gamma)
    gw.printQValues(qvalues)
    policy = vi.getPolicy(gw, values, gamma)
    gw.printPolicy(policy)

    reward = -0.01
    print("Grid world Value Iteration with additive rewards = %.2f\n" % reward)
    gwa = GridWorldAdditive((3, 4), 0.8, [(1, 1)], terminals, reward)
    values = vi.valueIteration(gwa, 1, 100)
    gwa.printValues(values)
    qvalues = vi.getQValues(gwa, values, 1)
    gwa.printQValues(qvalues)
    policy = vi.getPolicy(gwa, values, 1)
    gwa.printPolicy(policy)
 
    reward = -0.04
    print("Grid World with additive rewards = %.2f\n" % reward)
    gwa = GridWorldAdditive((3, 4), 0.8, [(1, 1)], terminals, reward)
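
The snippet appears to be cut off at this point; judging from the reward = -0.01 block above, it would presumably continue along the same lines. The continuation below is a guess, not part of the original code:

    # Presumed continuation, mirroring the reward = -0.01 block above
    values = vi.valueIteration(gwa, 1, 100)
    gwa.printValues(values)
    qvalues = vi.getQValues(gwa, values, 1)
    gwa.printQValues(qvalues)
    policy = vi.getPolicy(gwa, values, 1)
    gwa.printPolicy(policy)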
Example #5

print('Generating map', outer, '(', configurations, 'configurations )')
    sys.stdout.flush()

    world = info['map']
    rewards = info['rewards']
    terminal = info['terminal']
    instructions = info['instructions']
    values = []

    sprite = SpriteWorld(library.objects, library.background)
    sprite.makeGrid(world, args.vis_path + str(outer) + '_sprites')

    for inner in tqdm(range(configurations)):
        reward_map = rewards[inner]
        terminal_map = terminal[inner]
        instr = instructions[inner]

        mdp = MDP(world, reward_map, terminal_map)
        vi = ValueIteration(mdp)

        values_list, policy = vi.iterate()
        value_map = mdp.representValues(values_list)
        values.append(value_map)

        # visualize_values(mdp, values_list, policy, args.vis_path + str(outer) + '_' + str(inner) + '_values', title=instr)

    info['values'] = values
    filename = os.path.join(args.save_path, str(outer) + '.p')
    with open(filename, 'wb') as pickle_file:
        pickle.dump(info, pickle_file)
Example #6
        self.display_mode_on = False

    def turn_on_display(self):
        self.display_mode_on = True


code_to_decode = "RBGY"

clock = pygame.time.Clock()

mastermind = Mastermind(code_to_decode)
mastermind.reset()

#Value Iteration
done = False
optimal_policy, optimal_value = ValueIteration(mastermind, 0.85, 0.00000000000000001)
state = mastermind.step("YGBR")[0]
while not done:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            done = True

    # Try an action according to the learned policy
    actions = []
    probs = []

    for action, prob in optimal_policy[state].items():
        actions.append(action)
        probs.append(prob)
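
The example stops here; to actually act on the policy one would typically sample an action from this distribution, for instance as below (random.choices is an assumption, since the rest of the loop body is not shown):

    # Hedged sketch: sample the next guess according to the policy's probabilities
    import random  # stdlib; would normally go at the top of the file
    action = random.choices(actions, weights=probs)[0]
    state = mastermind.step(action)[0]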
Example #7
def ValueIteration_Rtest():
    VI = ValueIteration(0.00000000001, 0.5, "R")
    VI.valueIteration(0.00000000001, 0.5)
    for i in range(10):
        print(VI.trial_run())
Example #8
def VI_R_reset():
    VI = ValueIteration(0.00000000001, 0.5, "R", restart=True)
    VI.valueIteration(0.00000000001, 0.5)
    for i in range(10):
        print(VI.trial_run())
Example #9
GAMMA = 0.9

if __name__ == "__main__":
    # Command line parser
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input_file", help="The name of the file to treat as the search space")
    parser.add_argument("--epsilon",
                        help="epsilon for value iteration",
                        type=float,
                        default=0.5)
    args = parser.parse_args()

    # open file
    f = open(args.input_file, 'r')
    g = Graph(WIDTH, HEIGHT)

    # Create our graph structure to traverse
    create_graph_from_file(f, g)

    # Create and run Value Iteration from the start cell to the goal cell

    v = ValueIteration(g, (0, 0), (WIDTH - 1, HEIGHT - 1), GAMMA, args.epsilon)

    util = v.run()
    v.set_utils(util)

    path = v.trace_path()
    # Print path to see verbose information about each node on the path
    #print path
Example #10
    def getQValues(self, env):
        vi = ValueIteration()
        values = vi.valueIteration(env)
        qvalues = vi.getQValues(env, values)
        return qvalues
Example #11
class Player_vi:
    
    def __init__(self, params, env_map):
        self.world_params = params['world']
        self.robot_params = params['robot']
        self.sensor_params = params['sensor']
        self.num_particles = params['sim']['num_particles']
        self.goal_pos = params['sim']['goal_pos']
        self.goal_radius = params['sim']['goal_radius']
        self.map_belief = env_map
        self.state_belief = np.zeros(2)

        # Time step counter and adaptive-resampling weight averages (augmented MCL)
        self.timestep = 0
        self.w_slow = None
        self.w_fast = None

        l,w = self.map_belief.shape        

        # Discretized Actions
        self.actions = [{'dturn':dt, 'dmove':dm} for dt in [-np.pi/4, 0, np.pi/4] for dm in [2,-2]]

        # Value map from value iteration algorithm
        trans_sim = lambda state, act : calc_move(self.robot_params, state, act, self.map_belief)
        self.vi = ValueIteration(env_map, trans_sim, self.actions, self.reward)
        self.vi.solve()


        # Particle filter
        # Each row represents one particle of (pos_x, pos_y, orient)
        self.particles = self.gen_rand_particles(self.num_particles)


    def reward(self, x, y):
        goalx, goaly = self.goal_pos
        dist = np.sqrt((x - goalx)**2 + (y - goaly)**2)
        # Add penalty for being close to walls
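        # (note: assumes (x, y) is at least one cell away from the map edge;
        #  otherwise the neighborhood lookup below would index out of bounds or wrap around)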
        lst = [self.map_belief[x + dx, y + dy] for dx in [-1,0,1] for dy in [-1,0,1]]
        if sum(lst) > 0:
            wall_penalty = -100
        else:
            wall_penalty = 0
        if dist <= self.goal_radius:
            return 1000
        else:
            return -1 + wall_penalty

    # Generate matrix of random particles (num x state dimensions)
    def gen_rand_particles(self,num_particles):
        l,w = self.map_belief.shape        
        particles = np.random.rand(num_particles,3)
        particles[:,0] *= l
        particles[:,1] *= w
        particles[:,2] *= np.pi * 2
        return particles

    def pred_step(self, action):
        # Update Particles in prediction step
        tmp = np.zeros(self.particles.shape)
        for i in np.arange(self.num_particles):
            state = self.particles[i,:]
            tmp[i,:] = calc_move(self.robot_params, state, action, self.map_belief)
        self.particles = tmp

    # Calculate updated beliefs based on sensor readings
    # There must be an issue in the measurement model - it is not giving good results for p(z|x)
    def update_belief(self, sensor_readings):

        alpha_slow = 0.05
        alpha_fast = 0.5

        # Measurement Update step
        sensor_type = 'radar'  # would need to be extended for additional sensors
        actual_m = sensor_readings['radar']
        weights = np.zeros(self.num_particles)
        for i in np.arange(self.num_particles):
            pred_state = self.particles[i,:]
            pred_readings = read_sensor(self.sensor_params, pred_state, self.map_belief)
            logprob_m = 0
            for j in np.arange(len(actual_m)):
                # Conditional prob of pred_m given actual_m - predict with scaled up sensor noise
                logprob_m += norm.logpdf(x=pred_readings[j], loc=actual_m[j], 
                                         scale=self.sensor_params['noise_sigma'] * 10)
            weights[i] = np.exp(logprob_m)
        
        # Short and long term prob trends
        w_avg = np.mean(weights)
        if self.w_slow is None and self.w_fast is None:  # first measurement update
            self.w_slow = w_avg
            self.w_fast = w_avg
        else:
            self.w_slow = self.w_slow + alpha_slow * (w_avg - self.w_slow)
            self.w_fast = self.w_fast + alpha_fast * (w_avg - self.w_fast)

        print (self.w_fast, self.w_slow, 1 - self.w_fast/self.w_slow)

        # Renormalize weights
        weights = weights / np.sum(weights)
        
        # Resample particles based on new weights
        sample_idx = np.random.choice(np.arange(self.num_particles),
                                      p=weights, size=self.num_particles)
        self.particles = self.particles[sample_idx,:]

        # Inject random particles with probability max(0, 1 - w_fast/w_slow) to recover from bad convergence (augmented MCL)
        rand_ind = np.random.rand(self.num_particles) < (1 - self.w_fast/self.w_slow)
        self.particles[rand_ind,:] = self.gen_rand_particles(np.sum(rand_ind))
    
    # Return action of a robot given map of the environment
    # action is a dictionary object with the following keys:
    #       dturn : angle (radians) to turn the robot
    #       dmove : distance to move the robot forward
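    #       e.g. one entry of self.actions: {'dturn': np.pi/4, 'dmove': 2}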
    def get_action(self, readings):
        # Evaluate every discretized action against every particle and pick the
        # action with the highest total (i.e. expected) value under the value map

        act_vals = np.zeros((self.num_particles, len(self.actions)))
        for i in range(self.num_particles):
            for j in range(len(self.actions)):
                state = self.particles[i,:]
                action = self.actions[j]
                [nsx,nsy,nso] = calc_move(self.robot_params, state, action, self.map_belief)
                # lookup value at new state
                act_vals[i,j] = self.vi.get_val((nsx,nsy,nso))
        # Take greedy action with the highest expected value
        sum_vals = np.sum(act_vals,axis=0)
        max_ind = np.argmax(sum_vals)
        return self.actions[max_ind]


    # Returns the belief state of the robot
    def get_bstate(self):
        state = {'world':self.map_belief}
        state['robots'] = {0:Car(self.robot_params, [15,15,0])}
        state['particles'] = self.particles
        return state