Example no. 1
def main():

    # initialization of the grid
    grid = Grid()

    # create the Markov decision process and Q-learning objects for running the algorithms
    mdp = MarkovDecision()
    ql = QLearning()

    print("\n")
    print("Value iteration results: \n")
    # parameter order: p, reward, discount
    utilities, policies = mdp.iterateValues(grid, 1, 0, 1)
    grid.printPolicies()

    grid.resetGrid()

    print("\n")
    print("Policy iteration results: \n")
    # parameter order: p, reward, discount
    mdp.iteratePolicies(grid, 1, 0, 1)
    grid.printPolicies()

    grid.resetGrid()

    print("\n")
    print("Q Learning results: \n")
    # parameter order: grid, p, reward, learning_rate, discount, epsilon, iterations, value_iteration results
    ql.learnQ(grid, 1, 0, 0.1, 1, 0, 1000, utilities, policies)
    grid.printQValues()
    grid.printQLearningResults()
Example no. 2
    def __init__(self):

        # rospy.init_node('robot_action')

        # Initialize publishers and subscribers
        rospy.on_shutdown(self.shutdown)
        self.bridge = cv_bridge.CvBridge()
        self.action_sub = rospy.Subscriber('/q_learning/robot_action',
                                           RobotMoveDBToBlock,
                                           self.action_received)
        self.camera_sub = rospy.Subscriber('/camera/rgb/image_raw', Image,
                                           self.camera_received)
        self.scan_sub = rospy.Subscriber('/scan', LaserScan,
                                         self.scan_received)
        self.vel_pub = rospy.Publisher('cmd_vel', Twist, queue_size=1)
        self.action_pub = rospy.Publisher('/q_learning/robot_action',
                                          RobotMoveDBToBlock,
                                          queue_size=3)

        # Keras pipeline
        self.pipeline = keras_ocr.pipeline.Pipeline()

        # Variables for the scan and image results
        self.current_scan = []
        self.current_img = []

        # Instantiate the Q-learning class to use its functions
        self.q_learning = QLearning()

        rospy.sleep(1)

        # Set color ranges for red, green, and blue
        self.color_info = {  # HSV lower and upper bounds for each color (these ranges still need testing)
            'red': {
                'lower': np.array([0, 250, 150]),
                'upper': np.array([10, 255, 180])
            },
            'blue': {
                'lower': np.array([110, 250, 150]),
                'upper': np.array([130, 255, 190])
            },
            'green': {
                'lower': np.array([45, 250, 150]),
                'upper': np.array([75, 255, 190])
            }
        }

        # Initialize robot arm components
        self.move_group_arm = moveit_commander.MoveGroupCommander("arm")
        self.move_group_gripper = moveit_commander.MoveGroupCommander(
            "gripper")

        # Set initial arm position
        self.move_group_arm.go([0.0, 0.9, -0.3, -0.79], wait=True)
        self.move_group_gripper.go([0.009, 0.009], wait=True)

        # Get the action sequence to maximize reward
        self.extract_action()
Example no. 3
def main(argv):
    setpath()
    from q_learning import QLearning
    from grid_world import Grid

    # 5 rows, 4 cols, unreachables : [(1,1), (1,3)], pits : [(3,1)], goal : (4,3)
    g = Grid(5, 4, [(1, 1), (1, 3)], [(3, 1)], (4, 3))
    q = QLearning(g)
    q.learn()
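Example no. 3 shows only the call site; the Grid and QLearning internals are not part of the snippet. For orientation, a tabular learn() behind an interface of this shape is usually an epsilon-greedy loop around the standard Q-learning update. The sketch below is an assumption about that shape: the Grid methods (reset, step, actions, is_terminal) and the class name QLearningSketch are illustrative, not taken from the source project.

import random
from collections import defaultdict


class QLearningSketch:
    """Minimal tabular Q-learning; the Grid interface used here is hypothetical."""

    def __init__(self, grid, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.grid = grid
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
        self.q = defaultdict(float)  # (state, action) -> estimated return

    def learn(self, episodes=1000):
        for _ in range(episodes):
            state = self.grid.reset()  # hypothetical Grid API
            while not self.grid.is_terminal(state):
                actions = self.grid.actions(state)
                if random.random() < self.epsilon:  # explore
                    action = random.choice(actions)
                else:  # exploit current estimates
                    action = max(actions, key=lambda a: self.q[(state, a)])
                next_state, reward = self.grid.step(state, action)
                best_next = max(
                    (self.q[(next_state, a)] for a in self.grid.actions(next_state)),
                    default=0.0)
                # Standard Q-learning target: r + gamma * max_a' Q(s', a')
                self.q[(state, action)] += self.alpha * (
                    reward + self.gamma * best_next - self.q[(state, action)])
                state = next_state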
Example no. 4
def create_q_learning_solver(labyrinth):
    # Set up the Q-learning R and Q matrices
    q_learning_config_dict = {
        'labyrinth': labyrinth,
        'learning_rate': constant.STD_LEARNING_RATE,
        'exploration_chance': constant.STD_EXPLORATION_CHANCE
    }
    q_learning_solver = QLearning(q_learning_config_dict)
    return q_learning_solver
Example no. 5
def run_games(game_length, left_arm_mean, left_arm_std, n_players, right_arm_mean, right_arm_std, use_asrn, learning_rate = 0.01, gamma=0.95, epsilon=1.0, epsilon_decay=0.99):
    all_rewards = []
    all_goods = []
    all_losses = []
    all_q_tables = []
    trained_agent_q_values = [left_arm_mean / (1 - gamma), right_arm_mean / (1 - gamma)]
 
    for j in range(n_players):
        two_armed_bandit = BrokenArmedBandit(left_arm_mean=left_arm_mean, right_arm_mean=right_arm_mean, left_arm_std=left_arm_std, right_arm_std=right_arm_std)

        # give the real means as initialization (!)
        left_initial_mean = trained_agent_q_values[0]
        right_initial_mean = trained_agent_q_values[1]

        q_learning = QLearning(left_initial_mean, right_initial_mean, learning_rate, gamma, epsilon, epsilon_decay)

        rewards = np.zeros((game_length, 1))
        goods = np.zeros((game_length, 1))
        losses = np.zeros((game_length, 1))
        q_table = []

        if use_asrn:
            asrn = BinsASRN(0, learning_period=game_length/10)
        for i in range(game_length):
            right, reward_estimation = q_learning.choose()
            good = q_learning.right_mean > q_learning.left_mean
            goods[i] = good

            q_table.append([q_learning.right_mean, q_learning.left_mean])
            
            reward = two_armed_bandit.pull(right)
            rewards[i] = reward

            if use_asrn:
                if right:
                    updated_right_mean = (1 - q_learning.learning_rate) * q_learning.right_mean + q_learning.learning_rate * (reward + q_learning.gamma * q_learning.right_mean)
                    reward = asrn.noise(q_learning.right_mean, updated_right_mean, reward)
                else:
                    updated_left_mean = (1 - q_learning.learning_rate) * q_learning.left_mean + q_learning.learning_rate * (reward + q_learning.gamma * q_learning.left_mean)
                    reward = asrn.noise(q_learning.left_mean, updated_left_mean, reward)

            loss = q_learning.update(right, reward)
            losses[i] = loss

        all_rewards.append(rewards)
        all_goods.append(goods)
        all_losses.append(losses)
        all_q_tables.append(q_table)

    return all_q_tables, all_rewards, all_goods, np.asarray(all_losses)
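The ASRN branch above recomputes the same target that q_learning.update() presumably applies, (1 - lr) * Q + lr * (reward + gamma * Q), so the reward can be perturbed before the real update happens. Under that assumption, update() would look roughly like the sketch below; the attribute names mirror the ones used above, and the squared-error value returned as loss is a guess based on how losses is collected.

class BanditQLearning:
    """Illustrative stand-in for the two-armed-bandit QLearning agent above."""

    def __init__(self, left_mean, right_mean, learning_rate, gamma, epsilon, epsilon_decay):
        self.left_mean, self.right_mean = left_mean, right_mean
        self.learning_rate, self.gamma = learning_rate, gamma
        self.epsilon, self.epsilon_decay = epsilon, epsilon_decay

    def update(self, right, reward):
        # Same update the ASRN branch recomputes: (1 - lr) * Q + lr * (reward + gamma * Q)
        if right:
            target = reward + self.gamma * self.right_mean
            loss = (target - self.right_mean) ** 2
            self.right_mean = ((1 - self.learning_rate) * self.right_mean
                               + self.learning_rate * target)
        else:
            target = reward + self.gamma * self.left_mean
            loss = (target - self.left_mean) ** 2
            self.left_mean = ((1 - self.learning_rate) * self.left_mean
                              + self.learning_rate * target)
        self.epsilon *= self.epsilon_decay  # decay exploration after each update
        return loss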
Example no. 6
def q_learning_greedy_probability_policy():

    env = gym.make('FrozenLake-v0')
    q_learning = QLearning(env.action_space.n,
                           env.observation_space.n,
                           epsilon=0.1,
                           learning_rate=0.1)
    q_learning.set_general_state_action_values([0.5, 1, 0.5, 0.5])
    episode_rewards = []
    all_over_reward = 0.0
    for i_episode in range(7000):

        # To start a new episode we have to reset the environment and stats
        observation = env.reset()
        accumulated_reward = 0.0

        for t in range(100):

            # Show current state
            # env.render()

            # Choose action based on current experience
            action = q_learning.greedy_probability_policy(observation)

            # Save previous state, and commit action, resulting new current state
            previous_observation = observation
            observation, reward, done, info = env.step(action)

            # Accumulate more reward
            accumulated_reward += reward

            # Train algorithm based on new experience
            q_learning.update_state_action_function(previous_observation,
                                                    action, reward,
                                                    observation)

            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                print("Total reward for episode %i: %i" %
                      (i_episode, accumulated_reward))
                all_over_reward += accumulated_reward
                episode_rewards.append(accumulated_reward)
                break

    plot = Plot()
    plot.plot_evolution(episode_rewards)
    print(q_learning.q_table)
    q_learning.write_q_function_dump()
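This example and Example no. 9 call greedy_probability_policy() and greedy_policy() on the tabular agent without showing their bodies. A plausible reading, given the epsilon=0.1 constructor argument, is epsilon-greedy selection versus a plain argmax over the Q-table row; the free functions below sketch that assumption (q_table is taken to be an (n_states, n_actions) numpy array) and are not the project's implementation.

import numpy as np


def greedy_policy(q_table, state):
    """Pure exploitation: the action with the highest Q-value in this state."""
    return int(np.argmax(q_table[state]))


def greedy_probability_policy(q_table, state, epsilon=0.1):
    """Epsilon-greedy: explore with probability epsilon, otherwise exploit."""
    if np.random.random() < epsilon:
        return np.random.randint(q_table.shape[1])  # uniform random action
    return int(np.argmax(q_table[state]))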
Example no. 7
    def __init__(self, env, gamma, alpha):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        # simple route planner to get next_waypoint
        self.planner = RoutePlanner(self.env, self)
        # Initialize additional variables and QLearning here
        self.actions = ['forward', 'left', 'right', None]
        self.states = []
        self.q_learn = QLearning(self.actions,
                                 states=self.states,
                                 gamma=gamma,
                                 alpha=alpha)
        self.p_action = None
        self.p_reward = None
Example no. 8
def main():
    env = Env()
    q_learning_agent = QLearning()
    if len(load_table_name) > 20:
        print(f'loading table {load_table_name}')
        q_learning_agent.load_table(load_table_name)
    print(env.check_car_pos())
    assert env.check_car_pos() == (269, 116)
    updater = QLearningUpdater(q_learning_agent)
    epsilon_greedy = EpsilonGreedyAgent(q_learning_agent)
    if learn:
        rewards = train_agent(env, epsilon_greedy, q_learning_agent, updater)
        np.save('q_l_tables/rewards.npy', rewards)
    else:
        print('exploiting')
        play_agent(env, q_learning_agent)
Example no. 9
def greedy_policy():

    env = gym.make('FrozenLake-v0')
    q_learning = QLearning(env.action_space.n, env.observation_space.n)
    q_learning.set_general_state_action_values([0.5, 1, 0.5, 0.5])

    for i_episode in range(20):
        observation = env.reset()
        for t in range(100):
            env.render()
            print(observation)
            action = q_learning.greedy_policy(observation)
            observation, reward, done, info = env.step(action)
            if done:
                print "Episode finished after {} timesteps".format(t + 1)
                break
Example no. 10
def run_tests():
    """ Runs all tests defined in task2_tests.yaml.

        Returns:
            results: Results dataframe.

        """
    with open(FILENAME) as file:
        # Loads testing parameters from the yaml file.
        tests = yaml.safe_load(file)

    # create a dataframe to keep the results
    test_dict = tests['Tests']
    results = pd.DataFrame(test_dict)
    results['Last Average Score'] = ""
    results['No of Q-Learning episodes'] = ""

    # run experiments:
    for i, test in enumerate(test_dict):
        grid = Rooms(test["env_size"], testing=True)
        learning = QLearning(grid, test["gamma"], test["alpha"],
                             test["agent_start_pos"])
        e_greedy = Policy("e-greedy", test["epsilon"], test["decay"])
        greedy = Policy(policy_type="greedy")
        experiment = Experiments(grid, learning, greedy, test["iters"],
                                 test["agent_start_pos"], test["test_no"])

        for session in range(test["iters"]):
            learning.run_multiple_episodes(test["batch_episodes"], e_greedy)
            mean_reward = experiment.run_experiments(test["exp_per_batch"])

        results.loc[i, 'Last Average Score'] = mean_reward
        results.loc[i, 'No of Q-Learning episodes'] = \
            (session + 1) * test["batch_episodes"]

        # save results to csv file
        filename = 'results/test_table.csv'
        results.to_csv(filename)

        # plot & save graphs
        experiment.generate_results(test["test_no"], test)

    return results
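The keys read from each test entry (env_size, gamma, alpha, agent_start_pos, epsilon, decay, iters, test_no, batch_episodes, exp_per_batch) imply the structure that yaml.safe_load must return for task2_tests.yaml. The dictionary below shows one hypothetical entry as the loader would produce it; every value is a placeholder, not taken from the source.

# Hypothetical structure of task2_tests.yaml after yaml.safe_load; values are placeholders.
tests = {
    'Tests': [
        {
            'test_no': 1,               # label used for result files and plots
            'env_size': 5,              # passed to Rooms(...)
            'gamma': 0.9,               # discount factor for QLearning
            'alpha': 0.1,               # learning rate for QLearning
            'agent_start_pos': [0, 0],  # start position for learning and experiments
            'epsilon': 0.1,             # e-greedy exploration rate
            'decay': 0.99,              # epsilon decay used by Policy
            'iters': 20,                # number of training/evaluation sessions
            'batch_episodes': 100,      # Q-learning episodes per session
            'exp_per_batch': 10,        # evaluation runs per session
        },
    ]
}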
Example no. 11
def main(algorithm, track, x_start, y_start, discount, learning_rate, threshold, max_iterations, epsilon=None, reset_on_crash=False):
    """
    Program entry point. Runs the selected algorithm on the selected track, at the given coordinates, with the given parameters.
    :param algorithm: String
    :param track: List
    :param x_start: Int
    :param y_start: Int
    :param discount: Float
    :param learning_rate: Float
    :param threshold: Float
    :param max_iterations: Int
    :param epsilon: Float
    :param reset_on_crash: Boolean
    :return: None
    """
    with open(track) as f:
        specs = f.readline().strip().split(',')
        rows = int(specs[0])
        cols = int(specs[1])
        layout = f.read().splitlines()

        initial_state = (x_start, y_start, 0, 0)
        initial_action = (0, 0)

        agent = Car(initial_action, epsilon)
        environment = RaceTrack(rows, cols, layout, initial_state, reset_on_crash=reset_on_crash)

        if algorithm == 'value_iteration':
            value_iterator = ValueIteration(discount, threshold, max_iterations, environment, agent)
            value_iterator.run()
            path = value_iterator.extract_policy(initial_state)
            value_iterator.plot_max_diffs()
        elif algorithm == 'q_learning':
            q_learner = QLearning(discount, learning_rate, threshold, max_iterations, environment, agent)
            path = q_learner.run()
            q_learner.plot_avg_cost()
        elif algorithm == 'sarsa':
            sarsa = Sarsa(discount, learning_rate, threshold, max_iterations, environment, agent)
            path = sarsa.run()
            sarsa.plot_avg_cost()
        else:
            print("No algorithm selected")
            return None
        draw_track(path, layout)
Example no. 12
    def __init__(self):
        # Publish Q-matrix updates to Q-matrix topic
        self.q_matrix_pub = rospy.Publisher("/RobotFinalProject/QMatrix",
                                            QMatrix,
                                            queue_size=10)

        # Publish to /robot_action for training the Q-matrices
        self.q_learning = QLearning()
        self.robot_action_pub = rospy.Publisher(
            "/RobotFinalProject/robot_action", RobotTasksDoors, queue_size=10)

        # Subscribe to environment to receive reward updates
        self.reward = rospy.Subscriber("/RobotFinalProject/reward",
                                       QLearningReward, self.update_q_matrix)
        rospy.sleep(2)

        # Initialize Q-matrices, one for each robot
        self.q_matrix_msg_a = QMatrix()
        self.q_matrix_msg_b = QMatrix()

        # For ease of use in training, we'll store the Q-matrices in numpy arrays
        self.q_matrix_arr_a = np.zeros((128, 7))
        self.q_matrix_arr_b = np.zeros((128, 7))

        # Keep track of the 5 most recently updated Q-values per Q-matrix
        self.q_history_a = []
        self.q_history_b = []

        # Keeps track of current (starting) state, initialized to 0th state. This state accounts for both robots
        self.current_state = 0

        # Get new state and action from starting state
        new_state, action = self.get_random_action(self.current_state)

        # Current action is action from above (action from current state to new state)
        self.current_action = action

        # Keep track of new state
        self.new_state = new_state

        # Move the robot according to the current action
        self.complete_task_or_door(self.current_action)
Example no. 13
    def __init__(self, simulation_run, seed: int):
        logger.debug("Start initializing this: %s", self.__class__)
        # self.SimulationManager = simulation_run.SimulationManager
        sim_manager = simulation_run.SimulationManager
        self.market_size = sim_manager.marketSize
        self.competition_type = sim_manager.competition_type
        self.examination_interval_size = sim_manager.sizeOfExaminationInterval

        self.price = 0.0
        self.quantity = 0.0
        self.profit = 0.0
        self.qlearning = QLearning(seed, sim_manager)                # holds the firm's individual Q-learning algorithm
        self.get_other_firm = simulation_run.getOtherFirm
        # ArrayLists to store its parameters over the time
        self.prices = deque(maxlen=self.examination_interval_size)
        self.quantities = deque(maxlen=self.examination_interval_size)
        self.profits = deque(maxlen=self.examination_interval_size)
        self.cournot = Cournot()

        logger.debug("Init completed: %s", self.__class__)
Example no. 14
    def __init__(self):
        # Publish Q-matrix updates to Q-matrix topic
        self.q_matrix_pub = rospy.Publisher("/q_learning/QMatrix",
                                            QMatrix,
                                            queue_size=10)

        # Publish to /robot_action for phantom robot movement
        self.q_learning = QLearning()
        self.action_pub = rospy.Publisher("/q_learning/robot_action",
                                          RobotMoveDBToBlock,
                                          queue_size=10)

        # Subscribe to environment to receive reward updates
        self.reward = rospy.Subscriber("/q_learning/reward", QLearningReward,
                                       self.update_q_matrix)
        rospy.sleep(2)

        # Initialize Q-matrix
        self.q_matrix_msg = QMatrix()

        # For ease of use in training, we'll store the Q-matrix in a numpy array
        self.q_matrix_arr = np.zeros((64, 9))

        # Keep track of the 5 most recently updated Q-values
        self.q_history = []

        # Keeps track of current (starting) state, initialized to 0th state
        self.current_state = 0

        # Get new state and action from starting state
        new_state, action = self.get_random_action(self.current_state)

        # Current action is action from above (action from current state to new state)
        self.current_action = action

        # Keep track of new state
        self.new_state = new_state

        # Move the robot according to the current action
        # update_q_matrix() will be called as the callback
        self.move_DB(self.current_action)
Example no. 15
    def __init__(self, total_episodes: int):
        self.window_width = constant.WIDTH * constant.TILE
        self.window_height = constant.HEIGHT * constant.TILE

        self._running = True
        self._display = None
        self._snake = None
        self._mouse = None

        self.episode = 1
        self.total_episodes = total_episodes
        self.score = 0
        self.max_score = 0
        self.frames = 0
        self.game_stats = []
        self.specs = []
        self.test_run = False

        self.snake = Snake()
        self.mouse = Mouse(constant.WIDTH, constant.HEIGHT,
                           self.snake.body_position())
        self.q = QLearning()
Example no. 16
def run_q_learning_forest(S, r1, r2):
    forest = WrappedForest(S, r1, r2)
    n_episodes = 10000
    how_often = n_episodes / 100

    stats = IterationStats('stats/ql_forest.csv', dims=2)

    def on_episode(episode, time, q_learner, q):
        forest.print_policy(print, q_learner.get_policy())
        stats.save_iteration(episode, time,
                             numpy.nanmean(numpy.nanmax(q, axis=0)), q)

    def is_done(state, action, next_state):
        return next_state.state_num == 0

    gamma = 0.99
    start = time.time()
    numpy.random.seed(5263228)
    q_l = QLearning(forest,
                    0.5,
                    0.2,
                    gamma,
                    on_episode=on_episode,
                    start_at_0=True,
                    alpha=0.1,
                    is_done=is_done,
                    every_n_episode=how_often)
    stats.start_writing()
    q_l.run(n_episodes)
    stats.done_writing()
    forest.print_policy(print, q_l.get_policy())
    print('took {} s'.format(time.time() - start))

    stats = IterationStats('stats/ql_forest.csv', dims=2)
    analysis.create_iteration_value_graph(
        stats, 'average Q',
        'Average Q for each iteration on Forest Q Learning', 'forest_results')
Example no. 17
for ob in enumerate(env_ranges):
    print(ob[0], ob[1])

num_bins = [10, 3]
num_pos_actions = len(actions)
print('num_pos_actions', num_pos_actions)

# hyperparams:
discount = 1.0
episodes = 100

epsilon = [0.5, 1.0,
           episodes // 2]  # Epsilon start, start decay index, stop decay index
lr = [0.5, 1.0, episodes // 2
      ]  # Learning rate start, start decay index, stop decay index

q_learning = QLearning(env, num_bins, num_pos_actions, env_ranges, discount,
                       episodes, epsilon, lr)

print('q-table shape', q_learning.q_table.shape)

obs = q_learning.reset_state()  # Reset the environment and get the initial observation
obs = [obs[i] for i in obs_to_use]
print('\nInitial observation:', obs)

action_to_maximise_q = q_learning.action_to_maximise_q(
    obs)  # Find optimal action
action = q_learning.decide_on_action(
    action_to_maximise_q)  # Decide whether to use optimal or random action
observation, reward_current, done = q_learning.perform_sim_step(
    action)  # env.step(action)  # Perform the first action

Example no. 18
NUM_TO_SHOW = 5
TRAINING_ON = True
EPISODES = 1000
Q_MODEL_PATH = "outputs/keras-models/2048_q_model.h5"
Q_MODEL_WEIGHTS_PATH = "outputs/keras-models/2048_q_model_weights.h5"
T_MODEL_PATH = "outputs/keras-models/2048_t_model.h5"

board_size = int(math.sqrt(env.observation_space.shape[0]))
n_output = env.action_space.n

# In[3]:

QL = QLearning(n_x=board_size,
               n_y=n_output,
               q_save_path=Q_MODEL_PATH,
               q_weights_save_path=Q_MODEL_WEIGHTS_PATH,
               t_save_path=T_MODEL_PATH,
               total_episodes=EPISODES,
               restore_model=True,
               is_training_on=TRAINING_ON,
               T=10)

# In[ ]:

for episode in range(EPISODES):
    observation = env.reset()
    QL.curr_episode = episode

    while True:
        if RENDER_ENV: env.render()

        valid_move = False
Example no. 19
def main(argv):
    # Get command line arguments
    try:
        opts, args = getopt.getopt(argv, "hpi:")
    except getopt.GetoptError:
        print("main.py [-p] [-i=n_iter]")
        sys.exit(1)
        
    # Plot switch
    plot = False
    
    # Default iterations
    n_iter = 100000
    
    # Parse command line arguments
    for opt, arg in opts:
        # Help
        if opt == "-h":
            print("main.py [-p] [-i=n_iter]")
            sys.exit()
        
        # Plot
        elif opt == "-p":
            plot = True
        
        # Number of iterations
        elif opt == "-i":
            n_iter = arg
            
    # Construct grid
    grid = construct_grid()
    
    # Find solution with value iteration
    print("Performing value iteration...")
    vi = ValueIteration(grid)
    vi = vi.solve()
    print("----------")
    print("The value for each cell is:")
    print("(x,y): value")
    for cell in grid:
        print(cell.name_x()+","+cell.name_y()+":     "+\
              str(vi[0][cell.get_name()]))
    print("----------")
    print("The policy found by value iteration is:")
    print("(x,y): action")
    for cell in grid:
        print(cell.name_x()+","+cell.name_y()+":     "+\
              str(vi[1][cell.get_name()]))
              
    # Find solution with Q-learning
    print("\n")
    print("Performing Q-learning...")
    ql = QLearning(grid, N_iter = int(n_iter))
    ql = ql.solve()
    ql_states = ql[0]
    ql_Q = ql[1]
    print("----------")
    
    print("The policy found by Q-learning is.")
    print("(x,y): action")
    
    actions = ["north","east","south","west"]
    for cell in grid:
        ind = ql_states.index(cell.get_name())
        action = actions[np.argmax(ql_Q[ind,])]
        print(cell.name_x()+","+cell.name_y()+":     "+str(action))
        
    if plot:
        # Convergence graph for q-learning: 
        # best Q-value of the best action for the START state 
        fig, ax = plt.subplots()
        ax.plot(ql[2][0:ql[3]], label = "Q-values for best action in START")
        ax.plot((0,ql[3]),(vi[0]["11"],vi[0]["11"]), label = "Values of START")
        plt.xlabel("Iterations")
        
    plt.show()
Example no. 20
from q_learning import QLearning
from SARSA import SARSALearning
from eligibility_traces import EligibilityTraces
from function_approximation import FApprox
from mountain_cart import run_methods, self_iterate
import pickle

if __name__ == "__main__":
    # Initialize a method
    methods = [
        QLearning("MountainCar-v0", print_progress=False),
        SARSALearning("MountainCar-v0", print_progress=False),
        FApprox("MountainCar-v0", print_progress=False),
        EligibilityTraces("MountainCar-v0", print_progress=False)
    ]

    # Run the tests
    run_methods(methods)

    method = methods[0]
    method.q_table = pickle.load(
        open("Best_Method_" + str(type(method).__name__) + ".p", "rb"))
    method.evaluate()
    method.display()

    self_iterate(methods[0])
Example no. 21
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 26 00:02:49 2018

@author: Shiratori
"""

import graphics as gfx
import pong_model as pm
from q_learning import QLearning

if __name__ == '__main__':
    # Set up environment
    environment = pm.PongModel(0.5, 0.5, -0.03, 0.01, 0.4, paddleX=0)
    window = gfx.GFX(wall_x=1, player_x=0)
    window.fps = 9e16
    
    # Set up model
    model = QLearning(environment, window, C=5e3, gamma=0.99, explore=-1, 
                      threshold=-1, log=True, log_file='q_test_log_(agent_B).txt', 
                      mode='test', q_table_file='q_q_table_(agent_B).csv')
    
    # Training
    model.train()
Example no. 22
            self.root_frame.q_learning_simulator = None
            pygame.display.quit()

    def draw_robot(self, i, j):
        top_left = (self.state_width * j, self.state_height * i)
        self.robot_rect = [
            top_left[0], top_left[1], self.state_width, self.state_height
        ]
        self.screen.blit(self.robot, self.robot_rect)

    def draw_world(self):
        for i in range(len(self.world)):
            for j in range(len(self.world[i])):
                self.draw_rectangle(i, j, self.world[i][j])

    def draw_rectangle(self, i, j, item):
        top_left = (self.state_width * j, self.state_height * i)
        color = self.item_colors[item]
        thickness = self.item_thickness[item]
        pygame.draw.rect(
            self.screen, color,
            (top_left[0], top_left[1], self.state_width, self.state_height),
            thickness)


if __name__ == '__main__':
    file_name = "./board.txt"
    q_learning = QLearning("./board.txt", is_stochastic=True)

    QLearningSimulator(q_learning, delay_time=100)
Example no. 23
def run():
    """execute the TraCI control loop"""
    step = 0

    # initialize QLearning
    num_phase = 2
    max_num_car_stopped = 10
    num_lane = 4
    num_wait_time_category = 10
    num_action = 10
    q = QLearning(num_phase, max_num_car_stopped, num_lane, num_action)

    # we start with phase 2 where EW has green
    #traci.trafficlight.setPhase("0", 2)
    while traci.simulation.getMinExpectedNumber() > 0:
        traci.simulationStep()

        #next_action_idx = 9

        # Current phase of the traffic light
        light_phase = traci.trafficlight.getPhase("0")

        # If the current phase is yellow and the next action has not been decided yet, decide the duration of the next phase
        if (light_phase == 1
                or light_phase == 3) and not q.is_calculate_next_action:

            q.is_set_duration = False

            # Get the phase the light will take next
            next_light_phase = 0
            if light_phase == 1:
                next_light_phase = 2

            # Number of cars stopped in each lane
            count_0 = min(traci.lanearea.getLastStepHaltingNumber("0"), 9)
            count_1 = min(traci.lanearea.getLastStepHaltingNumber("1"), 9)
            count_2 = min(traci.lanearea.getLastStepHaltingNumber("2"), 9)
            count_3 = min(traci.lanearea.getLastStepHaltingNumber("3"), 9)

            # The next light phase and the current congestion level
            current_state_dict = {
                'light_phase': next_light_phase,
                'nums_car_stopped': [count_0, count_1, count_2, count_3]
            }

            current_digitized_state = q.digitize_state(current_state_dict)
            q.next_action_idx = q.get_action(current_digitized_state)
            q.is_calculate_next_action = True

            # reward
            reward = -np.sum(
                [x**1.5 for x in [count_0, count_1, count_2, count_3]])
            q.rewards.append(reward)

            # At the end of each green/red phase, compute the reward for the action taken in the previous situation; so at this point, update the Q-table for the previous state and action(?)

            q.update_Qtable(q.previous_digitized_state, q.previous_action_idx,
                            reward, current_digitized_state)

            q.previous_digitized_state = current_digitized_state
            q.previous_action_idx = q.next_action_idx

        # If the current phase is 0 or 2 and the duration has not been set yet, set the duration
        if (light_phase == 0 or light_phase == 2) and not q.is_set_duration:
            traci.trafficlight.setPhaseDuration("0",
                                                q.action[q.next_action_idx])
            q.is_set_duration = True
            q.is_calculate_next_action = False
            print("set phase {} for {} seconds".format(
                light_phase, q.action[q.next_action_idx]))

        step += 1
        if step % 10000 == 0:
            plot_graph(q.rewards)

    traci.close()
    sys.stdout.flush()
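The loop above relies on q.digitize_state() to fold the next light phase and the four clamped halt counts (each 0-9) into a single Q-table index, but that method is not shown. A mixed-radix encoding is the usual way to do this; the function below is a sketch of that assumption, not the project's code.

def digitize_state(state_dict, base=10):
    """Assumed mixed-radix encoding of the traffic state into one table index."""
    index = state_dict['light_phase']
    for count in state_dict['nums_car_stopped']:  # each count is already clamped to 0-9
        index = index * base + count
    return index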
Example no. 24
            state_, reward, done = env.step(action)

            step_count += 1  # increment the step count

            # The agent's brain learns from this transition (state, action, reward, state_)
            RL.learn(str(state), action, reward, str(state_))

            # The agent moves to the next state
            state = state_

            # If it steps on a bomb or finds the treasure, the episode ends
            if done:
                print("Episode {} finished. Total steps: {}\n".format(episode + 1, step_count))
                break

    # End the game and close the window
    print('Game over')
    env.destroy()


if __name__ == "__main__":
    # Create the environment env and the RL agent
    env = Maze()
    RL = QLearning(actions=list(range(env.n_actions)))

    # Start visualizing the environment
    env.after(100, update)
    env.mainloop()

    print('\nQ-table:')
    print(RL.q_table)
Example no. 25
from q_learning import QLearning
from SARSA import SARSALearning
from ElgibilityTraces import ElgibilityTraces
from function_approximation import FApprox
import matplotlib.pyplot as plt
import numpy as np

if __name__ == "__main__":
    # Initialize a method
    methods = [
        QLearning("MountainCar-v0"),
        SARSALearning("MountainCar-v0"),
        FApprox("MountainCar-v0"),
        ElgibilityTraces("MountainCar-v0")
    ]
    method = methods[3]
    method.train(1000)

    # Test the method
    method.evaluate()
    method.plot()

    #for method in methods:
    #    graph = np.zeros(method.env._max_episode_steps / 10)
    ##    runs = 30
    #   for run in range(1, runs + 1):
    #       method.reset_model()
    #        method.train(1000)

    #       graph += method.convergence_graph / runs
Example no. 26
param_default_reward = 0
param_agent_loc = (0, 0)
param_alpha = 0.1
param_gamma = 0.99
param_epsilon = 0.9
hyper_n_episodes = 100

env = Env((param_grid_size, param_grid_size), param_agent_loc,
          (param_block_size, param_block_size), param_default_reward)
env.set_block((6, 6), param_default_reward, 100, True)
env.set_block((6, 5), param_default_reward, -100, True)
#env.set_block((6, 8), 100, 0, False)
#env.set_block((8, 8), 10,  0, False)
#env.set_block((6, 7), 10,  0, False)

q = QLearning(param_alpha, param_gamma, param_epsilon, Env.N_ACTIONS)
for episode in range(hyper_n_episodes):
    env.reset()
    is_terminal = False
    state = 0
    action = q.episode_start(state)
    while not is_terminal:
        state, action, reward, state_next, is_terminal = env.step(action)
        print("agent from state {} --> state {}, take action {}".format(
            state, state_next, action))
        if is_terminal:
            print("episode {}: agent reached terminal".format(episode))
        action = q.step(state, action, reward, state_next)

        # show current state values
        action_values = q.action_values(state)
Example no. 27
env_ranges = [(-0.5, 0.5), (0, 1), (-0.5, 0.5), (-0.5, 0.5), (-0.4, 0.4),
              (-0.4, 0.4), (-1, 1), (-1, 1)]

print('\nObservation ranges:')
for ob in enumerate(env_ranges):
    print(ob[0], ob[1])

num_bins = [3, 20, 3, 6, 6, 6, 3, 3]
num_pos_actions = len(actions)

q_learning = QLearning(env=env,
                       num_bins=num_bins,
                       num_pos_actions=num_pos_actions,
                       env_ranges=env_ranges,
                       discount=0,
                       episodes=0,
                       epsilon=None,
                       lr=None,
                       USE=True)

env = gym.make('LunarLander-v2')
q_learning.q_table = np.load('./data_lunarlander/0_9000.npy')

for _ in range(10):

    obs = q_learning.reset_state()  # Reset the environment and get the initial observation

    done = False
    while not done:
Example no. 28
        time.sleep(1)


class GraphHolder:
    def __init__(self):
        self.graph = np.zeros(50)
        self.number = 100


threadList = []
nameList = []
graph = GraphHolder()

for i in range(graph.number):
    threadList.append("Thread-" + str(i))
    nameList.append(QLearning("MountainCar-v0"))

queueLock = threading.Lock()
workQueue = queue.Queue()
threads = []
threadID = 1

# Create new threads
for tName in threadList:
    thread = myThread(threadID, tName, workQueue, graph)
    thread.start()
    threads.append(thread)
    threadID += 1

# Fill the queue
queueLock.acquire()
Example no. 29
def run_q_learning_grid_world():
    world = GridWorld('simple_grid.txt', -0.01, include_treasure=False)
    n_episodes = 500000
    how_often = n_episodes / 500

    stats = IterationStats('stats/ql_simple_grid.csv', dims=5)

    def on_update(state, action, next_state, q_learner):
        #print('[{},{}] - {} -> [{},{}]'.format(state.x, state.y, action[0], next_state.x, next_state.y))
        pass

    def on_episode(episode, time, q_learner, q):
        world.print_policy(print, q_learner.get_policy())
        stats.save_iteration(episode, time,
                             numpy.nanmean(numpy.nanmax(q, axis=0)), q)
        #time.sleep(1)

    for state in world.get_states():
        if state.tile_type == GridWorldTile.GOAL:
            goal_state = state
            break

    def initialize_toward_goal(state: GridWorldTile):
        actions = state.get_actions()
        if len(actions) == 0:
            return []
        diff_x = goal_state.x - state.x
        diff_y = goal_state.y - state.y
        best_value = 0.1
        if len(actions) == 5 and actions[4][0].startswith('get treasure'):
            best_action = actions[4][0]
        elif abs(diff_x) >= abs(diff_y):
            if diff_x > 0:
                best_action = 'move east'
            else:
                best_action = 'move west'
        else:
            if diff_y < 0:
                best_action = 'move north'
            else:
                best_action = 'move south'
        values = [-0.1] * len(actions)
        for i, action in enumerate(actions):
            if action[0] == best_action:
                values[i] = best_value
        return values

    gamma = 0.99
    q_l = QLearning(world,
                    0.5,
                    0.05,
                    gamma,
                    on_update=on_update,
                    on_episode=on_episode,
                    initializer=initialize_toward_goal,
                    start_at_0=True,
                    alpha=0.1,
                    every_n_episode=how_often)
    stats.start_writing()
    q_l.run(n_episodes)
    stats.done_writing()
    world.print_policy(print, q_l.get_policy())
Example no. 30
import gym
import numpy as np
import torch
import sys
from matplotlib import pyplot as plt

sys.path.insert(0, "../../agents")

from q_learning import QLearning  # pylint: disable=import-error

EPISODES = 5000

if __name__ == "__main__":
    env = gym.make("FrozenLake-v0")
    agent = QLearning(env, epsilon=0.8, gamma=0.5, lr=0.01)

    episode_rew = []
    for episode in range(EPISODES):
        # Deciding first action
        action = env.action_space.sample()
        state = env.reset()
        ep_rew = 0
        while True:
            next_state, reward, done, _ = env.step(action)
            # env.render()
            ep_rew += reward

            agent.update((state, action, reward, next_state))
            state = next_state
            action = agent.get_action(state)