Example No. 1
def main():

    # initialization of the grid
    grid = Grid()

    # creating markov decision and q learning classes for using algorithms
    mdp = MarkovDecision()
    ql = QLearning()

    print("\n")
    print("Value iteration results: \n")
    # parameter order: p, reward, discount
    utilities, policies = mdp.iterateValues(grid, 1, 0, 1)
    grid.printPolicies()

    grid.resetGrid()

    print("\n")
    print("Policy iteration results: \n")
    # parameter order: p, reward, discount
    mdp.iteratePolicies(grid, 1, 0, 1)
    grid.printPolicies()

    grid.resetGrid()

    print("\n")
    print("Q Learning results: \n")
    # parameter order: grid, p, reward, learning_rate, discount, epsilon, iteration, value_iteration results
    ql.learnQ(grid, 1, 0, 0.1, 1, 0, 1000, utilities, policies)
    grid.printQValues()
    grid.printQLearningResults()
Example No. 2
    def __init__(self):

        # rospy.init_node('robot_action')

        # Initialize publishers and subscribers
        rospy.on_shutdown(self.shutdown)
        self.bridge = cv_bridge.CvBridge()
        self.action_sub = rospy.Subscriber('/q_learning/robot_action',
                                           RobotMoveDBToBlock,
                                           self.action_received)
        self.camera_sub = rospy.Subscriber('/camera/rgb/image_raw', Image,
                                           self.camera_received)
        self.scan_sub = rospy.Subscriber('/scan', LaserScan,
                                         self.scan_received)
        self.vel_pub = rospy.Publisher('cmd_vel', Twist, queue_size=1)
        self.action_pub = rospy.Publisher('/q_learning/robot_action',
                                          RobotMoveDBToBlock,
                                          queue_size=3)

        # Keras pipeline
        self.pipeline = keras_ocr.pipeline.Pipeline()

        # Variables for the scan and image results
        self.current_scan = []
        self.current_img = []

        # Import the Q-Learning class functions
        self.q_learning = QLearning()

        rospy.sleep(1)

        # Set color ranges for red, green, and blue
        self.color_info = {  #HSV upper and lower bounds for each color (need to test these ranges work)
            'red': {
                'lower': np.array([0, 250, 150]),
                'upper': np.array([10, 255, 180])
            },
            'blue': {
                'lower': np.array([110, 250, 150]),
                'upper': np.array([130, 255, 190])
            },
            'green': {
                'lower': np.array([45, 250, 150]),
                'upper': np.array([75, 255, 190])
            }
        }

        # Initialize robot arm components
        self.move_group_arm = moveit_commander.MoveGroupCommander("arm")
        self.move_group_gripper = moveit_commander.MoveGroupCommander(
            "gripper")

        # Set initial arm position
        self.move_group_arm.go([0.0, 0.9, -0.3, -0.79], wait=True)
        self.move_group_gripper.go([0.009, 0.009], wait=True)

        # Get the action sequence to maximize reward
        self.extract_action()
Example No. 3
def main(argv):
    setpath()
    from q_learning import QLearning
    from grid_world import Grid

    # 5 rows, 4 cols, unreachables : [(1,1), (1,3)], pits : [(3,1)], goal : (4,3)
    g = Grid(5, 4, [(1, 1), (1, 3)], [(3, 1)], (4, 3))
    q = QLearning(g)
    q.learn()
Example No. 4
 def assimilar(self, s, a, r, sn):
     # assimilate the new experience, remember its reward and transition, then replay N random remembered transitions (Dyna-style)
     QLearning.assimilar(self, s, a, r, sn)
     self.R[(s, a)] = r
     self.T[(s, a)] = sn
     for j in range(self.N):
         (s1, a1) = choice(list(self.Q.keys()))
         r1 = self.R[(s1, a1)]
         sn1 = self.T[(s1, a1)]
         QLearning.assimilar(self, s1, a1, r1, sn1)
Example No. 5
def main(argv):
  setpath()
  from q_learning import QLearning
  from grid_world import Grid

  # 5 rows, 4 cols, unreachables : [(1,1), (1,3)], pits : [(3,1)], goal : (4,3)
  g = Grid(5, 4, [(1,1),(1,3)], [(3,1)], (4,3))
  q = QLearning(g)
  q.learn()
Example No. 6
 def test_learning(self):
     qlearning = QLearning()
     old_state = self.empty_state()
     action = ACTION_SUICIDE
     new_state = self.empty_state()
     reward = 10
     old_reward = qlearning.q.get_q(old_state, action)
     qlearning.learn(old_state, new_state, action, reward)
     self.assertGreater(qlearning.q.get_q(old_state, action), old_reward)
Example No. 7
def run_games(game_length, left_arm_mean, left_arm_std, n_players, right_arm_mean, right_arm_std, use_asrn, learning_rate = 0.01, gamma=0.95, epsilon=1.0, epsilon_decay=0.99):
    all_rewards = []
    all_goods = []
    all_losses = []
    all_q_tables = []
    trained_agent_q_values = [left_arm_mean / (1 - gamma), right_arm_mean / (1 - gamma)]
 
    for j in range(n_players):
        two_armed_bandit = BrokenArmedBandit(left_arm_mean=left_arm_mean, right_arm_mean=right_arm_mean, left_arm_std=left_arm_std, right_arm_std=right_arm_std)

        ## giving the real mean as initialization(!)
        left_initial_mean = trained_agent_q_values[0]
        right_initial_mean = trained_agent_q_values[1]

        q_learning = QLearning(left_initial_mean, right_initial_mean, learning_rate, gamma, epsilon, epsilon_decay)

        rewards = np.zeros((game_length, 1))
        goods = np.zeros((game_length, 1))
        losses = np.zeros((game_length, 1))
        q_table = []

        if use_asrn:
            asrn = BinsASRN(0, learning_period=game_length/10)
        for i in range(game_length):
            right, reward_estimation = q_learning.choose()
            good = q_learning.right_mean > q_learning.left_mean
            goods[i] = good

            q_table.append([q_learning.right_mean, q_learning.left_mean])
            
            reward = two_armed_bandit.pull(right)
            rewards[i] = reward

            if use_asrn:
                if right:
                    updated_right_mean = (1 - q_learning.learning_rate) * q_learning.right_mean + q_learning.learning_rate * (reward + q_learning.gamma * q_learning.right_mean)
                    reward = asrn.noise(q_learning.right_mean, updated_right_mean, reward)
                else:
                    updated_left_mean = (1 - q_learning.learning_rate) * q_learning.left_mean + q_learning.learning_rate * (reward + q_learning.gamma * q_learning.left_mean)
                    reward = asrn.noise(q_learning.left_mean, updated_left_mean, reward)

            loss = q_learning.update(right, reward)
            losses[i] = loss

        all_rewards.append(rewards)
        all_goods.append(goods)
        all_losses.append(losses)
        all_q_tables.append(q_table)

    return all_q_tables, all_rewards, all_goods, np.asarray(all_losses)
Example No. 8
 def __init__(self, env, gamma, alpha):
     # sets self.env = env, state = None, next_waypoint = None, and a default color
     super(LearningAgent, self).__init__(env)
     self.color = 'red'  # override color
     # simple route planner to get next_waypoint
     self.planner = RoutePlanner(self.env, self)
     # Initialize additional variables and QLearning here
     self.actions = ['forward', 'left', 'right', None]
     self.states = []
     self.q_learn = QLearning(self.actions,
                              states=self.states,
                              gamma=gamma,
                              alpha=alpha)
     self.p_action = None
     self.p_reward = None
Example No. 9
def greedy_policy():

    env = gym.make('FrozenLake-v0')
    q_learning = QLearning(env.action_space.n, env.observation_space.n)
    q_learning.set_general_state_action_values([0.5, 1, 0.5, 0.5])

    for i_episode in range(20):
        observation = env.reset()
        for t in range(100):
            env.render()
            print(observation)
            action = q_learning.greedy_policy(observation)
            observation, reward, done, info = env.step(action)
            if done:
                print "Episode finished after {} timesteps".format(t + 1)
                break
Example No. 10
def main():
    env = Env()
    q_learning_agent = QLearning()
    if len(load_table_name) > 20:
        print(f'loading table {load_table_name}')
        q_learning_agent.load_table(load_table_name)
    print(env.check_car_pos())
    assert env.check_car_pos() == (269, 116)
    updater = QLearningUpdater(q_learning_agent)
    epsilon_greedy = EpsilonGreedyAgent(q_learning_agent)
    if learn:
        rewards = train_agent(env, epsilon_greedy, q_learning_agent, updater)
        np.save('q_l_tables/rewards.npy', rewards)
    else:
        print('exploiting')
        play_agent(env, q_learning_agent)
Example No. 11
def create_q_learning_solver(labyrinth):
    # Setting up the q-learning R and Q matrixes
    q_learning_config_dict = {
    'labyrinth': labyrinth,
    'learning_rate': constant.STD_LEARNING_RATE,
    'exploration_chance': constant.STD_EXPLORATION_CHANCE
    }
    q_learning_solver = QLearning(q_learning_config_dict)
    return q_learning_solver
Example No. 12
def q_learning_greedy_probability_policy():

    env = gym.make('FrozenLake-v0')
    q_learning = QLearning(env.action_space.n,
                           env.observation_space.n,
                           epsilon=0.1,
                           learning_rate=0.1)
    q_learning.set_general_state_action_values([0.5, 1, 0.5, 0.5])
    episode_rewards = []
    all_over_reward = 0.0
    for i_episode in range(7000):

        # Starting a new episode, so we have to reset the environment and stats
        observation = env.reset()
        accumulated_reward = 0.0

        for t in range(100):

            # Show current state
            # env.render()

            # Choose action based on current experience
            action = q_learning.greedy_probability_policy(observation)

            # Save previous state, and commit action, resulting new current state
            previous_observation = observation
            observation, reward, done, info = env.step(action)

            # Accumulate more reward
            accumulated_reward += reward

            # Train algorithm based on new experience
            q_learning.update_state_action_function(previous_observation,
                                                    action, reward,
                                                    observation)

            #
            if done:
                print "Episode finished after {} timesteps".format(t + 1)
                print "Total reward for episode %i: %i" % (i_episode,
                                                           accumulated_reward)
                all_over_reward += accumulated_reward
                episode_rewards.append(accumulated_reward)
                break

    plot = Plot()
    plot.plot_evolution(episode_rewards)
    print(q_learning.q_table)
    q_learning.write_q_function_dump()
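The greedy_probability_policy used in the example above is not shown anywhere in this listing. One common way to implement a probability-weighted greedy policy is to sample actions from a softmax over the Q-values of the current state; the helper below is only a hedged sketch of that idea and is not the actual method of this QLearning class.

import numpy as np


def softmax_action(q_values, temperature=1.0):
    """Sample an action with probability proportional to exp(Q / temperature) (illustrative sketch)."""
    prefs = np.asarray(q_values, dtype=float) / temperature
    prefs -= prefs.max()          # subtract the max for numerical stability
    probs = np.exp(prefs)
    probs /= probs.sum()
    return int(np.random.choice(len(probs), p=probs))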
Example No. 13
def run_tests():
    """ Runs all tests defined in task2_tests.yaml.

        Returns:
            results: Results dataframe.

        """
    with open(FILENAME) as file:
        # Loads testing parameters from the yaml file.
        tests = yaml.safe_load(file)

    # create a dataframe to keep the results
    test_dict = tests['Tests']
    results = pd.DataFrame(test_dict)
    results['Last Average Score'] = ""
    results['No of Q-Learning episodes'] = ""

    # run experiments:
    for i, test in enumerate(test_dict):
        grid = Rooms(test["env_size"], testing=True)
        learning = QLearning(grid, test["gamma"], test["alpha"],
                             test["agent_start_pos"])
        e_greedy = Policy("e-greedy", test["epsilon"], test["decay"])
        greedy = Policy(policy_type="greedy")
        experiment = Experiments(grid, learning, greedy, test["iters"],
                                 test["agent_start_pos"], test["test_no"])

        for session in range(test["iters"]):
            learning.run_multiple_episodes(test["batch_episodes"], e_greedy)
            mean_reward = experiment.run_experiments(test["exp_per_batch"])

        results.loc[i, 'Last Average Score'] = mean_reward
        results.loc[i, 'No of Q-Learning episodes'] = \
            (session + 1) * test["batch_episodes"]

        # save results to csv file
        filename = 'results/' + 'test_table.csv'
        results.to_csv(filename)

        # plot & save graphs
        experiment.generate_results(test["test_no"], test)

    return results
Example No. 14
def main(algorithm, track, x_start, y_start, discount, learning_rate, threshold, max_iterations, epsilon=None, reset_on_crash=False):
    """
    Program entry. Runs selected algorithm on selected track, at given coordinates, with given parameters
    :param algorithm: String
    :param track: List
    :param x_start: Int
    :param y_start: Int
    :param discount: Float
    :param learning_rate: Float
    :param threshold: Float
    :param max_iterations: Int
    :param epsilon: Float
    :param reset_on_crash: Boolean
    :return: None
    """
    with open(track) as f:
        specs = f.readline().strip().split(',')
        rows = int(specs[0])
        cols = int(specs[1])
        layout = f.read().splitlines()

        initial_state = (x_start, y_start, 0, 0)
        initial_action = (0, 0)

        agent = Car(initial_action, epsilon)
        environment = RaceTrack(rows, cols, layout, initial_state, reset_on_crash=reset_on_crash)

        if algorithm == 'value_iteration':
            value_iterator = ValueIteration(discount, threshold, max_iterations, environment, agent)
            value_iterator.run()
            path = value_iterator.extract_policy(initial_state)
            value_iterator.plot_max_diffs()
        elif algorithm == 'q_learning':
            q_learner = QLearning(discount, learning_rate, threshold, max_iterations, environment, agent)
            path = q_learner.run()
            q_learner.plot_avg_cost()
        elif algorithm == 'sarsa':
            sarsa = Sarsa(discount, learning_rate, threshold, max_iterations, environment, agent)
            path = sarsa.run()
            sarsa.plot_avg_cost()
        else:
            print("No algorithm selected")
            return None
        draw_track(path, layout)
Example No. 15
    def __init__(self):
        # Publish Q-matrix updates to Q-matrix topic
        self.q_matrix_pub = rospy.Publisher("/RobotFinalProject/QMatrix",
                                            QMatrix,
                                            queue_size=10)

        # Publish to /robot_action for training the Q-matrices
        self.q_learning = QLearning()
        self.robot_action_pub = rospy.Publisher(
            "/RobotFinalProject/robot_action", RobotTasksDoors, queue_size=10)

        # Subscribe to environment to receive reward updates
        self.reward = rospy.Subscriber("/RobotFinalProject/reward",
                                       QLearningReward, self.update_q_matrix)
        rospy.sleep(2)

        # Initialize Q-matrices, one for each robot
        self.q_matrix_msg_a = QMatrix()
        self.q_matrix_msg_b = QMatrix()

        # For ease of use in training, we'll store the Q-matrices in a numpy array
        self.q_matrix_arr_a = np.zeros((128, 7))
        self.q_matrix_arr_b = np.zeros((128, 7))

        # Keep track of the 5 most recently updated Q-values per Q-matrix
        self.q_history_a = []
        self.q_history_b = []

        # Keeps track of current (starting) state, initialized to 0th state. This state accounts for both robots
        self.current_state = 0

        # Get new state and action from starting state
        new_state, action = self.get_random_action(self.current_state)

        # Current action is action from above (action from current state to new state)
        self.current_action = action

        # Keep track of new state
        self.new_state = new_state

        # Move the robot according to the current action
        self.complete_task_or_door(self.current_action)
Example No. 16
    def __init__(self, simulation_run, seed: int):
        logger.debug("Start initializing this: %s", self.__class__)
        # self.SimulationManager = simulation_run.SimulationManager
        sim_manager = simulation_run.SimulationManager
        self.market_size = sim_manager.marketSize
        self.competition_type = sim_manager.competition_type
        self.examination_interval_size = sim_manager.sizeOfExaminationInterval

        self.price = 0.0
        self.quantity = 0.0
        self.profit = 0.0
        self.qlearning = QLearning(seed, sim_manager)                # holds the firm's individual Q-learning algorithm
        self.get_other_firm = simulation_run.getOtherFirm
        # ArrayLists to store its parameters over the time
        self.prices = deque(maxlen=self.examination_interval_size)
        self.quantities = deque(maxlen=self.examination_interval_size)
        self.profits = deque(maxlen=self.examination_interval_size)
        self.cournot = Cournot()

        logger.debug("Init completed: %s", self.__class__)
    def __init__(self):
        # Publish Q-matrix updates to Q-matrix topic
        self.q_matrix_pub = rospy.Publisher("/q_learning/QMatrix",
                                            QMatrix,
                                            queue_size=10)

        # Publish to /robot_action for phantom robot movement
        self.q_learning = QLearning()
        self.action_pub = rospy.Publisher("/q_learning/robot_action",
                                          RobotMoveDBToBlock,
                                          queue_size=10)

        # Subscribe to environment to receive reward updates
        self.reward = rospy.Subscriber("/q_learning/reward", QLearningReward,
                                       self.update_q_matrix)
        rospy.sleep(2)

        # Initialize Q-matrix
        self.q_matrix_msg = QMatrix()

        # For ease of use in training, we'll store the Q-matrix in a numpy array
        self.q_matrix_arr = np.zeros((64, 9))

        # Keep track of the 5 most recently updated Q-values
        self.q_history = []

        # Keeps track of current (starting) state, initialized to 0th state
        self.current_state = 0

        # Get new state and action from starting state
        new_state, action = self.get_random_action(self.current_state)

        # Current action is action from above (action from current state to new state)
        self.current_action = action

        # Keep track of new state
        self.new_state = new_state

        # Move the robot according to the current action
        # update_q_matrix() will be called as a callback
        self.move_DB(self.current_action)
Example No. 18
    def __init__(self, total_episodes: int):
        self.window_width = constant.WIDTH * constant.TILE
        self.window_height = constant.HEIGHT * constant.TILE

        self._running = True
        self._display = None
        self._snake = None
        self._mouse = None

        self.episode = 1
        self.total_episodes = total_episodes
        self.score = 0
        self.max_score = 0
        self.frames = 0
        self.game_stats = []
        self.specs = []
        self.test_run = False

        self.snake = Snake()
        self.mouse = Mouse(constant.WIDTH, constant.HEIGHT,
                           self.snake.body_position())
        self.q = QLearning()
Example No. 19
def run_q_learning_forest(S, r1, r2):
    forest = WrappedForest(S, r1, r2)
    n_episodes = 10000
    how_often = n_episodes / 100

    stats = IterationStats('stats/ql_forest.csv', dims=2)

    def on_episode(episode, time, q_learner, q):
        forest.print_policy(print, q_learner.get_policy())
        stats.save_iteration(episode, time,
                             numpy.nanmean(numpy.nanmax(q, axis=0)), q)

    def is_done(state, action, next_state):
        if next_state.state_num == 0:
            return True
        return False

    gamma = 0.99
    start = time.time()
    numpy.random.seed(5263228)
    q_l = QLearning(forest,
                    0.5,
                    0.2,
                    gamma,
                    on_episode=on_episode,
                    start_at_0=True,
                    alpha=0.1,
                    is_done=is_done,
                    every_n_episode=how_often)
    stats.start_writing()
    q_l.run(n_episodes)
    stats.done_writing()
    forest.print_policy(print, q_l.get_policy())
    print('took {} s'.format(time.time() - start))

    stats = IterationStats('stats/ql_forest.csv', dims=2)
    analysis.create_iteration_value_graph(
        stats, 'average Q',
        'Average Q for each iteration on Forest Q Learning', 'forest_results')
Example No. 20
env_ranges = [(-0.5, 0.5), (0, 1), (-0.5, 0.5), (-0.5, 0.5), (-0.4, 0.4),
              (-0.4, 0.4), (-1, 1), (-1, 1)]

print('\nObservation ranges:')
for ob in enumerate(env_ranges):
    print(ob[0], ob[1])

num_bins = [3, 20, 3, 6, 6, 6, 3, 3]
num_pos_actions = len(actions)

q_learning = QLearning(env=env,
                       num_bins=num_bins,
                       num_pos_actions=num_pos_actions,
                       env_ranges=env_ranges,
                       discount=0,
                       episodes=0,
                       epsilon=None,
                       lr=None,
                       USE=True)

env = gym.make('LunarLander-v2')
q_learning.q_table = np.load('./data_lunarlander/0_9000.npy')

for _ in range(10):

    obs = q_learning.reset_state()  # Reset the environment and get the initial observation

    done = False
    while not done:
        # Assumed continuation (the snippet is truncated here): act greedily with the loaded Q-table.
        action = q_learning.action_to_maximise_q(obs)
        obs, reward, done = q_learning.perform_sim_step(action)
Example No. 21
from q_learning import QLearning
from SARSA import SARSALearning
from eligibility_traces import EligibilityTraces
from function_approximation import FApprox
from mountain_cart import run_methods, self_iterate
import pickle

if __name__ == "__main__":
    # Initialize a method
    methods = [
        QLearning("MountainCar-v0", print_progress=False),
        SARSALearning("MountainCar-v0", print_progress=False),
        FApprox("MountainCar-v0", print_progress=False),
        EligibilityTraces("MountainCar-v0", print_progress=False)
    ]

    # Run the tests
    run_methods(methods)

    method = methods[0]
    method.q_table = pickle.load(
        open("Best_Method_" + str(type(method).__name__) + ".p", "rb"))
    method.evaluate()
    method.display()

    self_iterate(methods[0])
Example No. 22
from q_learning import QLearning
from SARSA import SARSALearning
from ElgibilityTraces import ElgibilityTraces
from function_approximation import FApprox
import matplotlib.pyplot as plt
import numpy as np

if __name__ == "__main__":
    # Initialize a method
    methods = [
        QLearning("MountainCar-v0"),
        SARSALearning("MountainCar-v0"),
        FApprox("MountainCar-v0"),
        ElgibilityTraces("MountainCar-v0")
    ]
    method = methods[3]
    method.train(1000)

    # Test the method
    method.evaluate()
    method.plot()

    #for method in methods:
    #    graph = np.zeros(method.env._max_episode_steps / 10)
    ##    runs = 30
    #   for run in range(1, runs + 1):
    #       method.reset_model()
    #        method.train(1000)

    #       graph += method.convergence_graph / runs
Example No. 23
            state_, reward, done = env.step(action)

            step_count += 1  # increment the step count

            # the agent's brain learns from this transition (state, action, reward, state_)
            RL.learn(str(state), action, reward, str(state_))

            # the agent moves on to the next state
            state = state_

            # if it steps on a bomb or finds the treasure, the episode is over
            if done:
                print("回合 {} 结束. 总步数 : {}\n".format(episode + 1, step_count))
                break

    # end the game and close the window
    print('Game over')
    env.destroy()


if __name__ == "__main__":
    # create the environment env and the RL agent
    env = Maze()
    RL = QLearning(actions=list(range(env.n_actions)))

    # start the visualized environment
    env.after(100, update)
    env.mainloop()

    print('\nQ-table:')
    print(RL.q_table)
Example No. 24
 def __init__(self):
     self.qlearning = QLearning()
Example No. 25
            self.root_frame.q_learning_simulator = None
            pygame.display.quit()

    def draw_robot(self, i, j):
        top_left = (self.state_width * j, self.state_height * i)
        self.robot_rect = [
            top_left[0], top_left[1], self.state_width, self.state_height
        ]
        self.screen.blit(self.robot, self.robot_rect)

    def draw_world(self):
        for i in range(len(self.world)):
            for j in range(len(self.world[i])):
                self.draw_rectangle(i, j, self.world[i][j])

    def draw_rectangle(self, i, j, item):
        top_left = (self.state_width * j, self.state_height * i)
        color = self.item_colors[item]
        thickness = self.item_thickness[item]
        pygame.draw.rect(
            self.screen, color,
            (top_left[0], top_left[1], self.state_width, self.state_height),
            thickness)


if __name__ == '__main__':
    file_name = "./board.txt"
    q_learning = QLearning("./board.txt", is_stochastic=True)

    QLearningSimulator(q_learning, delay_time=100)
Example No. 26
    def __init__(self):  # ** we should pass the psa parameters here to remove the dependencies
        self.N = 50
        self.R = {}
        self.T = {}

        QLearning.__init__(self)
Example No. 27
    print(ob[0], ob[1])

num_bins = [10, 3]
num_pos_actions = len(actions)
print('num_pos_actions', num_pos_actions)

# hyperparams:
discount = 1.0
episodes = 100

# Epsilon: start value, start decay index, stop decay index
epsilon = [0.5, 1.0, episodes // 2]
# Learning rate: start value, start decay index, stop decay index
lr = [0.5, 1.0, episodes // 2]

q_learning = QLearning(env, num_bins, num_pos_actions, env_ranges, discount,
                       episodes, epsilon, lr)

print('q-table shape', q_learning.q_table.shape)

obs = q_learning.reset_state()  # Reset the environment and get the initial observation
obs = [obs[i] for i in obs_to_use]
print('\nInitial observation:', obs)

action_to_maximise_q = q_learning.action_to_maximise_q(
    obs)  # Find optimal action
action = q_learning.decide_on_action(
    action_to_maximise_q)  # Decide whether to use optimal or random action
observation, reward_current, done = q_learning.perform_sim_step(
    action)  # env.step(action)  # Perform the first action

NUM_TO_SHOW = 5
Example No. 28
class Robot:
    game = None
    last_game = None

    current_state = None
    last_states = {}
    last_action = {}

    delta = None

    # Keep track of all our robot_ids. Will be useful to detect new robots.
    robot_ids = set()

    # default values - will get overridden by rgkit
    location = (0, 0)
    hp = 0
    player_id = 0
    robot_id = 0

    def __init__(self):
        self.qlearning = QLearning()

    def act(self, game):
        new_robot = self.robot_id not in self.robot_ids  # True if we have not seen this robot before
        self.robot_ids.add(self.robot_id)

        self.current_state = State.from_game(game, self.player_id,
                                             robot_loc=self.location)
        self.game = game

        # Explore function
        if random.randint(0, 3) < 1:
            print("[Bot " + str(self.robot_id) + "] random action")
            # print self.state
            action = self.get_random_action()
        else:
            action = self.qlearning.predict(self.current_state)

        self.last_states[self.robot_id] = self.current_state
        self.last_action[self.robot_id] = action

        return State.map_action(action, self.location)

    def get_possible_actions(self):
        possible_moves = [s.ACTION_SUICIDE]
        possible_locs = rg.locs_around(self.location,
                                       filter_out=('invalid', 'obstacle'))
        for loc in possible_locs:
            possible_moves.append((s.ACTION_MOVE, loc))
            possible_moves.append((s.ACTION_ATTACK, loc))
        return possible_moves

    def get_random_action(self):
        possible_action = self.get_possible_actions()
        return random.choice(possible_action)

    # delta = [AttrDict{
    #    'loc': loc,
    #    'hp': hp,
    #    'player_id': player_id,
    #    'loc_end': loc_end,
    #    'hp_end': hp_end
    # }]
    # returns new GameState

    def learn(self):
        # take the delta entry for this robot's own location (reward() expects a single entry)
        my_delta = next(d for d in self.delta if d.loc == self.location)
        last_state = self.last_states[self.robot_id]
        action = self.last_action[self.robot_id]
        future_state = self.current_state
        reward = self.reward(my_delta)
        self.qlearning.learn(last_state, future_state, action, reward)

    @staticmethod
    def reward(my_delta):
        damage_taken = my_delta.hp - my_delta.hp_end
        reward = my_delta.damage_caused - damage_taken
        return reward

    def delta_callback(self, delta, actions, new_gamestate):
        future_game = new_gamestate.get_game_info(self.player_id)
        print "delta_callback calle"
        print("Size of Q: " + str(len(self.qlearning.q.hash_matrix)))
        for (robot_loc, robot) in self.game.robots.items():
            if hasattr(robot, 'robot_id') and robot.robot_id in self.robot_ids:
                action = self.last_action[robot.robot_id]

                for delta_me in delta:
                    state_template = State.from_game(future_game,
                                                     self.player_id)
                    if delta_me['loc'] == robot_loc:
                        future_state = copy.deepcopy(state_template)
                        future_state.robot_loc = delta_me.loc_end
                        reward = self.reward(delta_me)
                        self.qlearning.learn(self.current_state, future_state,
                                             action, reward)
Example No. 29
 def test_pickle(self):
     qlearning = QLearning()
     qlearning.save("test_q.pickle")
     qlearning_load = QLearning.load("test_q.pickle")
     assert qlearning == qlearning_load
Example No. 30
def main(argv):
    # Get command line arguments
    try:
        opts, args = getopt.getopt(argv, "pi:")
    except getopt.GetoptError:
        print("main.py [-p] [-i=n_iter]")
        sys.exit(1)
        
    # Plot switch
    plot = False
    
    # Default iterations
    n_iter = 100000
    
    # Parse command line arguments
    for opt, arg in opts:
        # Help
        if opt == "-h":
            print("main.py [-p] [-i=n_iter]")
            sys.exit()
        
        # Plot
        elif opt == "-p":
            plot = True
        
        # Number of iterations
        elif opt == "-i":
            n_iter = arg
            
    # Construct grid
    grid = construct_grid()
    
    # Find solution with value iteration
    print("Performing value iteration...")
    vi = ValueIteration(grid)
    vi = vi.solve()
    print("----------")
    print("The value for each cell is:")
    print("(x,y): value")
    for cell in grid:
        print(cell.name_x()+","+cell.name_y()+":     "+\
              str(vi[0][cell.get_name()]))
    print("----------")
    print("The policy found by value iteration is:")
    print("(x,y): action")
    for cell in grid:
        print(cell.name_x()+","+cell.name_y()+":     "+\
              str(vi[1][cell.get_name()]))
              
    # Find solution with Q-learning
    print("\n")
    print("Performing Q-learning...")
    ql = QLearning(grid, N_iter = int(n_iter))
    ql = ql.solve()
    ql_states = ql[0]
    ql_Q = ql[1]
    print("----------")
    
    print("The policy found by Q-learning is.")
    print("(x,y): action")
    
    actions = ["north","east","south","west"]
    for cell in grid:
        ind = ql_states.index(cell.get_name())
        action = actions[np.argmax(ql_Q[ind,])]
        print(cell.name_x()+","+cell.name_y()+":     "+str(action))
        
    if plot:
        # Convergence graph for q-learning: 
        # best Q-value of the best action for the START state 
        fig, ax = plt.subplots()
        ax.plot(ql[2][0:ql[3]], label = "Q-values for best action in START")
        ax.plot((0,ql[3]),(vi[0]["11"],vi[0]["11"]), label = "Values of START")
        plt.xlabel("Iterations")
        
    plt.show()
Example No. 31
        time.sleep(1)


class GraphHolder:
    def __init__(self):
        self.graph = np.zeros(50)
        self.number = 100


threadList = []
nameList = []
graph = GraphHolder()

for i in range(graph.number):
    threadList.append("Thread-" + str(i))
    nameList.append(QLearning("MountainCar-v0"))

queueLock = threading.Lock()
workQueue = queue.Queue()
threads = []
threadID = 1

# Create new threads
for tName in threadList:
    thread = myThread(threadID, tName, workQueue, graph)
    thread.start()
    threads.append(thread)
    threadID += 1

# Fill the queue
queueLock.acquire()
Example No. 32
class RobotAction(object):
    def __init__(self):

        # rospy.init_node('robot_action')

        # Initialize publishers and subscribers
        rospy.on_shutdown(self.shutdown)
        self.bridge = cv_bridge.CvBridge()
        self.action_sub = rospy.Subscriber('/q_learning/robot_action',
                                           RobotMoveDBToBlock,
                                           self.action_received)
        self.camera_sub = rospy.Subscriber('/camera/rgb/image_raw', Image,
                                           self.camera_received)
        self.scan_sub = rospy.Subscriber('/scan', LaserScan,
                                         self.scan_received)
        self.vel_pub = rospy.Publisher('cmd_vel', Twist, queue_size=1)
        self.action_pub = rospy.Publisher('/q_learning/robot_action',
                                          RobotMoveDBToBlock,
                                          queue_size=3)

        # Keras pipeline
        self.pipeline = keras_ocr.pipeline.Pipeline()

        # Variables for the scan and image results
        self.current_scan = []
        self.current_img = []

        # Import the Q-Learning class functions
        self.q_learning = QLearning()

        rospy.sleep(1)

        # Set color ranges for red, green, and blue
        self.color_info = {  #HSV upper and lower bounds for each color (need to test these ranges work)
            'red': {
                'lower': np.array([0, 250, 150]),
                'upper': np.array([10, 255, 180])
            },
            'blue': {
                'lower': np.array([110, 250, 150]),
                'upper': np.array([130, 255, 190])
            },
            'green': {
                'lower': np.array([45, 250, 150]),
                'upper': np.array([75, 255, 190])
            }
        }

        # Initialize robot arm components
        self.move_group_arm = moveit_commander.MoveGroupCommander("arm")
        self.move_group_gripper = moveit_commander.MoveGroupCommander(
            "gripper")

        # Set initial arm position
        self.move_group_arm.go([0.0, 0.9, -0.3, -0.79], wait=True)
        self.move_group_gripper.go([0.009, 0.009], wait=True)

        # Get the action sequence to maximize reward
        self.extract_action()

    # Extract 3 maximum reward actions from the Q-matrix
    def extract_action(self):

        # Track the number of the actions maximizing reward
        actions = []

        # Read in the Q-matrix
        q_matrix_arr = self.q_learning.read_q_matrix()

        # Set the starting state to 0
        states = self.q_learning.states
        state = 0

        # Iterate 3 times since the Q-matrix only produces a sequence of 3 consecutive actions
        for i in range(3):
            max_reward = -1
            best_action = 0

            # Find the best action for the current state based on its Q-value and save it
            for (action, reward) in enumerate(q_matrix_arr[state]):
                if reward > max_reward:
                    max_reward = reward
                    best_action = action
            actions.append(best_action)

            # Find the next state that this action leads to from the current state
            next_state = 0
            for (s, a) in enumerate(self.q_learning.action_matrix[state]):
                if int(a) == best_action:
                    next_state = s
                    break
            state = next_state

        # Convert the action numbers into action messages
        action_msgs = []
        for a in actions:
            #construct action message
            msg = RobotMoveDBToBlock()
            msg.block_id = self.q_learning.actions[a]['block']
            msg.robot_db = self.q_learning.actions[a]['dumbbell']
            action_msgs.append(msg)

        # Publish and queue up the actions
        while len(action_msgs) > 0:
            msg = action_msgs.pop(0)
            self.action_pub.publish(msg)

    # Callback function for when an action is received
    def action_received(self, data):
        # Initialize color and block
        color = data.robot_db
        block = data.block_id

        # Scan around for appropriate dumbbell
        self.locate_dumbell(color)

        # Move to dumbbell
        self.move_to_dumbell(color)

        # Lift dumbbell
        self.lift_dumbbell()

        # Move it to the corresponding block
        self.move_to_block(block)

        # Put down the dumbbell
        self.drop_dumbbell()

    # Save the camera feed and image
    def camera_received(self, data):
        image = self.bridge.imgmsg_to_cv2(data, desired_encoding='bgr8')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)  #convert to HSV

        self.current_img = image

    def locate_dumbell(self,
                       color):  # spin around until the dumbbell color is in sight
        found = False
        msg = Twist()
        msg.angular.z = np.pi / 12.0  #turn until color appears in camera
        upper, lower = self.color_info[color]['upper'], self.color_info[color][
            'lower']
        while not found:
            mask = cv2.inRange(self.current_img, lower,
                               upper)  #code from line follower to detect color
            w, h = mask.shape
            mask = mask[w // 3:2 * w // 3, h // 3:2 * h // 3]
            # print(np.sum(mask),np.sum(self.current_img))
            if np.sum(mask) > 0:  #if this is true, then color detected
                found = True
                self.vel_pub.publish(Twist())

            else:
                self.vel_pub.publish(msg)

    def move_to_dumbell(self, color):

        upper, lower = self.color_info[color]['upper'], self.color_info[color][
            'lower']
        front_dist = np.inf
        stop_dist = 0.20

        while front_dist > stop_dist:  # TODO: stop when close enough to the dumbbell

            mask = cv2.inRange(self.current_img, lower, upper)
            twist = Twist()
            M = cv2.moments(mask)
            h, w, d = self.current_img.shape
            img = cv2.cvtColor(self.current_img, cv2.COLOR_HSV2BGR)

            if M['m00'] > 0:

                cx = int(M['m10'] / M['m00'])
                cy = int(M['m01'] / M['m00'])
                # print(cx,cy)

                err = w / 2 - cx
                if err < 100:
                    # only adjust linear speed when aligned to the dumbbell
                    front_dist = self.current_scan.ranges[0]
                k_p = 1.0 / 100.0
                k_lin = 1.0
                twist.linear.x = k_lin * min(0.5, max(0,
                                                      front_dist - stop_dist))
                twist.angular.z = k_p * err
                self.vel_pub.publish(twist)

    def move_to_block(self, block):

        dig_map = {  #commonly returned labels for digits
            1: {'1', 'i', 'l'},
            2: {'2', 's'},
            3: {'3', 't', '5'}
        }
        found = False  #first turn until facing the correct block
        msg = Twist()
        msg.angular.z = -1.0 * np.pi / 8.0  #turn until color appears in camera
        while not found:
            pred = self.pipeline.recognize([self.current_img
                                            ])[0]  #get object predictions
            for (label, bb) in pred:
                print(label, bb)
                if label in dig_map[block]:  # digit detected
                    found = True
            if not found:  #if not found, turn robot to get new view
                r = rospy.Rate(2)
                for _ in range(4):
                    self.vel_pub.publish(msg)
                    r.sleep()
                self.vel_pub.publish(Twist())
        self.vel_pub.publish(Twist())

        front_dist = np.inf
        stop_dist = 0.35

        while front_dist > stop_dist:  #now that block is in frame, move to block

            msg = Twist()

            h, w, d = self.current_img.shape

            correct_box_idx = 0
            max_area = 0
            pred = self.pipeline.recognize([self.current_img])[0]
            if pred:
                for i in range(len(pred)):
                    label, bb = pred[i][0], pred[i][1]
                    if label in dig_map[block]:
                        area = (bb[0][0] - bb[2][0]) * (bb[1][1] - bb[3][1])
                        if area > max_area:  #make sure we're looking at the right face (will have largest area)
                            max_area = area
                            correct_box_idx = i
                bbox = pred[correct_box_idx][1]
                cx = np.mean(bbox[:, 0])  #this will get center of bounding box

                # print(bbox, pred[correct_box_idx][0],cx,correct_box_idx)

                err = w / 2 - cx  #offset from bounding box center
                if err < 100:
                    front_dist = self.current_scan.ranges[0]

                k_p = 1.0 / 400.0
                k_lin = 0.5
                msg.linear.x = k_lin * min(0.5, max(
                    0, front_dist - stop_dist))  #proportional control
                msg.angular.z = k_p * err
                r = rospy.Rate(2)
                for _ in range(2):
                    self.vel_pub.publish(msg)
                    r.sleep()
                self.vel_pub.publish(Twist())
            else:
                break

    def lift_dumbbell(self):
        # Close gripper around dumbbell
        gripper_joint_goal = [0.006, 0.006]
        self.move_group_gripper.go(gripper_joint_goal, wait=True)
        self.move_group_gripper.stop()

        # Lift arm up
        arm_joint_goal = [0.0, 0.05, -0.15, -0.79]
        self.move_group_arm.go(arm_joint_goal, wait=True)
        # Calling ``stop()`` ensures that there is no residual movement
        self.move_group_arm.stop()

    def drop_dumbbell(self):
        # Drop arm down
        arm_joint_goal = [0.0, 0.9, -0.3, -0.79]
        self.move_group_arm.go(arm_joint_goal, wait=True)
        # Calling ``stop()`` ensures that there is no residual movement
        self.move_group_arm.stop()

        # Open gripper around dumbbell
        gripper_joint_goal = [0.009, 0.009]
        self.move_group_gripper.go(gripper_joint_goal, wait=True)
        self.move_group_gripper.stop()

        # Move backwards away from the dumbbell so robot can rotate
        msg = Twist()
        msg.linear.x = -1
        self.vel_pub.publish(msg)
        rospy.sleep(1)
        self.vel_pub.publish(Twist())

    # Sets the current_scan variable to the incoming scan data
    def scan_received(self, data):
        self.current_scan = data

    # Runs the program
    def run(self):
        rospy.spin()

    # Shuts down the program
    def shutdown(self):
        self.vel_pub.publish(Twist())
Example No. 33
import gym
import numpy as np
import torch
import sys
from matplotlib import pyplot as plt

sys.path.insert(0, "../../agents")

from q_learning import QLearning  # pylint: disable=import-error

EPISODES = 5000

if __name__ == "__main__":
    env = gym.make("FrozenLake-v0")
    agent = QLearning(env, epsilon=0.8, gamma=0.5, lr=0.01)

    episode_rew = []
    for episode in range(EPISODES):
        # Deciding first action
        action = env.action_space.sample()
        state = env.reset()
        ep_rew = 0
        while True:
            next_state, reward, done, _ = env.step(action)
            # env.render()
            ep_rew += reward

            agent.update((state, action, reward, next_state))
            state = next_state
            action = agent.get_action(state)
            if done:
                # assumed continuation of the truncated snippet
                episode_rew.append(ep_rew)
                break
Example No. 34
def run_q_learning_grid_world():
    world = GridWorld('simple_grid.txt', -0.01, include_treasure=False)
    n_episodes = 500000
    how_often = n_episodes / 500

    stats = IterationStats('stats/ql_simple_grid.csv', dims=5)

    def on_update(state, action, next_state, q_learner):
        #print('[{},{}] - {} -> [{},{}]'.format(state.x, state.y, action[0], next_state.x, next_state.y))
        pass

    def on_episode(episode, time, q_learner, q):
        world.print_policy(print, q_learner.get_policy())
        stats.save_iteration(episode, time,
                             numpy.nanmean(numpy.nanmax(q, axis=0)), q)
        #time.sleep(1)

    for state in world.get_states():
        if state.tile_type == GridWorldTile.GOAL:
            goal_state = state
            break

    def initialize_toward_goal(state: GridWorldTile):
        actions = state.get_actions()
        if len(actions) == 0:
            return []
        diff_x = goal_state.x - state.x
        diff_y = goal_state.y - state.y
        best_value = 0.1
        if len(actions) == 5 and actions[4][0].startswith('get treasure'):
            best_action = actions[4][0]
        elif abs(diff_x) >= abs(diff_y):
            if diff_x > 0:
                best_action = 'move east'
            else:
                best_action = 'move west'
        else:
            if diff_y < 0:
                best_action = 'move north'
            else:
                best_action = 'move south'
        values = [-0.1] * len(actions)
        for i, action in enumerate(actions):
            if action[0] == best_action:
                values[i] = best_value
        return values

    gamma = 0.99
    q_l = QLearning(world,
                    0.5,
                    0.05,
                    gamma,
                    on_update=on_update,
                    on_episode=on_episode,
                    initializer=initialize_toward_goal,
                    start_at_0=True,
                    alpha=0.1,
                    every_n_episode=how_often)
    stats.start_writing()
    q_l.run(n_episodes)
    stats.done_writing()
    world.print_policy(print, q_l.get_policy())
Example No. 35
TRAINING_ON = True
EPISODES = 1000
Q_MODEL_PATH = "outputs/keras-models/2048_q_model.h5"
Q_MODEL_WEIGHTS_PATH = "outputs/keras-models/2048_q_model_weights.h5"
T_MODEL_PATH = "outputs/keras-models/2048_t_model.h5"

board_size = int(math.sqrt(env.observation_space.shape[0]))
n_output = env.action_space.n

# In[3]:

QL = QLearning(n_x=board_size,
               n_y=n_output,
               q_save_path=Q_MODEL_PATH,
               q_weights_save_path=Q_MODEL_WEIGHTS_PATH,
               t_save_path=T_MODEL_PATH,
               total_episodes=EPISODES,
               restore_model=True,
               is_training_on=TRAINING_ON,
               T=10)

# In[ ]:

for episode in range(EPISODES):
    observation = env.reset()
    QL.curr_episode = episode

    while True:
        if RENDER_ENV: env.render()

        valid_move = False
Example No. 36
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 26 00:02:49 2018

@author: Shiratori
"""

import graphics as gfx
import pong_model as pm
from q_learning import QLearning

if __name__ == '__main__':
    # Set up environment
    environment = pm.PongModel(0.5, 0.5, -0.03, 0.01, 0.4, paddleX=0)
    window = gfx.GFX(wall_x=1, player_x=0)
    window.fps = 9e16
    
    # Set up model
    model = QLearning(environment, window, C=5e3, gamma=0.99, explore=-1, 
                      threshold=-1, log=True, log_file='q_test_log_(agent_B).txt', 
                      mode='test', q_table_file='q_q_table_(agent_B).csv')
    
    # Training
    model.train()
Example No. 37
def run():
    """execute the TraCI control loop"""
    step = 0

    # initialize QLearning
    num_phase = 2
    max_num_car_stopped = 10
    num_lane = 4
    num_wait_time_category = 10
    num_action = 10
    q = QLearning(num_phase, max_num_car_stopped, num_lane, num_action)

    # we start with phase 2 where EW has green
    #traci.trafficlight.setPhase("0", 2)
    while traci.simulation.getMinExpectedNumber() > 0:
        traci.simulationStep()

        #next_action_idx = 9

        # current traffic light phase
        light_phase = traci.trafficlight.getPhase("0")

        # if the current phase is yellow and the next action has not been decided yet, decide the duration of the next phase
        if (light_phase == 1
                or light_phase == 3) and not q.is_calculate_next_action:

            q.is_set_duration = False

            # get the phase the light will switch to next
            next_light_phase = 0
            if light_phase == 1:
                next_light_phase = 2

            # number of cars stopped in each lane
            count_0 = min(traci.lanearea.getLastStepHaltingNumber("0"), 9)
            count_1 = min(traci.lanearea.getLastStepHaltingNumber("1"), 9)
            count_2 = min(traci.lanearea.getLastStepHaltingNumber("2"), 9)
            count_3 = min(traci.lanearea.getLastStepHaltingNumber("3"), 9)

            # next light phase and the current congestion state
            current_state_dict = {
                'light_phase': next_light_phase,
                'nums_car_stopped': [count_0, count_1, count_2, count_3]
            }

            current_digitized_state = q.digitize_state(current_state_dict)
            q.next_action_idx = q.get_action(current_digitized_state)
            q.is_calculate_next_action = True

            # reward
            reward = -np.sum(
                [x**1.5 for x in [count_0, count_1, count_2, count_3]])
            q.rewards.append(reward)

            # at the end of each green/red phase we evaluate the action taken in the previous situation, so compute the reward for the previous state and action here

            q.update_Qtable(q.previous_digitized_state, q.previous_action_idx,
                            reward, current_digitized_state)

            q.previous_digitized_state = current_digitized_state
            q.previous_action_idx = q.next_action_idx

        # if the current phase is 0 or 2 and the duration has not been set yet, set it
        if (light_phase == 0 or light_phase == 2) and not q.is_set_duration:
            traci.trafficlight.setPhaseDuration("0",
                                                q.action[q.next_action_idx])
            q.is_set_duration = True
            q.is_calculate_next_action = False
            print("set phase {} for {} seconds".format(
                light_phase, q.action[q.next_action_idx]))

        step += 1
        if step % 10000 == 0:
            plot_graph(q.rewards)

    traci.close()
    sys.stdout.flush()