def main():
    # Initialization of the grid
    grid = Grid()

    # Create the Markov decision and Q-learning classes for running the algorithms
    mdp = MarkovDecision()
    ql = QLearning()

    print("\n")
    print("Value iteration results: \n")
    # Parameter order: p, reward, discount
    utilities, policies = mdp.iterateValues(grid, 1, 0, 1)
    grid.printPolicies()
    grid.resetGrid()

    print("\n")
    print("Policy iteration results: \n")
    # Parameter order: p, reward, discount
    mdp.iteratePolicies(grid, 1, 0, 1)
    grid.printPolicies()
    grid.resetGrid()

    print("\n")
    print("Q Learning results: \n")
    # Parameter order: grid, p, reward, learning_rate, discount, epsilon, iterations, value iteration results
    ql.learnQ(grid, 1, 0, 0.1, 1, 0, 1000, utilities, policies)
    grid.printQValues()
    grid.printQLearningResults()
def main(argv):
    setpath()
    from q_learning import QLearning
    from grid_world import Grid

    # 5 rows, 4 cols, unreachables: [(1, 1), (1, 3)], pits: [(3, 1)], goal: (4, 3)
    g = Grid(5, 4, [(1, 1), (1, 3)], [(3, 1)], (4, 3))
    q = QLearning(g)
    q.learn()
def assimilar(self, s, a, r, sn):
    QLearning.assimilar(self, s, a, r, sn)
    self.R[(s, a)] = r
    self.T[(s, a)] = sn
    # Dyna-style planning: replay N stored transitions sampled at random
    for j in range(self.N):
        (s1, a1) = choice(list(self.Q.keys()))
        r1 = self.R[(s1, a1)]
        sn1 = self.T[(s1, a1)]
        QLearning.assimilar(self, s1, a1, r1, sn1)
def test_learning(self):
    qlearning = QLearning()
    old_state = self.empty_state()
    action = ACTION_SUICIDE
    new_state = self.empty_state()
    reward = 10

    old_reward = qlearning.q.get_q(old_state, action)
    qlearning.learn(old_state, new_state, action, reward)
    self.assertGreater(qlearning.q.get_q(old_state, action), old_reward)
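# The test above only pins down the interface it exercises: QLearning() exposes a table `q`
# with get_q(state, action), and learn(old_state, new_state, action, reward) raises
# Q(old_state, action) after a positive reward. A minimal sketch of a tabular agent
# consistent with that contract might look like the following; the internals (alpha, gamma,
# the dict-backed table, hashable states) are illustrative assumptions, not the project's
# actual implementation.
class _SketchQTable:
    def __init__(self):
        self._values = {}

    def get_q(self, state, action):
        return self._values.get((state, action), 0.0)

    def set_q(self, state, action, value):
        self._values[(state, action)] = value

    def max_q(self, state):
        # Best known value over all actions recorded for this state (0 if unseen)
        candidates = [v for (s, _a), v in self._values.items() if s == state]
        return max(candidates, default=0.0)


class _SketchQLearning:
    def __init__(self, alpha=0.1, gamma=0.9):
        self.q = _SketchQTable()
        self.alpha = alpha
        self.gamma = gamma

    def learn(self, old_state, new_state, action, reward):
        # One-step Q-learning backup:
        # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        old = self.q.get_q(old_state, action)
        target = reward + self.gamma * self.q.max_q(new_state)
        self.q.set_q(old_state, action, old + self.alpha * (target - old))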
def run_games(game_length, left_arm_mean, left_arm_std, n_players, right_arm_mean,
              right_arm_std, use_asrn, learning_rate=0.01, gamma=0.95, epsilon=1.0,
              epsilon_decay=0.99):
    all_rewards = []
    all_goods = []
    all_losses = []
    all_q_tables = []
    # With a constant per-step reward equal to the arm mean, the discounted return is
    # mean * sum_t gamma^t = mean / (1 - gamma), so these are the "true" Q-values.
    trained_agent_q_values = [left_arm_mean / (1 - gamma), right_arm_mean / (1 - gamma)]
    for j in range(n_players):
        two_armed_bandit = BrokenArmedBandit(left_arm_mean=left_arm_mean,
                                             right_arm_mean=right_arm_mean,
                                             left_arm_std=left_arm_std,
                                             right_arm_std=right_arm_std)
        # Giving the real mean as initialization(!)
        left_initial_mean = trained_agent_q_values[0]
        right_initial_mean = trained_agent_q_values[1]
        q_learning = QLearning(left_initial_mean, right_initial_mean, learning_rate,
                               gamma, epsilon, epsilon_decay)

        rewards = np.zeros((game_length, 1))
        goods = np.zeros((game_length, 1))
        losses = np.zeros((game_length, 1))
        q_table = []
        if use_asrn:
            asrn = BinsASRN(0, learning_period=game_length / 10)
        for i in range(game_length):
            right, reward_estimation = q_learning.choose()
            good = q_learning.right_mean > q_learning.left_mean
            goods[i] = good
            q_table.append([q_learning.right_mean, q_learning.left_mean])

            reward = two_armed_bandit.pull(right)
            rewards[i] = reward
            if use_asrn:
                if right:
                    updated_right_mean = (1 - q_learning.learning_rate) * q_learning.right_mean + \
                        q_learning.learning_rate * (reward + q_learning.gamma * q_learning.right_mean)
                    reward = asrn.noise(q_learning.right_mean, updated_right_mean, reward)
                else:
                    updated_left_mean = (1 - q_learning.learning_rate) * q_learning.left_mean + \
                        q_learning.learning_rate * (reward + q_learning.gamma * q_learning.left_mean)
                    reward = asrn.noise(q_learning.left_mean, updated_left_mean, reward)
            loss = q_learning.update(right, reward)
            losses[i] = loss

        all_rewards.append(rewards)
        all_goods.append(goods)
        all_losses.append(losses)
        all_q_tables.append(q_table)
    return all_q_tables, all_rewards, all_goods, np.asarray(all_losses)
def __init__(self, env, gamma, alpha):
    # Sets self.env = env, state = None, next_waypoint = None, and a default color
    super(LearningAgent, self).__init__(env)
    self.color = 'red'  # override color

    # Simple route planner to get next_waypoint
    self.planner = RoutePlanner(self.env, self)

    # Initialize additional variables and QLearning here
    self.actions = ['forward', 'left', 'right', None]
    self.states = []
    self.q_learn = QLearning(self.actions, states=self.states, gamma=gamma, alpha=alpha)
    self.p_action = None
    self.p_reward = None
def greedy_policy():
    env = gym.make('FrozenLake-v0')
    q_learning = QLearning(env.action_space.n, env.observation_space.n)
    q_learning.set_general_state_action_values([0.5, 1, 0.5, 0.5])
    for i_episode in range(20):
        observation = env.reset()
        for t in range(100):
            env.render()
            print(observation)
            # Always follow the greedy action for the current observation
            action = q_learning.greedy_policy(observation)
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break
def main():
    env = Env()
    q_learning_agent = QLearning()
    if len(load_table_name) > 20:
        print(f'loading table {load_table_name}')
        q_learning_agent.load_table(load_table_name)

    print(env.check_car_pos())
    assert env.check_car_pos() == (269, 116)

    updater = QLearningUpdater(q_learning_agent)
    epsilon_greedy = EpsilonGreedyAgent(q_learning_agent)

    if learn:
        rewards = train_agent(env, epsilon_greedy, q_learning_agent, updater)
        np.save('q_l_tables/rewards.npy', rewards)
    else:
        print('exploiting')
        play_agent(env, q_learning_agent)
def create_q_learning_solver(labyrinth):
    # Set up the Q-learning R and Q matrices
    q_learning_config_dict = {
        'labyrinth': labyrinth,
        'learning_rate': constant.STD_LEARNING_RATE,
        'exploration_chance': constant.STD_EXPLORATION_CHANCE
    }
    q_learning_solver = QLearning(q_learning_config_dict)
    return q_learning_solver
def q_learning_greedy_probability_policy():
    env = gym.make('FrozenLake-v0')
    q_learning = QLearning(env.action_space.n, env.observation_space.n,
                           epsilon=0.1, learning_rate=0.1)
    q_learning.set_general_state_action_values([0.5, 1, 0.5, 0.5])

    episode_rewards = []
    all_over_reward = 0.0
    for i_episode in range(7000):
        # Starting a new episode, so reset the environment and stats
        observation = env.reset()
        accumulated_reward = 0.0
        for t in range(100):
            # Show current state
            # env.render()

            # Choose action based on current experience
            action = q_learning.greedy_probability_policy(observation)

            # Save previous state, then commit the action, yielding the new current state
            previous_observation = observation
            observation, reward, done, info = env.step(action)

            # Accumulate more reward
            accumulated_reward += reward

            # Train the algorithm based on the new experience
            q_learning.update_state_action_function(previous_observation, action,
                                                    reward, observation)

            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                print("Total reward for episode %i: %i" % (i_episode, accumulated_reward))
                all_over_reward += accumulated_reward
                episode_rewards.append(accumulated_reward)
                break

    plot = Plot()
    plot.plot_evolution(episode_rewards)
    print(q_learning.q_table)
    q_learning.write_q_function_dump()
def run_tests():
    """ Runs all tests defined in task2_tests.yaml.

    Returns:
        results: Results dataframe.
    """
    with open(FILENAME) as file:
        # Load the testing parameters from the yaml file
        tests = yaml.safe_load(file)

    # Create a dataframe to keep the results
    test_dict = tests['Tests']
    results = pd.DataFrame(test_dict)
    results['Last Average Score'] = ""
    results['No of Q-Learning episodes'] = ""

    # Run experiments:
    for i, test in enumerate(test_dict):
        grid = Rooms(test["env_size"], testing=True)
        learning = QLearning(grid, test["gamma"], test["alpha"], test["agent_start_pos"])
        e_greedy = Policy("e-greedy", test["epsilon"], test["decay"])
        greedy = Policy(policy_type="greedy")
        experiment = Experiments(grid, learning, greedy, test["iters"],
                                 test["agent_start_pos"], test["test_no"])

        for session in range(test["iters"]):
            learning.run_multiple_episodes(test["batch_episodes"], e_greedy)
            mean_reward = experiment.run_experiments(test["exp_per_batch"])

        results.loc[i, 'Last Average Score'] = mean_reward
        results.loc[i, 'No of Q-Learning episodes'] = (session + 1) * test["batch_episodes"]

        # Save results to csv file
        filename = 'results/' + 'test_table.csv'
        results.to_csv(filename)

        # Plot & save graphs
        experiment.generate_results(test["test_no"], test)

    return results
def main(algorithm, track, x_start, y_start, discount, learning_rate, threshold,
         max_iterations, epsilon=None, reset_on_crash=False):
    """
    Program entry. Runs the selected algorithm on the selected track, at the given
    coordinates, with the given parameters.
    :param algorithm: String
    :param track: List
    :param x_start: Int
    :param y_start: Int
    :param discount: Float
    :param learning_rate: Float
    :param threshold: Float
    :param max_iterations: Int
    :param epsilon: Float
    :param reset_on_crash: Boolean
    :return: None
    """
    with open(track) as f:
        specs = f.readline().strip().split(',')
        rows = int(specs[0])
        cols = int(specs[1])
        layout = f.read().splitlines()

    initial_state = (x_start, y_start, 0, 0)
    initial_action = (0, 0)
    agent = Car(initial_action, epsilon)
    environment = RaceTrack(rows, cols, layout, initial_state,
                            reset_on_crash=reset_on_crash)

    if algorithm == 'value_iteration':
        value_iterator = ValueIteration(discount, threshold, max_iterations,
                                        environment, agent)
        value_iterator.run()
        path = value_iterator.extract_policy(initial_state)
        value_iterator.plot_max_diffs()
    elif algorithm == 'q_learning':
        q_learner = QLearning(discount, learning_rate, threshold, max_iterations,
                              environment, agent)
        path = q_learner.run()
        q_learner.plot_avg_cost()
    elif algorithm == 'sarsa':
        sarsa = Sarsa(discount, learning_rate, threshold, max_iterations,
                      environment, agent)
        path = sarsa.run()
        sarsa.plot_avg_cost()
    else:
        print("No algorithm selected")
        return None

    draw_track(path, layout)
def __init__(self):
    # Publish Q-matrix updates to the Q-matrix topic
    self.q_matrix_pub = rospy.Publisher("/RobotFinalProject/QMatrix", QMatrix, queue_size=10)

    # Publish to /robot_action for training the Q-matrices
    self.q_learning = QLearning()
    self.robot_action_pub = rospy.Publisher(
        "/RobotFinalProject/robot_action", RobotTasksDoors, queue_size=10)

    # Subscribe to the environment to receive reward updates
    self.reward = rospy.Subscriber("/RobotFinalProject/reward", QLearningReward,
                                   self.update_q_matrix)

    rospy.sleep(2)

    # Initialize Q-matrices, one for each robot
    self.q_matrix_msg_a = QMatrix()
    self.q_matrix_msg_b = QMatrix()

    # For ease of use in training, we'll store the Q-matrices in numpy arrays
    self.q_matrix_arr_a = np.zeros((128, 7))
    self.q_matrix_arr_b = np.zeros((128, 7))

    # Keep track of the 5 most recently updated Q-values per Q-matrix
    self.q_history_a = []
    self.q_history_b = []

    # Keeps track of the current (starting) state, initialized to the 0th state.
    # This state accounts for both robots.
    self.current_state = 0

    # Get the new state and action from the starting state
    new_state, action = self.get_random_action(self.current_state)

    # Current action is the action from the current state to the new state
    self.current_action = action

    # Keep track of the new state
    self.new_state = new_state

    # Move the robot according to the current action
    self.complete_task_or_door(self.current_action)
def __init__(self, simulation_run, seed: int):
    logger.debug("Start initializing this: %s", self.__class__)
    # self.SimulationManager = simulation_run.SimulationManager
    sim_manager = simulation_run.SimulationManager
    self.market_size = sim_manager.marketSize
    self.competition_type = sim_manager.competition_type
    self.examination_interval_size = sim_manager.sizeOfExaminationInterval
    self.price = 0.0
    self.quantity = 0.0
    self.profit = 0.0
    self.qlearning = QLearning(seed, sim_manager)  # holds the firm's individual Q-learning algorithm
    self.get_other_firm = simulation_run.getOtherFirm

    # Deques to store the firm's parameters over time
    self.prices = deque(maxlen=self.examination_interval_size)
    self.quantities = deque(maxlen=self.examination_interval_size)
    self.profits = deque(maxlen=self.examination_interval_size)

    self.cournot = Cournot()
    logger.debug("Init completed: %s", self.__class__)
def __init__(self):
    # Publish Q-matrix updates to the Q-matrix topic
    self.q_matrix_pub = rospy.Publisher("/q_learning/QMatrix", QMatrix, queue_size=10)

    # Publish to /robot_action for phantom robot movement
    self.q_learning = QLearning()
    self.action_pub = rospy.Publisher("/q_learning/robot_action", RobotMoveDBToBlock,
                                      queue_size=10)

    # Subscribe to the environment to receive reward updates
    self.reward = rospy.Subscriber("/q_learning/reward", QLearningReward,
                                   self.update_q_matrix)

    rospy.sleep(2)

    # Initialize the Q-matrix message
    self.q_matrix_msg = QMatrix()

    # For ease of use in training, we'll store the Q-matrix in a numpy array
    self.q_matrix_arr = np.zeros((64, 9))

    # Keep track of the 5 most recently updated Q-values
    self.q_history = []

    # Keeps track of the current (starting) state, initialized to the 0th state
    self.current_state = 0

    # Get the new state and action from the starting state
    new_state, action = self.get_random_action(self.current_state)

    # Current action is the action from the current state to the new state
    self.current_action = action

    # Keep track of the new state
    self.new_state = new_state

    # Move the robot according to the current action;
    # update_q_matrix() will be called as the reward callback
    self.move_DB(self.current_action)
def __init__(self, total_episodes: int):
    self.window_width = constant.WIDTH * constant.TILE
    self.window_height = constant.HEIGHT * constant.TILE
    self._running = True
    self._display = None
    self._snake = None
    self._mouse = None
    self.episode = 1
    self.total_episodes = total_episodes
    self.score = 0
    self.max_score = 0
    self.frames = 0
    self.game_stats = []
    self.specs = []
    self.test_run = False
    self.snake = Snake()
    self.mouse = Mouse(constant.WIDTH, constant.HEIGHT, self.snake.body_position())
    self.q = QLearning()
def run_q_learning_forest(S, r1, r2):
    forest = WrappedForest(S, r1, r2)
    n_episodes = 10000
    how_often = n_episodes / 100
    stats = IterationStats('stats/ql_forest.csv', dims=2)

    def on_episode(episode, time, q_learner, q):
        forest.print_policy(print, q_learner.get_policy())
        stats.save_iteration(episode, time, numpy.nanmean(numpy.nanmax(q, axis=0)), q)

    def is_done(state, action, next_state):
        if next_state.state_num == 0:
            return True
        return False

    gamma = 0.99
    start = time.time()
    numpy.random.seed(5263228)
    q_l = QLearning(forest, 0.5, 0.2, gamma, on_episode=on_episode, start_at_0=True,
                    alpha=0.1, is_done=is_done, every_n_episode=how_often)
    stats.start_writing()
    q_l.run(n_episodes)
    stats.done_writing()
    forest.print_policy(print, q_l.get_policy())
    print('took {} s'.format(time.time() - start))

    stats = IterationStats('stats/ql_forest.csv', dims=2)
    analysis.create_iteration_value_graph(
        stats, 'average Q', 'Average Q for each iteration on Forest Q Learning',
        'forest_results')
env_ranges = [(-0.5, 0.5), (0, 1), (-0.5, 0.5), (-0.5, 0.5),
              (-0.4, 0.4), (-0.4, 0.4), (-1, 1), (-1, 1)]

print('\nObservation ranges:')
for ob in enumerate(env_ranges):
    print(ob[0], ob[1])

num_bins = [3, 20, 3, 6, 6, 6, 3, 3]
num_pos_actions = len(actions)

q_learning = QLearning(env=env, num_bins=num_bins, num_pos_actions=num_pos_actions,
                       env_ranges=env_ranges, discount=0, episodes=0, epsilon=None,
                       lr=None, USE=True)

env = gym.make('LunarLander-v2')
q_learning.q_table = np.load('./data_lunarlander/0_9000.npy')

for _ in range(10):
    obs = q_learning.reset_state()  # Reset the environment and get the initial observation
    done = False
    while not done:
from q_learning import QLearning
from SARSA import SARSALearning
from eligibility_traces import EligibilityTraces
from function_approximation import FApprox
from mountain_cart import run_methods, self_iterate
import pickle

if __name__ == "__main__":
    # Initialize the methods
    methods = [
        QLearning("MountainCar-v0", print_progress=False),
        SARSALearning("MountainCar-v0", print_progress=False),
        FApprox("MountainCar-v0", print_progress=False),
        EligibilityTraces("MountainCar-v0", print_progress=False)
    ]

    # Run the tests
    run_methods(methods)

    method = methods[0]
    method.q_table = pickle.load(
        open("Best_Method_" + str(type(method).__name__) + ".p", "rb"))
    method.evaluate()
    method.display()

    self_iterate(methods[0])
from q_learning import QLearning
from SARSA import SARSALearning
from ElgibilityTraces import ElgibilityTraces
from function_approximation import FApprox
import matplotlib.pyplot as plt
import numpy as np

if __name__ == "__main__":
    # Initialize a method
    methods = [
        QLearning("MountainCar-v0"),
        SARSALearning("MountainCar-v0"),
        FApprox("MountainCar-v0"),
        ElgibilityTraces("MountainCar-v0")
    ]
    method = methods[3]
    method.train(1000)

    # Test the method
    method.evaluate()
    method.plot()

    # for method in methods:
    #     graph = np.zeros(method.env._max_episode_steps / 10)
    #     runs = 30
    #     for run in range(1, runs + 1):
    #         method.reset_model()
    #         method.train(1000)
    #         graph += method.convergence_graph / runs
            state_, reward, done = env.step(action)
            step_count += 1  # increment the step count

            # The agent learns from this transition (state, action, reward, state_)
            RL.learn(str(state), action, reward, str(state_))

            # The agent moves to the next state
            state = state_

            # If it stepped on a bomb or found the treasure, the episode ends
            if done:
                print("Episode {} finished. Total steps: {}\n".format(episode + 1, step_count))
                break

    # End the game and close the window
    print('Game over')
    env.destroy()


if __name__ == "__main__":
    # Create the environment and the RL agent
    env = Maze()
    RL = QLearning(actions=list(range(env.n_actions)))

    # Start visualizing the environment
    env.after(100, update)
    env.mainloop()

    print('\nQ-table:')
    print(RL.q_table)
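# The maze loop above only assumes that QLearning(actions=...) exposes a learn(s, a, r, s_)
# method and a q_table attribute. A minimal tabular sketch that satisfies those calls could
# look like the following; the pandas-backed table, the choose_action helper, and the default
# hyperparameters are illustrative assumptions, not necessarily the project's own code.
import numpy as np
import pandas as pd


class TabularQLearning:
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions  # a list of action indices
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        # One row per visited state (keyed by its string form), one column per action
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def _check_state_exists(self, state):
        if state not in self.q_table.index:
            self.q_table.loc[state] = [0.0] * len(self.actions)

    def choose_action(self, state):
        self._check_state_exists(state)
        if np.random.uniform() < self.epsilon:
            # Exploit: pick (randomly among) the highest-valued actions for this state
            state_values = self.q_table.loc[state, :]
            return np.random.choice(state_values[state_values == state_values.max()].index)
        # Explore: pick a random action
        return np.random.choice(self.actions)

    def learn(self, s, a, r, s_):
        self._check_state_exists(s)
        self._check_state_exists(s_)
        q_predict = self.q_table.loc[s, a]
        # One-step Q-learning target: r + gamma * max_a' Q(s', a')
        q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)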
def __init__(self):
    self.qlearning = QLearning()
        self.root_frame.q_learning_simulator = None
        pygame.display.quit()

    def draw_robot(self, i, j):
        top_left = (self.state_width * j, self.state_height * i)
        self.robot_rect = [top_left[0], top_left[1], self.state_width, self.state_height]
        self.screen.blit(self.robot, self.robot_rect)

    def draw_world(self):
        for i in range(len(self.world)):
            for j in range(len(self.world[i])):
                self.draw_rectangle(i, j, self.world[i][j])

    def draw_rectangle(self, i, j, item):
        top_left = (self.state_width * j, self.state_height * i)
        color = self.item_colors[item]
        thickness = self.item_thickness[item]
        pygame.draw.rect(
            self.screen, color,
            (top_left[0], top_left[1], self.state_width, self.state_height),
            thickness)


if __name__ == '__main__':
    file_name = "./board.txt"
    q_learning = QLearning("./board.txt", is_stochastic=True)
    QLearningSimulator(q_learning, delay_time=100)
def __init__(self):
    # ** we should pass the psa parameters here to remove these dependencies
    self.N = 50
    self.R = {}
    self.T = {}
    QLearning.__init__(self)
    print(ob[0], ob[1])

num_bins = [10, 3]
num_pos_actions = len(actions)
print('num_pos_actions', num_pos_actions)

# Hyperparameters:
discount = 1.0
episodes = 100
epsilon = [0.5, 1.0, episodes // 2]  # Epsilon start, start decay index, stop decay index
lr = [0.5, 1.0, episodes // 2]  # Learning rate start, start decay index, stop decay index

q_learning = QLearning(env, num_bins, num_pos_actions, env_ranges, discount,
                       episodes, epsilon, lr)
print('q-table shape', q_learning.q_table.shape)

obs = q_learning.reset_state()  # Reset the environment and get the initial observation
obs = [obs[i] for i in obs_to_use]
print('\nInitial observation:', obs)

action_to_maximise_q = q_learning.action_to_maximise_q(obs)  # Find the optimal action
action = q_learning.decide_on_action(action_to_maximise_q)  # Decide whether to use the optimal or a random action
observation, reward_current, done = q_learning.perform_sim_step(action)  # env.step(action)  # Perform the first action

NUM_TO_SHOW = 5
class Robot:
    game = None
    last_game = None
    current_state = None
    last_states = {}
    last_action = {}
    delta = None

    # Keep track of all our robot_ids. Will be useful to detect new robots.
    robot_ids = set()

    # Default values - will get overridden by rgkit
    location = (0, 0)
    hp = 0
    player_id = 0
    robot_id = 0

    def __init__(self):
        self.qlearning = QLearning()

    def act(self, game):
        new_robot = self.robot_id in self.robot_ids
        self.robot_ids.add(self.robot_id)
        self.current_state = State.from_game(game, self.player_id,
                                             robot_loc=self.location)
        self.game = game

        # Explore function
        if random.randint(0, 3) < 1:
            print("[Bot " + str(self.robot_id) + "] random action")
            # print self.state
            action = self.get_random_action()
        else:
            action = self.qlearning.predict(self.current_state)

        self.last_states[self.robot_id] = self.current_state
        self.last_action[self.robot_id] = action

        return State.map_action(action, self.location)

    def get_possible_actions(self):
        possible_moves = [s.ACTION_SUICIDE]
        possible_locs = rg.locs_around(self.location,
                                       filter_out=('invalid', 'obstacle'))
        for loc in possible_locs:
            possible_moves.append((s.ACTION_MOVE, loc))
            possible_moves.append((s.ACTION_ATTACK, loc))
        return possible_moves

    def get_random_action(self):
        possible_action = self.get_possible_actions()
        return random.choice(possible_action)

    # delta = [AttrDict{
    #     'loc': loc,
    #     'hp': hp,
    #     'player_id': player_id,
    #     'loc_end': loc_end,
    #     'hp_end': hp_end
    # }]
    # returns new GameState
    def learn(self):
        my_delta = [d for d in self.delta if d.loc == self.location]
        last_state = self.last_states[self.robot_id]
        action = self.last_action[self.robot_id]
        future_state = self.current_state
        reward = self.reward(my_delta)
        self.qlearning.learn(last_state, future_state, action, reward)

    @staticmethod
    def reward(my_delta):
        damage_taken = my_delta.hp - my_delta.hp_end
        reward = my_delta.damage_caused - damage_taken
        return reward

    def delta_callback(self, delta, actions, new_gamestate):
        future_game = new_gamestate.get_game_info(self.player_id)
        print("delta_callback called")
        print("Size of Q: " + str(len(self.qlearning.q.hash_matrix)))
        for (robot_loc, robot) in self.game.robots.items():
            if hasattr(robot, 'robot_id') and robot.robot_id in self.robot_ids:
                action = self.last_action[robot.robot_id]
                for delta_me in delta:
                    state_template = State.from_game(future_game, self.player_id)
                    if delta_me['loc'] == robot_loc:
                        future_state = copy.deepcopy(state_template)
                        future_state.robot_loc = delta_me.loc_end
                        reward = self.reward(delta_me)
                        self.qlearning.learn(self.current_state, future_state,
                                             action, reward)
def test_pickle(self):
    qlearning = QLearning()
    qlearning.save("test_q.pickle")
    qlearning_load = QLearning.load("test_q.pickle")
    assert qlearning == qlearning_load
def main(argv):
    # Get command line arguments
    try:
        opts, args = getopt.getopt(argv, "pi:")
    except getopt.GetoptError:
        print("main.py [-p] [-i=n_iter]")
        sys.exit(1)

    # Plot switch
    plot = False

    # Default iterations
    n_iter = 100000

    # Parse command line arguments
    for opt, arg in opts:
        # Help
        if opt == "-h":
            print("main.py [-p] [-i=n_iter]")
            sys.exit()
        # Plot
        elif opt == "-p":
            plot = True
        # Number of iterations
        elif opt == "-i":
            n_iter = arg

    # Construct grid
    grid = construct_grid()

    # Find solution with value iteration
    print("Performing value iteration...")
    vi = ValueIteration(grid)
    vi = vi.solve()

    print("----------")
    print("The value for each cell is:")
    print("(x,y): value")
    for cell in grid:
        print(cell.name_x() + "," + cell.name_y() + ": " +
              str(vi[0][cell.get_name()]))

    print("----------")
    print("The policy found by value iteration is:")
    print("(x,y): action")
    for cell in grid:
        print(cell.name_x() + "," + cell.name_y() + ": " +
              str(vi[1][cell.get_name()]))

    # Find solution with Q-learning
    print("\n")
    print("Performing Q-learning...")
    ql = QLearning(grid, N_iter=int(n_iter))
    ql = ql.solve()
    ql_states = ql[0]
    ql_Q = ql[1]

    print("----------")
    print("The policy found by Q-learning is:")
    print("(x,y): action")
    actions = ["north", "east", "south", "west"]
    for cell in grid:
        ind = ql_states.index(cell.get_name())
        action = actions[np.argmax(ql_Q[ind, ])]
        print(cell.name_x() + "," + cell.name_y() + ": " + str(action))

    if plot:
        # Convergence graph for Q-learning:
        # Q-value of the best action for the START state
        fig, ax = plt.subplots()
        ax.plot(ql[2][0:ql[3]], label="Q-values for best action in START")
        ax.plot((0, ql[3]), (vi[0]["11"], vi[0]["11"]), label="Values of START")
        plt.xlabel("Iterations")
        plt.show()
        time.sleep(1)


class GraphHolder:
    def __init__(self):
        self.graph = np.zeros(50)
        self.number = 100


threadList = []
nameList = []
graph = GraphHolder()
for i in range(graph.number):
    threadList.append("Thread-" + str(i))
    nameList.append(QLearning("MountainCar-v0"))

queueLock = threading.Lock()
workQueue = queue.Queue()
threads = []
threadID = 1

# Create new threads
for tName in threadList:
    thread = myThread(threadID, tName, workQueue, graph)
    thread.start()
    threads.append(thread)
    threadID += 1

# Fill the queue
queueLock.acquire()
class RobotAction(object):

    def __init__(self):
        # rospy.init_node('robot_action')

        # Initialize publishers and subscribers
        rospy.on_shutdown(self.shutdown)
        self.bridge = cv_bridge.CvBridge()
        self.action_sub = rospy.Subscriber('/q_learning/robot_action',
                                           RobotMoveDBToBlock, self.action_received)
        self.camera_sub = rospy.Subscriber('/camera/rgb/image_raw', Image,
                                           self.camera_received)
        self.scan_sub = rospy.Subscriber('/scan', LaserScan, self.scan_received)
        self.vel_pub = rospy.Publisher('cmd_vel', Twist, queue_size=1)
        self.action_pub = rospy.Publisher('/q_learning/robot_action',
                                          RobotMoveDBToBlock, queue_size=3)

        # Keras pipeline
        self.pipeline = keras_ocr.pipeline.Pipeline()

        # Variables for the scan and image results
        self.current_scan = []
        self.current_img = []

        # Import the Q-learning class functions
        self.q_learning = QLearning()

        rospy.sleep(1)

        # Set color ranges for red, green, and blue
        self.color_info = {
            # HSV upper and lower bounds for each color (need to test that these ranges work)
            'red': {
                'lower': np.array([0, 250, 150]),
                'upper': np.array([10, 255, 180])
            },
            'blue': {
                'lower': np.array([110, 250, 150]),
                'upper': np.array([130, 255, 190])
            },
            'green': {
                'lower': np.array([45, 250, 150]),
                'upper': np.array([75, 255, 190])
            }
        }

        # Initialize robot arm components
        self.move_group_arm = moveit_commander.MoveGroupCommander("arm")
        self.move_group_gripper = moveit_commander.MoveGroupCommander("gripper")

        # Set initial arm position
        self.move_group_arm.go([0.0, 0.9, -0.3, -0.79], wait=True)
        self.move_group_gripper.go([0.009, 0.009], wait=True)

        # Get the action sequence to maximize reward
        self.extract_action()

    # Extract the 3 maximum-reward actions from the Q-matrix
    def extract_action(self):
        # Track the numbers of the actions maximizing reward
        actions = []

        # Read in the Q-matrix
        q_matrix_arr = self.q_learning.read_q_matrix()

        # Set the starting state to 0
        states = self.q_learning.states
        state = 0

        # Iterate 3 times since the Q-matrix only produces a sequence of 3 consecutive actions
        for i in range(3):
            max_reward = -1
            best_action = 0

            # Find the best action for the current state based on its Q-value and save it
            for (action, reward) in enumerate(q_matrix_arr[state]):
                if reward > max_reward:
                    max_reward = reward
                    best_action = action
            actions.append(best_action)

            # Find the next state that this action leads to from the current state
            next_state = 0
            for (s, a) in enumerate(self.q_learning.action_matrix[state]):
                if int(a) == best_action:
                    next_state = s
                    break
            state = next_state

        # Convert the action numbers into action messages
        action_msgs = []
        for a in actions:
            # Construct the action message
            msg = RobotMoveDBToBlock()
            msg.block_id = self.q_learning.actions[a]['block']
            msg.robot_db = self.q_learning.actions[a]['dumbbell']
            action_msgs.append(msg)

        # Publish and queue up the actions
        while len(action_msgs) > 0:
            msg = action_msgs.pop(0)
            self.action_pub.publish(msg)

    # Callback function for when an action is received
    def action_received(self, data):
        # Initialize color and block
        color = data.robot_db
        block = data.block_id

        # Scan around for the appropriate dumbbell
        self.locate_dumbell(color)
        # Move to the dumbbell
        self.move_to_dumbell(color)
        # Lift the dumbbell
        self.lift_dumbbell()
        # Move it to the corresponding block
        self.move_to_block(block)
        # Put down the dumbbell
        self.drop_dumbbell()

    # Save the camera feed and image
    def camera_received(self, data):
        image = self.bridge.imgmsg_to_cv2(data, desired_encoding='bgr8')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)  # convert to HSV
        self.current_img = image

    def locate_dumbell(self, color):
        # Spin around until the dumbbell color is in sight
        found = False
        msg = Twist()
        msg.angular.z = np.pi / 12.0  # turn until the color appears in the camera
        upper, lower = self.color_info[color]['upper'], self.color_info[color]['lower']
        while not found:
            mask = cv2.inRange(self.current_img, lower, upper)
            # Code from the line follower to detect color
            w, h = mask.shape
            mask = mask[w // 3:2 * w // 3, h // 3:2 * h // 3]
            # print(np.sum(mask), np.sum(self.current_img))
            if np.sum(mask) > 0:
                # If this is true, then the color was detected
                found = True
                self.vel_pub.publish(Twist())
            else:
                self.vel_pub.publish(msg)

    def move_to_dumbell(self, color):
        upper, lower = self.color_info[color]['upper'], self.color_info[color]['lower']
        front_dist = np.inf
        stop_dist = 0.20
        while front_dist > stop_dist:  # TODO: stop when close enough to the dumbbell
            mask = cv2.inRange(self.current_img, lower, upper)
            twist = Twist()
            M = cv2.moments(mask)
            h, w, d = self.current_img.shape
            img = cv2.cvtColor(self.current_img, cv2.COLOR_HSV2BGR)
            if M['m00'] > 0:
                cx = int(M['m10'] / M['m00'])
                cy = int(M['m01'] / M['m00'])
                # print(cx, cy)
                err = w / 2 - cx
                if err < 100:
                    # Only adjust linear velocity when aligned to the dumbbell
                    front_dist = self.current_scan.ranges[0]
                k_p = 1.0 / 100.0
                k_lin = 1.0
                twist.linear.x = k_lin * min(0.5, max(0, front_dist - stop_dist))
                twist.angular.z = k_p * err
                self.vel_pub.publish(twist)

    def move_to_block(self, block):
        # Commonly returned labels for digits
        dig_map = {
            1: {'1', 'i', 'l'},
            2: {'2', 's'},
            3: {'3', 't', '5'}
        }
        found = False

        # First turn until facing the correct block
        msg = Twist()
        msg.angular.z = -1.0 * np.pi / 8.0  # turn until the digit appears in the camera
        while not found:
            pred = self.pipeline.recognize([self.current_img])[0]  # get object predictions
            for (label, bb) in pred:
                print(label, bb)
                if label in dig_map[block]:
                    # Digit detected
                    found = True
            if not found:
                # If not found, turn the robot to get a new view
                r = rospy.Rate(2)
                for _ in range(4):
                    self.vel_pub.publish(msg)
                    r.sleep()
                self.vel_pub.publish(Twist())
        self.vel_pub.publish(Twist())

        front_dist = np.inf
        stop_dist = 0.35
        while front_dist > stop_dist:
            # Now that the block is in frame, move to the block
            msg = Twist()
            h, w, d = self.current_img.shape
            correct_box_idx = 0
            max_area = 0
            pred = self.pipeline.recognize([self.current_img])[0]
            if pred:
                for i in range(len(pred)):
                    label, bb = pred[i][0], pred[i][1]
                    if label in dig_map[block]:
                        area = (bb[0][0] - bb[2][0]) * (bb[1][1] - bb[3][1])
                        if area > max_area:
                            # Make sure we're looking at the right face (it will have the largest area)
                            max_area = area
                            correct_box_idx = i
                bbox = pred[correct_box_idx][1]
                cx = np.mean(bbox[:, 0])  # this gets the center of the bounding box
                # print(bbox, pred[correct_box_idx][0], cx, correct_box_idx)
                err = w / 2 - cx  # offset from the bounding box center
                if err < 100:
                    front_dist = self.current_scan.ranges[0]
                k_p = 1.0 / 400.0
                k_lin = 0.5
                msg.linear.x = k_lin * min(0.5, max(0, front_dist - stop_dist))  # proportional control
                msg.angular.z = k_p * err
                r = rospy.Rate(2)
                for _ in range(2):
                    self.vel_pub.publish(msg)
                    r.sleep()
                self.vel_pub.publish(Twist())
            else:
                break

    def lift_dumbbell(self):
        # Close the gripper around the dumbbell
        gripper_joint_goal = [0.006, 0.006]
        self.move_group_gripper.go(gripper_joint_goal, wait=True)
        self.move_group_gripper.stop()

        # Lift the arm up
        arm_joint_goal = [0.0, 0.05, -0.15, -0.79]
        self.move_group_arm.go(arm_joint_goal, wait=True)
        # Calling ``stop()`` ensures that there is no residual movement
        self.move_group_arm.stop()

    def drop_dumbbell(self):
        # Drop the arm down
        arm_joint_goal = [0.0, 0.9, -0.3, -0.79]
        self.move_group_arm.go(arm_joint_goal, wait=True)
        # Calling ``stop()`` ensures that there is no residual movement
        self.move_group_arm.stop()

        # Open the gripper around the dumbbell
        gripper_joint_goal = [0.009, 0.009]
        self.move_group_gripper.go(gripper_joint_goal, wait=True)
        self.move_group_gripper.stop()

        # Move backwards away from the dumbbell so the robot can rotate
        msg = Twist()
        msg.linear.x = -1
        self.vel_pub.publish(msg)
        rospy.sleep(1)
        self.vel_pub.publish(Twist())

    # Sets the current_scan variable to the incoming scan data
    def scan_received(self, data):
        self.current_scan = data

    # Runs the program
    def run(self):
        rospy.spin()

    # Shuts down the program
    def shutdown(self):
        self.vel_pub.publish(Twist())
import gym
import numpy as np
import torch
import sys
from matplotlib import pyplot as plt

sys.path.insert(0, "../../agents")
from q_learning import QLearning  # pylint: disable=import-error

EPISODES = 5000

if __name__ == "__main__":
    env = gym.make("FrozenLake-v0")
    agent = QLearning(env, epsilon=0.8, gamma=0.5, lr=0.01)
    episode_rew = []
    for episode in range(EPISODES):
        # Deciding on the first action
        action = env.action_space.sample()
        state = env.reset()
        ep_rew = 0
        while True:
            next_state, reward, done, _ = env.step(action)
            # env.render()
            ep_rew += reward
            agent.update((state, action, reward, next_state))
            state = next_state
            agent.get_action(state)
def run_q_learning_grid_world():
    world = GridWorld('simple_grid.txt', -0.01, include_treasure=False)
    n_episodes = 500000
    how_often = n_episodes / 500
    stats = IterationStats('stats/ql_simple_grid.csv', dims=5)

    def on_update(state, action, next_state, q_learner):
        # print('[{},{}] - {} -> [{},{}]'.format(state.x, state.y, action[0], next_state.x, next_state.y))
        pass

    def on_episode(episode, time, q_learner, q):
        world.print_policy(print, q_learner.get_policy())
        stats.save_iteration(episode, time, numpy.nanmean(numpy.nanmax(q, axis=0)), q)
        # time.sleep(1)

    for state in world.get_states():
        if state.tile_type == GridWorldTile.GOAL:
            goal_state = state
            break

    def initialize_toward_goal(state: GridWorldTile):
        actions = state.get_actions()
        if len(actions) == 0:
            return []
        diff_x = goal_state.x - state.x
        diff_y = goal_state.y - state.y
        best_value = 0.1
        if len(actions) == 5 and actions[4][0].startswith('get treasure'):
            best_action = actions[4][0]
        elif abs(diff_x) >= abs(diff_y):
            if diff_x > 0:
                best_action = 'move east'
            else:
                best_action = 'move west'
        else:
            if diff_y < 0:
                best_action = 'move north'
            else:
                best_action = 'move south'
        values = [-0.1] * len(actions)
        for i, action in enumerate(actions):
            if action[0] == best_action:
                values[i] = best_value
        return values

    gamma = 0.99
    q_l = QLearning(world, 0.5, 0.05, gamma, on_update=on_update, on_episode=on_episode,
                    initializer=initialize_toward_goal, start_at_0=True, alpha=0.1,
                    every_n_episode=how_often)
    stats.start_writing()
    q_l.run(n_episodes)
    stats.done_writing()
    world.print_policy(print, q_l.get_policy())
TRAINING_ON = True
EPISODES = 1000
Q_MODEL_PATH = "outputs/keras-models/2048_q_model.h5"
Q_MODEL_WEIGHTS_PATH = "outputs/keras-models/2048_q_model_weights.h5"
T_MODEL_PATH = "outputs/keras-models/2048_t_model.h5"

board_size = int(math.sqrt(env.observation_space.shape[0]))
n_output = env.action_space.n

# In[3]:

QL = QLearning(n_x=board_size,
               n_y=n_output,
               q_save_path=Q_MODEL_PATH,
               q_weights_save_path=Q_MODEL_WEIGHTS_PATH,
               t_save_path=T_MODEL_PATH,
               total_episodes=EPISODES,
               restore_model=True,
               is_training_on=TRAINING_ON,
               T=10)

# In[ ]:

for episode in range(EPISODES):
    observation = env.reset()
    QL.curr_episode = episode
    while True:
        if RENDER_ENV:
            env.render()
        valid_move = False
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 26 00:02:49 2018

@author: Shiratori
"""

import graphics as gfx
import pong_model as pm
from q_learning import QLearning

if __name__ == '__main__':
    # Set up environment
    environment = pm.PongModel(0.5, 0.5, -0.03, 0.01, 0.4, paddleX=0)
    window = gfx.GFX(wall_x=1, player_x=0)
    window.fps = 9e16

    # Set up model
    model = QLearning(environment, window, C=5e3, gamma=0.99, explore=-1,
                      threshold=-1, log=True, log_file='q_test_log_(agent_B).txt',
                      mode='test', q_table_file='q_q_table_(agent_B).csv')

    # Training
    model.train()
def run():
    """execute the TraCI control loop"""
    step = 0

    # Initialize QLearning
    num_phase = 2
    max_num_car_stopped = 10
    num_lane = 4
    num_wait_time_category = 10
    num_action = 10
    q = QLearning(num_phase, max_num_car_stopped, num_lane, num_action)

    # We start with phase 2 where EW has green
    # traci.trafficlight.setPhase("0", 2)
    while traci.simulation.getMinExpectedNumber() > 0:
        traci.simulationStep()
        # next_action_idx = 9

        # Current traffic light phase
        light_phase = traci.trafficlight.getPhase("0")

        # If the current phase is yellow and the next action has not been decided yet,
        # decide the duration of the next phase
        if (light_phase == 1 or light_phase == 3) and not q.is_calculate_next_action:
            q.is_set_duration = False

            # Get the phase the light will take next
            next_light_phase = 0
            if light_phase == 1:
                next_light_phase = 2

            # Number of cars stopped in each lane
            count_0 = min(traci.lanearea.getLastStepHaltingNumber("0"), 9)
            count_1 = min(traci.lanearea.getLastStepHaltingNumber("1"), 9)
            count_2 = min(traci.lanearea.getLastStepHaltingNumber("2"), 9)
            count_3 = min(traci.lanearea.getLastStepHaltingNumber("3"), 9)

            # Next light phase plus the current congestion situation
            current_state_dict = {
                'light_phase': next_light_phase,
                'nums_car_stopped': [count_0, count_1, count_2, count_3]
            }
            current_digitized_state = q.digitize_state(current_state_dict)
            q.next_action_idx = q.get_action(current_digitized_state)
            q.is_calculate_next_action = True

            # Reward
            reward = -np.sum([x**1.5 for x in [count_0, count_1, count_2, count_3]])
            q.rewards.append(reward)

            # Each green/red phase has just ended, so compute the reward for the action
            # taken in the previous state here, and update the Q-table for that previous
            # state/action pair
            q.update_Qtable(q.previous_digitized_state, q.previous_action, reward,
                            current_digitized_state)

            q.previous_digitized_state = current_digitized_state
            q.previous_action_idx = q.next_action_idx

        # If the current phase is 0 or 2 and the duration has not been set yet, set it
        if (light_phase == 0 or light_phase == 2) and not q.is_set_duration:
            traci.trafficlight.setPhaseDuration("0", q.action[q.next_action_idx])
            q.is_set_duration = True
            q.is_calculate_next_action = False
            print("set phase {} for {} seconds".format(
                light_phase, q.action[q.next_action_idx]))

        step += 1
        if step % 10000 == 0:
            plot_graph(q.rewards)

    traci.close()
    sys.stdout.flush()