def main(): # initialization of the grid grid = Grid() # creating markov decision and q learning classes for using algorithms mdp = MarkovDecision() ql = QLearning() print("\n") print("Value iteration results: \n") # parameter order: p, reward, discount utilities, policies = mdp.iterateValues(grid, 1, 0, 1) grid.printPolicies() grid.resetGrid() print("\n") print("Policy iteration results: \n") # parameter order: p, reward, discount mdp.iteratePolicies(grid, 1, 0, 1) grid.printPolicies() grid.resetGrid() print("\n") print("Q Learning results: \n") #parameter order: grid, p, reward, learning_rate, discount, epsilon, iteration, value_iteration results ql.learnQ(grid, 1, 0, 0.1, 1, 0, 1000, utilities, policies) grid.printQValues() grid.printQLearningResults()
def __init__(self): # rospy.init_node('robot_action') # Initialize publishers and subscribers rospy.on_shutdown(self.shutdown) self.bridge = cv_bridge.CvBridge() self.action_sub = rospy.Subscriber('/q_learning/robot_action', RobotMoveDBToBlock, self.action_received) self.camera_sub = rospy.Subscriber('/camera/rgb/image_raw', Image, self.camera_received) self.scan_sub = rospy.Subscriber('/scan', LaserScan, self.scan_received) self.vel_pub = rospy.Publisher('cmd_vel', Twist, queue_size=1) self.action_pub = rospy.Publisher('/q_learning/robot_action', RobotMoveDBToBlock, queue_size=3) # Keras pipeline self.pipeline = keras_ocr.pipeline.Pipeline() # Variables for the scan and image results self.current_scan = [] self.current_img = [] # Import the Q-Learning class functions self.q_learning = QLearning() rospy.sleep(1) # Set color ranges for red, green, and blue self.color_info = { #HSV upper and lower bounds for each color (need to test these ranges work) 'red': { 'lower': np.array([0, 250, 150]), 'upper': np.array([10, 255, 180]) }, 'blue': { 'lower': np.array([110, 250, 150]), 'upper': np.array([130, 255, 190]) }, 'green': { 'lower': np.array([45, 250, 150]), 'upper': np.array([75, 255, 190]) } } # Initialize robot arm components self.move_group_arm = moveit_commander.MoveGroupCommander("arm") self.move_group_gripper = moveit_commander.MoveGroupCommander( "gripper") # Set initial arm position self.move_group_arm.go([0.0, 0.9, -0.3, -0.79], wait=True) self.move_group_gripper.go([0.009, 0.009], wait=True) # Get the action sequence to maximize reward self.extract_action()
def main(argv): setpath() from q_learning import QLearning from grid_world import Grid # 5 rows, 4 cols, unreachables : [(1,1), (1,3)], pits : [(3,1)], goal : (4,3) g = Grid(5, 4, [(1, 1), (1, 3)], [(3, 1)], (4, 3)) q = QLearning(g) q.learn()
def create_q_learning_solver(labyrinth): # Setting up the q-learning R and Q matrixes q_learning_config_dict = { 'labyrinth': labyrinth, 'learning_rate': constant.STD_LEARNING_RATE, 'exploration_chance': constant.STD_EXPLORATION_CHANCE } q_learning_solver = QLearning(q_learning_config_dict) return q_learning_solver
def run_games(game_length, left_arm_mean, left_arm_std, n_players, right_arm_mean, right_arm_std, use_asrn, learning_rate = 0.01, gamma=0.95, epsilon=1.0, epsilon_decay=0.99): all_rewards = [] all_goods = [] all_losses = [] all_q_tables = [] trained_agent_q_values = [left_arm_mean / (1 - gamma), right_arm_mean / (1 - gamma)] for j in range(n_players): two_armed_bandit = BrokenArmedBandit(left_arm_mean=left_arm_mean, right_arm_mean=right_arm_mean, left_arm_std=left_arm_std, right_arm_std=right_arm_std) ## giving the real mean as initialization(!) left_initial_mean = trained_agent_q_values[0] right_initial_mean = trained_agent_q_values[1] q_learning = QLearning(left_initial_mean, right_initial_mean, learning_rate, gamma, epsilon, epsilon_decay) rewards = np.zeros((game_length, 1)) goods = np.zeros((game_length, 1)) losses = np.zeros((game_length, 1)) q_table = [] if use_asrn: asrn = BinsASRN(0, learning_period=game_length/10) for i in range(game_length): right, reward_estimation = q_learning.choose() good = q_learning.right_mean > q_learning.left_mean goods[i] = good q_table.append([q_learning.right_mean, q_learning.left_mean]) reward = two_armed_bandit.pull(right) rewards[i] = reward if use_asrn: if right: updated_right_mean = (1 - q_learning.learning_rate) * q_learning.right_mean + q_learning.learning_rate * (reward + q_learning.gamma * q_learning.right_mean) reward = asrn.noise(q_learning.right_mean, updated_right_mean, reward) else: updated_left_mean = (1 - q_learning.learning_rate) * q_learning.left_mean + q_learning.learning_rate * (reward + q_learning.gamma * q_learning.left_mean) reward = asrn.noise(q_learning.left_mean, updated_left_mean, reward) loss = q_learning.update(right, reward) losses[i] = loss all_rewards.append(rewards) all_goods.append(goods) all_losses.append(losses) all_q_tables.append(q_table) return all_q_tables, all_rewards, all_goods, np.asarray(all_losses)
def q_learning_greedy_probability_policy(): env = gym.make('FrozenLake-v0') q_learning = QLearning(env.action_space.n, env.observation_space.n, epsilon=0.1, learning_rate=0.1) q_learning.set_general_state_action_values([0.5, 1, 0.5, 0.5]) episode_rewards = [] all_over_reward = 0.0 for i_episode in range(7000): # We start a new episode with have to reset the environment and stats observation = env.reset() accumulated_reward = 0.0 for t in range(100): # Show current state # env.render() # Choose action based on current experience action = q_learning.greedy_probability_policy(observation) # Save previous state, and commit action, resulting new current state previous_observation = observation observation, reward, done, info = env.step(action) # Accumulate more reward accumulated_reward += reward # Train algorithm based on new experience q_learning.update_state_action_function(previous_observation, action, reward, observation) # if done: print "Episode finished after {} timesteps".format(t + 1) print "Total reward for episode %i: %i" % (i_episode, accumulated_reward) all_over_reward += accumulated_reward episode_rewards.append(accumulated_reward) break plot = Plot() plot.plot_evolution(episode_rewards) print q_learning.q_table q_learning.write_q_function_dump()
def __init__(self, env, gamma, alpha): # sets self.env = env, state = None, next_waypoint = None, and a default color super(LearningAgent, self).__init__(env) self.color = 'red' # override color # simple route planner to get next_waypoint self.planner = RoutePlanner(self.env, self) # Initialize additional variables and QLearning here self.actions = ['forward', 'left', 'right', None] self.states = [] self.q_learn = QLearning(self.actions, states=self.states, gamma=gamma, alpha=alpha) self.p_action = None self.p_reward = None
def main(): env = Env() q_learning_agent = QLearning() if len(load_table_name) > 20: print(f'loading table {load_table_name}') q_learning_agent.load_table(load_table_name) print(env.check_car_pos()) assert env.check_car_pos() == (269, 116) updater = QLearningUpdater(q_learning_agent) epsilon_greedy = EpsilonGreedyAgent(q_learning_agent) if learn: rewards = train_agent(env, epsilon_greedy, q_learning_agent, updater) np.save('q_l_tables/rewards.npy', rewards) else: print('exploiting') play_agent(env, q_learning_agent)
def greedy_policy(): env = gym.make('FrozenLake-v0') q_learning = QLearning(env.action_space.n, env.observation_space.n) q_learning.set_general_state_action_values([0.5, 1, 0.5, 0.5]) for i_episode in range(20): observation = env.reset() for t in range(100): env.render() print(observation) action = q_learning.greedy_policy(observation) observation, reward, done, info = env.step(action) if done: print "Episode finished after {} timesteps".format(t + 1) break
def run_tests(): """ Runs all tests defined in task2_tests.yaml. Returns: results: Results dataframe. """ with open(FILENAME) as file: # Loads testing parameters from the yaml file. tests = yaml.safe_load(file) # create a dataframe to keep the results test_dict = tests['Tests'] results = pd.DataFrame(test_dict) results['Last Average Score'] = "" results['No of Q-Learning episodes'] = "" # run experiments: for i, test in enumerate(test_dict): grid = Rooms(test["env_size"], testing=True) learning = QLearning(grid, test["gamma"], test["alpha"], test["agent_start_pos"]) e_greedy = Policy("e-greedy", test["epsilon"], test["decay"]) greedy = Policy(policy_type="greedy") experiment = Experiments(grid, learning, greedy, test["iters"], test["agent_start_pos"], test["test_no"]) for session in range(test["iters"]): learning.run_multiple_episodes(test["batch_episodes"], e_greedy) mean_reward = experiment.run_experiments(test["exp_per_batch"]) results.loc[i, 'Last Average Score'] = mean_reward results.loc[i, 'No of Q-Learning episodes'] = (session + 1) * test["batch_episodes"] # save results to csv file filename = 'results/' + 'test_table.csv' results.to_csv(filename) # plot & save graphs experiment.generate_results(test["test_no"], test) return results
def main(algorithm, track, x_start, y_start, discount, learning_rate, threshold, max_iterations, epsilon=None, reset_on_crash=False): """ Program entry. Runs selected algorithm on selected track, at given coordinates, with given parameters :param algorithm: String :param track: List :param x_start: Int :param y_start: Int :param discount: Float :param learning_rate: Float :param threshold: Float :param max_iterations: Int :param epsilon: Float :param reset_on_crash: Boolean :return: None """ with open(track) as f: specs = f.readline().strip().split(',') rows = int(specs[0]) cols = int(specs[1]) layout = f.read().splitlines() initial_state = (x_start, y_start, 0, 0) initial_action = (0, 0) agent = Car(initial_action, epsilon) environment = RaceTrack(rows, cols, layout, initial_state, reset_on_crash=reset_on_crash) if algorithm == 'value_iteration': value_iterator = ValueIteration(discount, threshold, max_iterations, environment, agent) value_iterator.run() path = value_iterator.extract_policy(initial_state) value_iterator.plot_max_diffs() elif algorithm == 'q_learning': q_learner = QLearning(discount, learning_rate, threshold, max_iterations, environment, agent) path = q_learner.run() q_learner.plot_avg_cost() elif algorithm == 'sarsa': sarsa = Sarsa(discount, learning_rate, threshold, max_iterations, environment, agent) path = sarsa.run() sarsa.plot_avg_cost() else: print("No algorithm selected") return None draw_track(path, layout)
def __init__(self): # Publish Q-matrix updates to Q-matrix topic self.q_matrix_pub = rospy.Publisher("/RobotFinalProject/QMatrix", QMatrix, queue_size=10) # Publish to /robot_action for training the Q-matrices self.q_learning = QLearning() self.robot_action_pub = rospy.Publisher( "/RobotFinalProject/robot_action", RobotTasksDoors, queue_size=10) # Subscribe to environment to receive reward updates self.reward = rospy.Subscriber("/RobotFinalProject/reward", QLearningReward, self.update_q_matrix) rospy.sleep(2) # Initialize Q-matrices, one for each robot self.q_matrix_msg_a = QMatrix() self.q_matrix_msg_b = QMatrix() # For ease of use in training, we'll store the Q-matrices in a numpy array self.q_matrix_arr_a = np.zeros((128, 7)) self.q_matrix_arr_b = np.zeros((128, 7)) # Keep track of the 5 most recently updated Q-values per Q-matrix self.q_history_a = [] self.q_history_b = [] # Keeps track of current (starting) state, initialized to 0th state. This state accounts for both robots self.current_state = 0 # Get new state and action from starting state new_state, action = self.get_random_action(self.current_state) # Current action is action from above (action from current state to new state) self.current_action = action # Keep track of new state self.new_state = new_state # Move the robot according to the current action self.complete_task_or_door(self.current_action)
def __init__(self, simulation_run, seed: int): logger.debug("Start initializing this: %s", self.__class__) # self.SimulationManager = simulation_run.SimulationManager sim_manager = simulation_run.SimulationManager self.market_size = sim_manager.marketSize self.competition_type = sim_manager.competition_type self.examination_interval_size = sim_manager.sizeOfExaminationInterval self.price = 0.0 self.quantity = 0.0 self.profit = 0.0 self.qlearning = QLearning(seed, sim_manager) # holds the firm's individual Q-learning algorithm self.get_other_firm = simulation_run.getOtherFirm # ArrayLists to store its parameters over the time self.prices = deque(maxlen=self.examination_interval_size) self.quantities = deque(maxlen=self.examination_interval_size) self.profits = deque(maxlen=self.examination_interval_size) self.cournot = Cournot() logger.debug("Init completed: %s", self.__class__)
def __init__(self): # Publish Q-matrix updates to Q-matrix topic self.q_matrix_pub = rospy.Publisher("/q_learning/QMatrix", QMatrix, queue_size=10) # Publish to /robot_action for phantom robot movement self.q_learning = QLearning() self.action_pub = rospy.Publisher("/q_learning/robot_action", RobotMoveDBToBlock, queue_size=10) # Subscribe to environment to receive reward updates self.reward = rospy.Subscriber("/q_learning/reward", QLearningReward, self.update_q_matrix) rospy.sleep(2) # Initialize Q-matrix self.q_matrix_msg = QMatrix() # For ease of use in training, we'll store the Q-matrix in a numpy array self.q_matrix_arr = np.zeros((64, 9)) # Keep track of the 5 most recently updated Q-values self.q_history = [] # Keeps track of current (starting) state, initialized to 0th state self.current_state = 0 # Get new state and action from starting state new_state, action = self.get_random_action(self.current_state) # Current action is action from above (action from current state to new state) self.current_action = action # Keep track of new state self.new_state = new_state # Move the robot according to the current action self.move_DB(self.current_action ) # update_q_matrix() will be called as callback
def __init__(self, total_episodes: int): self.window_width = constant.WIDTH * constant.TILE self.window_height = constant.HEIGHT * constant.TILE self._running = True self._display = None self._snake = None self._mouse = None self.episode = 1 self.total_episodes = total_episodes self.score = 0 self.max_score = 0 self.frames = 0 self.game_stats = [] self.specs = [] self.test_run = False self.snake = Snake() self.mouse = Mouse(constant.WIDTH, constant.HEIGHT, self.snake.body_position()) self.q = QLearning()
def run_q_learning_forest(S, r1, r2): forest = WrappedForest(S, r1, r2) n_episodes = 10000 how_often = n_episodes / 100 stats = IterationStats('stats/ql_forest.csv', dims=2) def on_episode(episode, time, q_learner, q): forest.print_policy(print, q_learner.get_policy()) stats.save_iteration(episode, time, numpy.nanmean(numpy.nanmax(q, axis=0)), q) def is_done(state, action, next_state): if next_state.state_num == 0: return True return False gamma = 0.99 start = time.time() numpy.random.seed(5263228) q_l = QLearning(forest, 0.5, 0.2, gamma, on_episode=on_episode, start_at_0=True, alpha=0.1, is_done=is_done, every_n_episode=how_often) stats.start_writing() q_l.run(n_episodes) stats.done_writing() forest.print_policy(print, q_l.get_policy()) print('took {} s'.format(time.time() - start)) stats = IterationStats('stats/ql_forest.csv', dims=2) analysis.create_iteration_value_graph( stats, 'average Q', 'Average Q for each iteration on Forest Q Learning', 'forest_results')
print(ob[0], ob[1]) num_bins = [10, 3] num_pos_actions = len(actions) print('num_pos_actions', num_pos_actions) # hyperparams: discount = 1.0 episodes = 100 epsilon = [0.5, 1.0, episodes // 2] # Epsilon start, start decay index, stop decay index lr = [0.5, 1.0, episodes // 2 ] # Learning rate start, start decay index, stop decay index q_learning = QLearning(env, num_bins, num_pos_actions, env_ranges, discount, episodes, epsilon, lr) print('q-table shape', q_learning.q_table.shape) obs = q_learning.reset_state() # Reset the environment and get the initial obs = [obs[i] for i in obs_to_use] print('\nInitial observation:', obs) action_to_maximise_q = q_learning.action_to_maximise_q( obs) # Find optimal action action = q_learning.decide_on_action( action_to_maximise_q) # Decide whether to use optimal or random action observation, reward_current, done = q_learning.perform_sim_step( action) # env.step(action) # Perform the first action NUM_TO_SHOW = 5
TRAINING_ON = True EPISODES = 1000 Q_MODEL_PATH = "outputs/keras-models/2048_q_model.h5" Q_MODEL_WEIGHTS_PATH = "outputs/keras-models/2048_q_model_weights.h5" T_MODEL_PATH = "outputs/keras-models/2048_t_model.h5" board_size = int(math.sqrt(env.observation_space.shape[0])) n_output = env.action_space.n # In[3]: QL = QLearning(n_x=board_size, n_y=n_output, q_save_path=Q_MODEL_PATH, q_weights_save_path=Q_MODEL_WEIGHTS_PATH, t_save_path=T_MODEL_PATH, total_episodes=EPISODES, restore_model=True, is_training_on=TRAINING_ON, T=10) # In[ ]: for episode in range(EPISODES): observation = env.reset() QL.curr_episode = episode while True: if RENDER_ENV: env.render() valid_move = False
def main(argv): # Get command line arguments try: opts, args = getopt.getopt(argv, "pi:") except getopt.GetoptError: print("main.py [-p] [-i=n_iter]") sys.exit(1) # Plot switch plot = False # Default iterations n_iter = 100000 # Parse command line arguments for opt, arg in opts: # Help if opt == "-h": print("main.py [-p] [-i=n_iter]") sys.exit() # Plot elif opt == "-p": plot = True # Number of iterations elif opt == "-i": n_iter = arg # Construct grid grid = construct_grid() # Find solution with value iteration print("Performing value iteration...") vi = ValueIteration(grid) vi = vi.solve() print("----------") print("The value for each cell is:") print("(x,y): value") for cell in grid: print(cell.name_x()+","+cell.name_y()+": "+\ str(vi[0][cell.get_name()])) print("----------") print("The policy found by value iteration is:") print("(x,y): action") for cell in grid: print(cell.name_x()+","+cell.name_y()+": "+\ str(vi[1][cell.get_name()])) # Find solution with Q-learning print("\n") print("Performing Q-learning...") ql = QLearning(grid, N_iter = int(n_iter)) ql = ql.solve() ql_states = ql[0] ql_Q = ql[1] print("----------") print("The policy found by Q-learning is.") print("(x,y): action") actions = ["north","east","south","west"] for cell in grid: ind = ql_states.index(cell.get_name()) action = actions[np.argmax(ql_Q[ind,])] print(cell.name_x()+","+cell.name_y()+": "+str(action)) if plot: # Convergence graph for q-learning: # best Q-value of the best action for the START state fig, ax = plt.subplots() ax.plot(ql[2][0:ql[3]], label = "Q-values for best action in START") ax.plot((0,ql[3]),(vi[0]["11"],vi[0]["11"]), label = "Values of START") plt.xlabel("Iterations") plt.show()
from q_learning import QLearning from SARSA import SARSALearning from eligibility_traces import EligibilityTraces from function_approximation import FApprox from mountain_cart import run_methods, self_iterate import pickle if __name__ == "__main__": # Initialize a method methods = [ QLearning("MountainCar-v0", print_progress=False), SARSALearning("MountainCar-v0", print_progress=False), FApprox("MountainCar-v0", print_progress=False), EligibilityTraces("MountainCar-v0", print_progress=False) ] # Run the tests run_methods(methods) method = methods[0] method.q_table = pickle.load( open("Best_Method_" + str(type(method).__name__) + ".p", "rb")) method.evaluate() method.display() self_iterate(methods[0])
# -*- coding: utf-8 -*- """ Created on Thu Apr 26 00:02:49 2018 @author: Shiratori """ import graphics as gfx import pong_model as pm from q_learning import QLearning if __name__ == '__main__': # Set up environment environment = pm.PongModel(0.5, 0.5, -0.03, 0.01, 0.4, paddleX=0) window = gfx.GFX(wall_x=1, player_x=0) window.fps = 9e16 # Set up model model = QLearning(environment, window, C=5e3, gamma=0.99, explore=-1, threshold=-1, log=True, log_file='q_test_log_(agent_B).txt', mode='test', q_table_file='q_q_table_(agent_B).csv') # Training model.train()
self.root_frame.q_learning_simulator = None pygame.display.quit() def draw_robot(self, i, j): top_left = (self.state_width * j, self.state_height * i) self.robot_rect = [ top_left[0], top_left[1], self.state_width, self.state_height ] self.screen.blit(self.robot, self.robot_rect) def draw_world(self): for i in range(len(self.world)): for j in range(len(self.world[i])): self.draw_rectangle(i, j, self.world[i][j]) def draw_rectangle(self, i, j, item): top_left = (self.state_width * j, self.state_height * i) color = self.item_colors[item] thickness = self.item_thickness[item] pygame.draw.rect( self.screen, color, (top_left[0], top_left[1], self.state_width, self.state_height), thickness) if __name__ == '__main__': file_name = "./board.txt" q_learning = QLearning("./board.txt", is_stochastic=True) QLearningSimulator(q_learning, delay_time=100)
def run(): """execute the TraCI control loop""" step = 0 # initialize QLearning num_phase = 2 max_num_car_stopped = 10 num_lane = 4 num_wait_time_category = 10 num_action = 10 q = QLearning(num_phase, max_num_car_stopped, num_lane, num_action) # we start with phase 2 where EW has green #traci.trafficlight.setPhase("0", 2) while traci.simulation.getMinExpectedNumber() > 0: traci.simulationStep() #next_action_idx = 9 # 現在の信号のフェーズ light_phase = traci.trafficlight.getPhase("0") # 現在のフェーズが黄色かつまだ次のアクションを決めていなかったら、次のフェーズの秒数を決める if (light_phase == 1 or light_phase == 3) and not q.is_calculate_next_action: q.is_set_duration = False # 次に信号が取るフェーズを取得 next_light_phase = 0 if light_phase == 1: next_light_phase = 2 # それぞれのレーンで停まっている車の数 count_0 = min(traci.lanearea.getLastStepHaltingNumber("0"), 9) count_1 = min(traci.lanearea.getLastStepHaltingNumber("1"), 9) count_2 = min(traci.lanearea.getLastStepHaltingNumber("2"), 9) count_3 = min(traci.lanearea.getLastStepHaltingNumber("3"), 9) # 次の信号のフェーズと現在の混雑状況 current_state_dict = { 'light_phase': next_light_phase, 'nums_car_stopped': [count_0, count_1, count_2, count_3] } current_digitized_state = q.digitize_state(current_state_dict) q.next_action_idx = q.get_action(current_digitized_state) q.is_calculate_next_action = True # reward reward = -np.sum( [x**1.5 for x in [count_0, count_1, count_2, count_3]]) q.rewards.append(reward) # 各青赤フェーズが終了したタイミングで、以前の状況に対してとったアクションに対するリワードを計算するため、このタイミングで、前回のstateとactionに対するリワードを計算する? q.update_Qtable(q.previous_digitized_state, q.previous_action, reward, current_digitized_state) q.previous_digitized_state = current_digitized_state q.previous_action_idx = q.next_action_idx # 現在のフェーズが0か2でかつまだ秒数をセットしていなかったら、秒数をセットする if (light_phase == 0 or light_phase == 2) and not q.is_set_duration: traci.trafficlight.setPhaseDuration("0", q.action[q.next_action_idx]) q.is_set_duration = True q.is_calculate_next_action = False print("set phase {} for {} seconds".format( light_phase, q.action[q.next_action_idx])) step += 1 if step % 10000 == 0: plot_graph(q.rewards) traci.close() sys.stdout.flush()
state_, reward, done = env.step(action) step_count += 1 # 增加步数 # 机器人大脑从这个过渡(transition) (state, action, reward, state_) 中学习 RL.learn(str(state), action, reward, str(state_)) # 机器人移动到下一个 state state = state_ # 如果踩到炸弹或者找到宝藏, 这回合就结束了 if done: print("回合 {} 结束. 总步数 : {}\n".format(episode + 1, step_count)) break # 结束游戏并关闭窗口 print('游戏结束') env.destroy() if __name__ == "__main__": # 创建环境 env 和 RL env = Maze() RL = QLearning(actions=list(range(env.n_actions))) # 开始可视化环境 env.after(100, update) env.mainloop() print('\nQ 表:') print(RL.q_table)
from q_learning import QLearning from SARSA import SARSALearning from ElgibilityTraces import ElgibilityTraces from function_approximation import FApprox import matplotlib.pyplot as plt import numpy as np if __name__ == "__main__": # Initialize a method methods = [ QLearning("MountainCar-v0"), SARSALearning("MountainCar-v0"), FApprox("MountainCar-v0"), ElgibilityTraces("MountainCar-v0") ] method = methods[3] method.train(1000) # Test the method method.evaluate() method.plot() #for method in methods: # graph = np.zeros(method.env._max_episode_steps / 10) ## runs = 30 # for run in range(1, runs + 1): # method.reset_model() # method.train(1000) # graph += method.convergence_graph / runs
param_default_reward = 0 param_agent_loc = (0, 0) param_alpha = 0.1 param_gamma = 0.99 param_epsilon = 0.9 hyper_n_episodes = 100 env = Env((param_grid_size, param_grid_size), param_agent_loc, (param_block_size, param_block_size), param_default_reward) env.set_block((6, 6), param_default_reward, 100, True) env.set_block((6, 5), param_default_reward, -100, True) #env.set_block((6, 8), 100, 0, False) #env.set_block((8, 8), 10, 0, False) #env.set_block((6, 7), 10, 0, False) q = QLearning(param_alpha, param_gamma, param_epsilon, Env.N_ACTIONS) for episode in range(hyper_n_episodes): env.reset() is_terminal = False state = 0 action = q.episode_start(state) while is_terminal == False: state, action, reward, state_next, is_terminal = env.step(action) print("agent from state {} --> state {}, take action {}".format( state, state_next, action)) if is_terminal == True: print("episode {}: agent reached terminal".format(episode)) action = q.step(state, action, reward, state_next) # show current state values action_values = q.action_values(state)
env_ranges = [(-0.5, 0.5), (0, 1), (-0.5, 0.5), (-0.5, 0.5), (-0.4, 0.4), (-0.4, 0.4), (-1, 1), (-1, 1)] print('\nObservation ranges:') for ob in enumerate(env_ranges): print(ob[0], ob[1]) num_bins = [3, 20, 3, 6, 6, 6, 3, 3] num_pos_actions = len(actions) q_learning = QLearning(env=env, num_bins=num_bins, num_pos_actions=num_pos_actions, env_ranges=env_ranges, discount=0, episodes=0, epsilon=None, lr=None, USE=True) env = gym.make('LunarLander-v2') q_learning.q_table = np.load('./data_lunarlander/0_9000.npy') for _ in range(10): obs = q_learning.reset_state() # Reset the environment and get the initial done = False while not done:
time.sleep(1) class GraphHolder: def __init__(self): self.graph = np.zeros(50) self.number = 100 threadList = [] nameList = [] graph = GraphHolder() for i in range(graph.number): threadList.append("Thread-" + str(i)) nameList.append(QLearning("MountainCar-v0")) queueLock = threading.Lock() workQueue = queue.Queue() threads = [] threadID = 1 # Create new threads for tName in threadList: thread = myThread(threadID, tName, workQueue, graph) thread.start() threads.append(thread) threadID += 1 # Fill the queue queueLock.acquire()
def run_q_learning_grid_world(): world = GridWorld('simple_grid.txt', -0.01, include_treasure=False) n_episodes = 500000 how_often = n_episodes / 500 stats = IterationStats('stats/ql_simple_grid.csv', dims=5) def on_update(state, action, next_state, q_learner): #print('[{},{}] - {} -> [{},{}]'.format(state.x, state.y, action[0], next_state.x, next_state.y)) pass def on_episode(episode, time, q_learner, q): world.print_policy(print, q_learner.get_policy()) stats.save_iteration(episode, time, numpy.nanmean(numpy.nanmax(q, axis=0)), q) #time.sleep(1) for state in world.get_states(): if state.tile_type == GridWorldTile.GOAL: goal_state = state break def initialize_toward_goal(state: GridWorldTile): actions = state.get_actions() if len(actions) == 0: return [] diff_x = goal_state.x - state.x diff_y = goal_state.y - state.y best_value = 0.1 if len(actions) == 5 and actions[4][0].startswith('get treasure'): best_action = actions[4][0] elif abs(diff_x) >= abs(diff_y): if diff_x > 0: best_action = 'move east' else: best_action = 'move west' else: if diff_y < 0: best_action = 'move north' else: best_action = 'move south' values = [-0.1] * len(actions) for i, action in enumerate(actions): if action[0] == best_action: values[i] = best_value return values gamma = 0.99 q_l = QLearning(world, 0.5, 0.05, gamma, on_update=on_update, on_episode=on_episode, initializer=initialize_toward_goal, start_at_0=True, alpha=0.1, every_n_episode=how_often) stats.start_writing() q_l.run(n_episodes) stats.done_writing() world.print_policy(print, q_l.get_policy())
import gym import numpy as np import torch import sys from matplotlib import pyplot as plt sys.path.insert(0, "../../agents") from q_learning import QLearning # pylint: disable=import-error EPISODES = 5000 if __name__ == "__main__": env = gym.make("FrozenLake-v0") agent = QLearning(env, epsilon=0.8, gamma=0.5, lr=0.01) episode_rew = [] for episode in range(EPISODES): # Deciding first action action = env.action_space.sample() state = env.reset() ep_rew = 0 while True: next_state, reward, done, _ = env.step(action) # env.render() ep_rew += reward agent.update((state, action, reward, next_state)) state = next_state agent.get_action(state)