def play(params, task, max_time):
    """
    Play a CraftWorld task interactively from the console.

    On every step the map and some debugging information are printed, and
    one command is read from stdin ("w", "d", "s", "a" map to up, right,
    down, left). Commands that are unknown, or whose action is not in
    game.get_actions(), are reported as forbidden; they still consume a step.

    Args:
        params: parameters forwarded to CraftWorld.
        task: reward machine specification forwarded to RewardMachine.
        max_time: maximum number of prompts shown to the player.

    Returns:
        The reward produced by the last executed action, or 0 if no action
        was ever executed.
    """
    from reward_machines.reward_machine import RewardMachine

    # commands
    str_to_action = {
        "w": Actions.up.value,
        "d": Actions.right.value,
        "s": Actions.down.value,
        "a": Actions.left.value,
    }

    # play the game!
    game = CraftWorld(params)
    rm = RewardMachine(task)
    s1 = game.get_state()
    u1 = rm.get_initial_state()

    # Defined up front so the final `return` cannot hit an unbound name when
    # max_time is 0 or every input was rejected.
    reward = 0

    for _ in range(max_time):
        # Showing game
        game.show_map()
        print("Events:", game.get_true_propositions())
        print("Features:", game.get_features())
        print("Features.shape:", game.get_features().shape)
        print("Features.manhattan_distance:", game._get_features_manhattan_distance())
        acts = game.get_actions()

        # Getting action
        print("\nAction? ", end="")
        a = input()
        print()

        # Executing action
        if a in str_to_action and str_to_action[a] in acts:
            game.execute_action(str_to_action[a])

            s2 = game.get_state()
            events = game.get_true_propositions()
            u2 = rm.get_next_state(u1, events)
            # The raw command string `a` is what gets passed as the action here.
            reward = rm.get_reward(u1, u2, s1, a, s2)

            if game.env_game_over or rm.is_terminal_state(u2):  # Game Over
                print("Game Over")
                break

            s1, u1 = s2, u2
        else:
            print("Forbidden action")

    game.show_map()
    return reward
def load_model_and_test_composition(alg_name, tester, curriculum, num_times, new_task, show_print):
    """
    Testing a single task (see run_new_task.py)

    For each run: restore the saved policy bank, search every linearized plan
    of `new_task` for the cheapest sequence of learned policies, then execute
    that sequence in a fresh game while accumulating the discounted reward
    given by the new task's reward machine.

    Args:
        alg_name: not used by this function.
        tester: provides task parameters, learning parameters and the
            reward machines the policy bank was trained on.
        curriculum: restarted at the start of every run; its current task
            selects the game to build.
        num_times: number of runs; run `n` seeds `random` with `n`.
        new_task: object exposing `rm_file` and `get_linearized_plan()`.
        show_print: when true, render the game and print each action.

    Returns:
        Discounted reward of the last run. 0.0 when `num_times` is 0 or when
        no plan could be solved in the last run.

    TODO: refactor with get_qrm_generalization_performance
    """
    r_total = 0.0
    for n in range(num_times):
        random.seed(n)
        sess = tf.Session()
        try:
            curriculum.restart()

            # Initialize a policy_bank graph to be loaded with saved model
            # NOTE(review): every run adds a policy bank to the default TF
            # graph; presumably only num_times == 1 is used - confirm.
            task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
            num_features = len(task_aux.get_features())
            num_actions = len(task_aux.get_actions())
            policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                        tester.learning_params,
                                        tester.get_reward_machines())

            # Load the model
            saver = tf.train.Saver()

            # Get path
            if task_aux.params.game_type == "craftworld":
                save_model_path = '../model/' + str(
                    task_aux.params.game_type) + '/' + task_aux.game.get_map_id()
            else:
                save_model_path = '../model/' + str(task_aux.params.game_type)

            saver.restore(sess, tf.train.latest_checkpoint(save_model_path))

            reward_machines = tester.get_reward_machines()
            print("Loaded {} policies (RMs)".format(len(reward_machines)))

            # partial-ordered RM of new task
            new_task_rm = RewardMachine(new_task.rm_file)
            linearized_plans = new_task.get_linearized_plan()
            print("There are {} possible linearized plans: {}".format(
                len(linearized_plans), linearized_plans))

            least_cost = float('inf')
            best_policy = []  # list of (rm_id, state_id) corresponding to each action
            for curr_plan in linearized_plans:
                # Get the least cost path for the current linearized plan;
                # the best cost so far is handed to the search as its bound.
                cost, switching_seq = dfs_search_policy(curr_plan, tester, curriculum,
                                                        new_task_rm, reward_machines,
                                                        policy_bank, bound=least_cost)
                if cost < least_cost:
                    print(cost, switching_seq)
                    least_cost = cost
                    best_policy = switching_seq

            # Couldn't solve the task: int(inf) below would raise OverflowError,
            # so report the failure the same way the generalization test does.
            if least_cost == float('inf'):
                print("Failed to execute this task: {}".format(new_task))
                r_total = 0.0
                continue

            # Execute the best policy
            print("Executing Best Policy...{} ({} steps)".format(
                best_policy, least_cost))
            task = Game(tester.get_task_params(curriculum.get_current_task()))
            new_task_u1 = new_task_rm.get_initial_state()
            s1, s1_features = task.get_state_and_features()
            r_total = 0
            curr_policy = None
            for t in range(int(least_cost)):
                if show_print:
                    task.render()

                # Pick up the next entry of the plan once the previous one
                # reached its target RM state (or on the very first step).
                if curr_policy is None:
                    curr_policy = best_policy.pop(0)
                    curr_policy_rm = reward_machines[curr_policy[0]]

                a = policy_bank.get_best_action(curr_policy[0], curr_policy[1],
                                                s1_features.reshape((1, num_features)),
                                                add_noise=False)
                if show_print:
                    print("Action:", Actions(a))
                task.execute_action(a)

                s2, s2_features = task.get_state_and_features()
                new_task_u2 = new_task_rm.get_next_state(
                    new_task_u1, task.get_true_propositions())

                # Where the active policy's RM went versus where the plan
                # expects it to go on the event stored in curr_policy[2].
                curr_policy_u2 = curr_policy_rm.get_next_state(
                    curr_policy[1], task.get_true_propositions())
                desired_next_state = curr_policy_rm.get_next_state(
                    curr_policy[1], curr_policy[2])
                if curr_policy_u2 == desired_next_state:
                    logger.info("EXECUTED ACTION {}, SWITCHING POLICIES".format(
                        curr_policy[2]))
                    curr_policy = None

                r = new_task_rm.get_reward(new_task_u1, new_task_u2, s1, a, s2)
                r_total += r * tester.learning_params.gamma**t

                s1, s1_features = s2, s2_features
                new_task_u1 = new_task_u2

            if show_print:
                task.render()
            print("Rewards:", r_total)
        finally:
            # Release the session's resources; nothing outlives this run.
            sess.close()

    return r_total
def get_qrm_generalization_performance(alg_name, tester, curriculum, num_times, new_tasks, show_print):
    """
    Testing all the tasks in new_tasks and return the success rate and
    cumulative reward

    The saved policy bank is restored once. For every new task the first
    linearized plan that can be solved is executed in a fresh game, and the
    discounted reward given by the new task's reward machine is recorded.
    A task counts as a success when that reward is positive.

    Args:
        alg_name: not used by this function.
        tester: provides task parameters, learning parameters and the
            reward machines the policy bank was trained on.
        curriculum: restarted once; its current task selects the game.
        num_times: not used by this function.
        new_tasks: sized collection of objects exposing `rm_file` and
            `get_linearized_plan()`.
        show_print: when true, render the game while executing.

    Returns:
        (success_rate, acc_reward): fraction of tasks with positive reward
        (0.0 for an empty `new_tasks`) and the sum of all task rewards.
    """
    sess = tf.Session()
    try:
        curriculum.restart()

        # Initialize a policy_bank graph to be loaded with saved model
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())
        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    tester.learning_params,
                                    tester.get_reward_machines())

        # Load the model
        saver = tf.train.Saver()

        # Get path
        if task_aux.params.game_type == "craftworld":
            save_model_path = '../model/' + str(
                task_aux.params.game_type) + '/' + task_aux.game.get_map_id()
        else:
            save_model_path = '../model/' + str(task_aux.params.game_type)

        saver.restore(sess, tf.train.latest_checkpoint(save_model_path))

        reward_machines = tester.get_reward_machines()
        print("Loaded {} policies (RMs)".format(len(reward_machines)))

        success_count = 0
        all_task_rewards = []

        for new_task in new_tasks:
            # partial-ordered RM of new task
            new_task_rm = RewardMachine(new_task.rm_file)
            linearized_plans = new_task.get_linearized_plan()
            print("There are {} possible linearized plans: {}".format(
                len(linearized_plans), linearized_plans))

            least_cost = float('inf')
            best_policy = []  # list of (rm_id, state_id) corresponding to each action
            for curr_plan in linearized_plans:
                # Get the least cost path for the current linearized plan
                cost, switching_seq = dfs_search_policy(curr_plan, tester, curriculum,
                                                        new_task_rm, reward_machines,
                                                        policy_bank, bound=least_cost)
                if cost < least_cost:
                    print(cost, switching_seq)
                    least_cost = cost
                    best_policy = switching_seq
                    # finding optimal takes too long, end early if find a solution
                    break

            # Couldn't solve the task
            if least_cost == np.inf:
                print("Failed to execute this task: {}".format(new_task))
                r_total = 0.0
                all_task_rewards.append(r_total)
                continue

            # Execute the best policy
            print("Executing Best Policy...{} ({} steps)".format(
                best_policy, least_cost))
            task = Game(tester.get_task_params(curriculum.get_current_task()))
            new_task_u1 = new_task_rm.get_initial_state()
            s1, s1_features = task.get_state_and_features()
            r_total = 0
            curr_policy = None
            for t in range(int(least_cost)):
                if show_print:
                    task.render()

                # Pick up the next entry of the plan once the previous one
                # reached its target RM state (or on the very first step).
                if curr_policy is None:
                    curr_policy = best_policy.pop(0)
                    curr_policy_rm = reward_machines[curr_policy[0]]

                a = policy_bank.get_best_action(curr_policy[0], curr_policy[1],
                                                s1_features.reshape((1, num_features)),
                                                add_noise=False)
                task.execute_action(a)

                s2, s2_features = task.get_state_and_features()
                new_task_u2 = new_task_rm.get_next_state(
                    new_task_u1, task.get_true_propositions())

                # Where the active policy's RM went versus where the plan
                # expects it to go on the event stored in curr_policy[2].
                curr_policy_u2 = curr_policy_rm.get_next_state(
                    curr_policy[1], task.get_true_propositions())
                desired_next_state = curr_policy_rm.get_next_state(
                    curr_policy[1], curr_policy[2])
                if curr_policy_u2 == desired_next_state:
                    logger.info("EXECUTED ACTION {}, SWITCHING POLICIES".format(
                        curr_policy[2]))
                    curr_policy = None

                r = new_task_rm.get_reward(new_task_u1, new_task_u2, s1, a, s2)
                r_total += r * tester.learning_params.gamma**t

                s1, s1_features = s2, s2_features
                new_task_u1 = new_task_u2

            if show_print:
                task.render()
            print("Rewards:", r_total)

            all_task_rewards.append(r_total)
            if r_total > 0:
                success_count += 1
    finally:
        # Release the session's resources; only plain numbers are returned.
        sess.close()

    # An empty task list would otherwise divide by zero.
    success_rate = float(success_count) / len(new_tasks) if len(new_tasks) else 0.0
    acc_reward = sum(all_task_rewards)
    print(all_task_rewards)
    return success_rate, acc_reward
def play():
    """
    Drive the water world by hand with the arrow keys in a pygame window.

    The task and game parameters come from the water_7 test file. Each frame
    one of the currently held directions is chosen at random (or
    Actions.none when nothing is held), the game is advanced by the elapsed
    wall-clock time, and the reward machine state is printed.
    """
    import pygame, time
    from reward_machines.reward_machine import RewardMachine
    from tester.tester import Tester
    from tester.tester_params import TestingParameters
    from qrm.learning_params import LearningParameters
    import os

    # hack: moving one directory up (to keep relative references to ./src)
    os.chdir("../")

    tester = Tester(LearningParameters(), TestingParameters(),
                    "../experiments/water/tests/water_7.txt")
    if tester is not None:
        task = tester.get_task_rms()[-2]
        params = tester.get_task_params(task).game_params
        max_x, max_y = params.max_x, params.max_y
    else:
        # Hand-written configuration used when no tester is available.
        task = "../experiments/water/reward_machines/t1.txt"
        state_file = "../experiments/water/maps/world_0.pkl"
        max_x = 400
        max_y = 400
        params = WaterWorldParams(state_file,
                                  b_radius=15,
                                  max_x=max_x,
                                  max_y=max_y,
                                  b_num_per_color=2,
                                  use_velocities=True,
                                  ball_disappear=False)

    game = WaterWorld(params)
    rm = RewardMachine(task)
    s1 = game.get_state()
    u1 = rm.get_initial_state()
    print("actions", game.get_actions())

    pygame.init()

    white = (255, 255, 255)
    colors = get_colors()

    gameDisplay = pygame.display.set_mode((max_x, max_y))
    pygame.display.set_caption('Water world :)')
    clock = pygame.time.Clock()

    # Arrow key -> action it toggles while held down
    key_to_action = {
        pygame.K_LEFT: Actions.left,
        pygame.K_RIGHT: Actions.right,
        pygame.K_UP: Actions.up,
        pygame.K_DOWN: Actions.down,
    }

    held = set()
    running = True
    t_previous = time.time()
    while running:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                running = False
            elif event.type == pygame.KEYUP:
                released = key_to_action.get(event.key)
                if released is not None:
                    held.discard(released)
            elif event.type == pygame.KEYDOWN:
                pressed = key_to_action.get(event.key)
                if pressed is not None:
                    held.add(pressed)

        t_current = time.time()
        t_delta = t_current - t_previous

        # Getting the action
        a = random.choice(list(held)) if held else Actions.none

        # Executing the action
        game.execute_action(a.value, t_delta)

        s2 = game.get_state()
        events = game.get_true_propositions()
        u2 = rm.get_next_state(u1, events)
        reward = rm.get_reward(u1, u2, s1, a, s2)

        # printing image
        gameDisplay.fill(white)
        for ball in game.balls:
            draw_ball(ball, colors, 0, gameDisplay, pygame, max_y)
        draw_ball(game.agent, colors, 3, gameDisplay, pygame, max_y)
        pygame.display.update()
        clock.tick(20)

        # print info related to the task
        if reward > 0:
            print("REWARD!! ----------------!------------!")
        if rm.is_terminal_state(u2):
            print("Machine state:", u2, "(terminal)")
        else:
            print("Machine state:", u2)

        t_previous = t_current
        s1, u1 = s2, u2

    pygame.quit()
def play():
    """
    Drive the mouse world by hand in a pygame window.

    Arrow keys move and the space bar jumps. The task and game parameters
    come from the mouse_0 test file. Each frame one of the currently held
    actions is chosen at random (or Actions.none when nothing is held), the
    game is advanced by the elapsed wall-clock time, and the reward machine
    state is printed.
    """
    from tester.tester import Tester
    from tester.tester_params import TestingParameters
    from qrm.learning_params import LearningParameters
    from reward_machines.reward_machine import RewardMachine
    import os

    os.chdir("../")

    tester = Tester(LearningParameters(), TestingParameters(),
                    "../experiments/mouse/tests/mouse_0.txt")
    task = tester.get_task_rms()[1]
    params = tester.get_task_params(task).game_params
    max_x, max_y = params.max_x, params.max_y

    game = MouseWorld(params)
    rm = RewardMachine(task)
    s1 = game.get_state()
    u1 = rm.get_initial_state()

    pygame.init()
    gameDisplay = pygame.display.set_mode((max_x, max_y))
    pygame.display.set_caption('Fake Keyboard')
    clock = pygame.time.Clock()

    # Physical key -> action it toggles while held down
    key_to_action = {
        pygame.K_LEFT: Actions.left,
        pygame.K_RIGHT: Actions.right,
        pygame.K_UP: Actions.up,
        pygame.K_DOWN: Actions.down,
        pygame.K_SPACE: Actions.jump,
    }

    held = set()
    running = True
    t_previous = time.time()
    while running:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                running = False
            elif event.type == pygame.KEYUP:
                released = key_to_action.get(event.key)
                if released is not None:
                    held.discard(released)
            elif event.type == pygame.KEYDOWN:
                pressed = key_to_action.get(event.key)
                if pressed is not None:
                    held.add(pressed)

        t_current = time.time()
        t_delta = t_current - t_previous

        a = random.choice(list(held)) if held else Actions.none

        # Executing the action
        game.execute_action(a.value, t_delta)

        s2 = game.get_state()
        events = game.get_true_propositions()
        u2 = rm.get_next_state(u1, events)
        reward = rm.get_reward(u1, u2, s1, a, s2)

        if reward > 0:
            print("REWARD ", reward)
        if rm.is_terminal_state(u2):
            print("Machine state:", u2, "(terminal)")
        else:
            print("Machine state:", u2)

        # Printing Image
        gameDisplay.fill(Colors.WHITE.value)
        for key in game.keyboard_keys:
            key.draw_on_display(gameDisplay)
        game.agent.draw_on_display(gameDisplay)
        game.draw_current_text_on_display(gameDisplay)
        pygame.display.update()
        clock.tick(20)

        t_previous = t_current
        s1, u1 = s2, u2

    pygame.quit()