def test(self, q_values):
    # Load environment.
    loader = EnvironmentLoader("environments/maps")
    env = loader.load_map(self.map)

    # Set up agent. alpha=0 and discount=0 disable learning, so the agent
    # only evaluates the pre-trained Q-values.
    agent = QLearningAgent(alpha=0,
                           epsilon_policy=self.epsilon_policy,
                           discount=0,
                           action_space=env.action_space,
                           state_space=env.state_space,
                           q_values=q_values)

    # Reset everything.
    agent.reset()
    env.reset()

    # Generate epsilon values (the policy runs infinitely long).
    for epsilon in self.epsilon_policy:
        # Set epsilon for the current epoch.
        agent.epsilon = epsilon
        env.reset_position()
        done = False
        pygame.display.set_caption('KI-Labor GridWorld - Test')
        while not done:
            env.renderer.update_info(self.hyperparameters, epsilon)
            event = self.event_occured(
                timeout_ms=RenderSettings.TIME_BETWEEN_FRAMES,
                renderer=env.renderer)
            env.render(q_values)
            done = self.step(agent, env)
            if done and event is None:
                event = self.event_occured(
                    timeout_ms=RenderSettings.TIME_BETWEEN_FRAMES,
                    renderer=env.renderer)
                env.render(q_values, True)
            # Handle events.
            if event == "reset":
                return
            elif event == "skip":
                break
            elif event == "pause":
                while self.event_occured(renderer=env.renderer) != "pause":
                    continue
def main():
    # Set log level.
    logging.basicConfig(level=logging.DEBUG)

    # Set a random seed for consistency in agent AND environment.
    if config.RANDOM_SEED is not None:
        np.random.seed(config.RANDOM_SEED)

    # Make environment.
    env = EnvCatcher(grid_size=config.GRID_SIZE,
                     env_type='episodic',
                     verbose=False,
                     random_seed=config.RANDOM_SEED)

    # Make agent.
    agent = QLearningAgent(actions=list(range(env.action_space)),
                           learning_rate=config.LEARNING_RATE,
                           discount_factor=config.DISCOUNT_FACTOR,
                           epsilon=config.EPSILON)

    # Run the RL loop.
    episode_returns = rl.run_loop(agent=agent,
                                  env=env,
                                  max_num_steps=config.MAX_NUM_STEPS,
                                  report_every_n=config.REPORT_EVERY_N)

    # Save the data.
    date_string = time.strftime("%Y%m%d-%H%M%S")
    filename = ('qlearn_grid_{}_nep_{}_'.format(
        config.GRID_SIZE, len(episode_returns)) + date_string + '.csv')
    rl.save_episode_returns(filename=filename,
                            episode_returns=episode_returns)
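# A minimal sketch of the episodic loop that rl.run_loop could implement. This
# is an illustration, not the actual rl module: the gym-style env API
# (reset/step returning a 4-tuple) and the agent method names act/learn are
# assumptions.
def run_loop(agent, env, max_num_steps, report_every_n):
    episode_returns = []
    episode_return = 0.0
    state = env.reset()
    for _ in range(max_num_steps):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.learn(state, action, reward, next_state)
        episode_return += reward
        state = next_state
        if done:
            episode_returns.append(episode_return)
            if len(episode_returns) % report_every_n == 0:
                print('episode {}: return {:.2f}'.format(
                    len(episode_returns), episode_return))
            episode_return = 0.0
            state = env.reset()
    return episode_returns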
def generate_graph(loop_no=None):
    wall_1, wall_2, key_1, key_2, door_1, door_2 = generate_room(loop_no)
    G = nx.DiGraph()

    for i in range(100):
        if i < 50:
            # First half of rollouts: random start position.
            rand_x = np.random.randint(1, 3)
            rand_y = np.random.randint(5, 7)
            start = (rand_x, rand_y)
            env_name = 'gym_minigrid.envs:MiniGrid-MainRoom-v1'
            env = gym.make(env_name,
                           agent_start=start,
                           key_pos=[key_1, key_2],
                           wall_pos=[wall_1, wall_2],
                           door_pos=[door_1, door_2])
        else:
            # Second half of rollouts: start with the first key carried and
            # the first door open. Avoid starting on top of key_2.
            start_pos = (6, 2)
            if key_2 != (6, 1):
                start_pos = (6, 1)
            env = gym.make('MiniGrid-MainRoom-v1',
                           agent_start=start_pos,
                           keys_carried=[1, 0],
                           door_state=[1, 0],
                           key_pos=[key_1, key_2],
                           wall_pos=[wall_1, wall_2],
                           door_pos=[door_1, door_2])

        number_of_actions = env.action_space.n
        agent = QLearningAgent(number_of_actions, gamma=0.9, alpha=0.12,
                               epsilon=0.1)
        state = env.reset()
        # Random rollout of up to 1000 steps.
        for t in range(1000):
            action = np.random.randint(0, number_of_actions)
            next_state, reward, done, _ = env.step(action)
            agent.update(state, action, next_state, reward)
            if done:
                # Terminal transitions are not added to the graph.
                break
            G.add_edge(state, next_state, action=action)
            state = next_state
    return G, number_of_actions
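# Usage sketch for generate_graph: build the random-rollout transition graph
# and inspect its size. Purely illustrative; assumes generate_room and the
# custom MiniGrid-MainRoom-v1 environment are registered as above.
G, n_actions = generate_graph(loop_no=0)
print('{} states, {} transitions, {} primitive actions'.format(
    G.number_of_nodes(), G.number_of_edges(), n_actions))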
def test_cube(max_episode, max_step):
    env = TreasureCube(max_step=max_step)
    agent = QLearningAgent()
    episode_rewards = []

    for episode_num in range(0, max_episode):
        state = env.reset()
        terminate = False
        t = 0
        episode_reward = 0
        while not terminate:
            action = agent.take_action(state)
            reward, terminate, next_state = env.step(action)
            episode_reward += reward
            # You can uncomment the following two lines if you want per-step output.
            # env.render()
            # print(f'step: {t}, action: {action}, reward: {reward}')
            t += 1
            agent.train(state, action, next_state, reward)
            state = next_state
        print(f'episode: {episode_num}, total_steps: {t}, episode reward: {episode_reward}')
        episode_rewards.append(episode_reward)

    print(agent.getQTable())
    return showPlot(list(range(max_episode)), episode_rewards, 'episode',
                    'episode rewards')
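# A minimal tabular agent sketch matching the take_action/train/getQTable
# interface that test_cube expects. The action names and hyperparameter
# defaults here are illustrative assumptions, not the actual QLearningAgent
# used above.
import random
from collections import defaultdict

class QLearningAgent:
    def __init__(self, actions=('left', 'right', 'forward', 'backward'),
                 alpha=0.5, gamma=0.99, epsilon=0.05):
        self.actions = list(actions)
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount factor
        self.epsilon = epsilon  # exploration rate
        self.q_table = defaultdict(lambda: {a: 0.0 for a in self.actions})

    def take_action(self, state):
        # Epsilon-greedy selection over the current Q-values.
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        return max(self.q_table[state], key=self.q_table[state].get)

    def train(self, state, action, next_state, reward):
        # One-step Q-learning update:
        # Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)).
        best_next = max(self.q_table[next_state].values())
        td_error = reward + self.gamma * best_next - self.q_table[state][action]
        self.q_table[state][action] += self.alpha * td_error

    def getQTable(self):
        return {state: dict(q_values) for state, q_values in self.q_table.items()}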
def run_q_learning(env, num_episodes, gamma, alpha, epsilon):
    agent = QLearningAgent(env.action_space.n, gamma=gamma, alpha=alpha,
                           epsilon=epsilon)
    stats = {
        'episode_lengths': np.zeros(num_episodes),
        'episode_rewards': np.zeros(num_episodes)
    }

    for i_episode in range(num_episodes):
        if (i_episode + 1) % 20 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")
            sys.stdout.flush()

        state = env.reset()
        done = False
        t = 0
        while not done:
            action = agent.step(state)
            next_state, reward, done, _ = env.step(action)

            # Update statistics.
            stats['episode_rewards'][i_episode] += reward
            stats['episode_lengths'][i_episode] = t

            agent.update(state, action, next_state, reward)
            t += 1
            state = next_state

    print()
    return agent, stats
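# Usage sketch for run_q_learning, assuming an old-style gym environment whose
# step() returns a 4-tuple, as the loop above expects. Taxi-v3 is an
# illustrative stand-in, not the environment used elsewhere in this file.
import gym
env = gym.make('Taxi-v3')
agent, stats = run_q_learning(env, num_episodes=500, gamma=0.99, alpha=0.1,
                              epsilon=0.1)
print('mean return over the last 100 episodes: {:.2f}'.format(
    stats['episode_rewards'][-100:].mean()))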
def setUp(self):
    # Set a random seed for consistency in agent AND environment.
    if RANDOM_SEED is not None:
        np.random.seed(RANDOM_SEED)

    # Make environment.
    self.env = EnvCatcher(grid_size=GRID_SIZE,
                          env_type='episodic',
                          verbose=False,
                          random_seed=RANDOM_SEED)

    # Make agent.
    self.agent = QLearningAgent(actions=list(range(self.env.action_space)),
                                learning_rate=LEARNING_RATE,
                                discount_factor=DISCOUNT_FACTOR,
                                epsilon=EPSILON)
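# A hedged sketch of a test that could follow this fixture: roll the agent
# through a bounded number of steps and check it only emits valid actions.
# The agent method name act and the 4-tuple step() return are assumptions
# about the QLearningAgent/EnvCatcher interfaces.
def test_agent_emits_valid_actions(self):
    state = self.env.reset()
    done = False
    steps = 0
    while not done and steps < 100:
        action = self.agent.act(state)
        self.assertIn(action, range(self.env.action_space))
        state, reward, done, _ = self.env.step(action)
        steps += 1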
def train(self):
    # Load environment.
    loader = EnvironmentLoader("environments/maps")
    env = loader.load_map(self.map)

    # Set up agent.
    agent = QLearningAgent(self.hyperparameters["alpha"],
                           self.epsilon_policy,
                           self.hyperparameters["discount"],
                           env.action_space,
                           env.state_space)

    # Reset everything.
    epoch_counter = 0
    agent.reset()
    env.reset()

    # Generate epsilon values (the policy runs infinitely long).
    for epsilon in self.epsilon_policy:
        # Set epsilon for the current epoch.
        agent.epsilon = epsilon
        env.reset_position()
        done = False
        while not done:
            render_current_epoch = (RenderSettings.ENABLED
                                    and epoch_counter % RenderSettings.INTERVAL == 0)
            save_current_epoch = (SaveSettings.ENABLED
                                  and epoch_counter % SaveSettings.INTERVAL == 0)
            if epoch_counter % RenderSettings.UPDATE_FREQ_TITLE == 0 or render_current_epoch:
                # Takes around 0.1 ms on average.
                pygame.display.set_caption(
                    f'KI-Labor GridWorld - Epoch {epoch_counter}')
                env.renderer.update_info(self.hyperparameters, epsilon)
            if render_current_epoch:
                event = self.event_occured(
                    timeout_ms=RenderSettings.TIME_BETWEEN_FRAMES,
                    renderer=env.renderer)
            else:
                event = self.event_occured(renderer=env.renderer)
            if render_current_epoch:
                env.render(agent.get_q__values())
            if save_current_epoch and self.save_name is not None:
                AgentManager.save_agent_state(
                    agent,
                    f"{SaveSettings.SAVE_PATH}/{self.save_name}_{epoch_counter}.txt")
            done = self.step(agent, env)
            if done and render_current_epoch and event is None:
                event = self.event_occured(
                    timeout_ms=RenderSettings.TIME_BETWEEN_FRAMES,
                    renderer=env.renderer)
                env.render(agent.get_q__values(), True)
            # Handle events.
            if event == "reset":
                return
            elif event == "skip":
                break
            elif event == "pause":
                while self.event_occured(renderer=env.renderer) != "pause":
                    continue
        epoch_counter += 1
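# train() and test() both iterate self.epsilon_policy as an endless stream of
# epsilon values, one per epoch. A minimal sketch of such a policy, assuming
# exponential decay toward a floor; the name and parameters are illustrative,
# not the project's actual epsilon policy.
def exponential_epsilon_policy(start=1.0, decay=0.999, floor=0.05):
    epsilon = start
    while True:  # runs infinitely long, as the training loop expects
        yield max(epsilon, floor)
        epsilon *= decay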
def run_q_learning(num_episodes,
                   max_eps_length,
                   env,
                   dom_no,
                   loop_no,
                   run_no,
                   with_options=False,
                   factored=False):
    if with_options:
        print('with skills')
        if factored:
            skills_path = (RESULTS_PATH + 'generated_options/' + str(dom_no) +
                           '/' + str(loop_no) + '/' + str(run_no) +
                           '_factored_skills.pickle')
        else:
            skills_path = (RESULTS_PATH + 'generated_options/' + str(dom_no) +
                           '/generated_skills.pickle')
        with open(skills_path, "rb") as skills_file:
            skills = pickle.load(skills_file)
        number_of_actions = env.action_space.n + len(skills)
    else:
        number_of_actions = env.action_space.n

    agent = QLearningAgent(number_of_actions, gamma=0.9, alpha=0.12, epsilon=0.1)
    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                         episode_rewards=np.zeros(num_episodes))
    input_list = []
    output_list = []

    for i_episode in range(num_episodes):
        if (i_episode + 1) % 20 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")
            sys.stdout.flush()

        state = env.reset()
        for t in range(max_eps_length):
            action = agent.step(state)
            if action >= env.action_space.n:
                # The action is an option (skill), not a primitive action.
                option_input = state + (action - env.action_space.n, )
                input_list.append(list(option_input))
                option = skills[action - env.action_space.n]
                if factored:
                    state_in = option.factor_state(state, global_set)
                else:
                    state_in = state
                if option.in_initialisation_set(state_in):
                    next_state, reward, done, _, total_steps = execute_option(
                        25, env, option, state_in, factored=factored, state_uf=state)
                    stats.episode_lengths[i_episode] = t + total_steps - 1
                else:
                    # Option not applicable here: penalise and stay in place.
                    next_state = state
                    reward = -1
                    done = False
                output_list.append(list(next_state))
            else:
                next_state, reward, done, _ = env.step(action)
                stats.episode_lengths[i_episode] = t

            # Update statistics.
            stats.episode_rewards[i_episode] += reward
            agent.update(state, action, next_state, reward)
            if done:
                break
            state = next_state

    # Copy the column names so appending 'Action' does not mutate global_set.
    cols = list(global_set)
    df_out = pd.DataFrame.from_records(output_list, columns=cols)
    cols.append('Action')
    df_in = pd.DataFrame.from_records(input_list, columns=cols)

    data_path = './data/' + str(loop_no)
    if not os.path.isdir(data_path):
        os.makedirs(data_path)
    if run_no > 0:
        print("Should not reach here: run_no > 0")
        df_in2 = pd.read_csv(data_path + '/input_data_task_' + str(run_no - 1) +
                             '.csv', index_col=False)
        df_out2 = pd.read_csv(data_path + '/output_data_task_' + str(run_no - 1) +
                              '.csv', index_col=False)
        # DataFrame.append is deprecated; concatenate instead.
        df_in = pd.concat([df_in, df_in2], ignore_index=True, sort=False)
        df_out = pd.concat([df_out, df_out2], ignore_index=True, sort=False)
    df_in.drop(df_in.filter(regex='Unname'), axis=1, inplace=True)
    df_out.drop(df_out.filter(regex='Unname'), axis=1, inplace=True)
    df_in.to_csv(data_path + '/input_data_task_' + str(run_no) + '.csv')
    df_out.to_csv(data_path + '/output_data_task_' + str(run_no) + '.csv')
    return agent, stats
def run_config(self, configuration):
    env, problem_size, num_models, num_episodes, num_steps, policy = configuration
    policies = PolicyCollection.get_batch(policy)

    # Create a new graph.
    graph = tf.Graph()
    with graph.as_default():
        # And a session configuration as well.
        tf_config = tf.ConfigProto(log_device_placement=True)
        tf_config.intra_op_parallelism_threads = 8
        tf_config.inter_op_parallelism_threads = 8
        tf_config.gpu_options.allow_growth = True

        with tf.Session(graph=graph, config=tf_config) as sess:
            env = env("test", [num_models], problem_size)
            state_space = env.state_space
            action_space = env.action_space

            # --------------------- Determine the optimal reward --------------------

            # Determine the agent count.
            num_policies = len(policies)
            optimal_ih_rew, minimal_ih_rew, min_q, max_q, _ = env.get_optimal(num_steps, 0.99)

            # ------------------------------------------------------------------------

            # Iterate over all policies and create an agent using that specific policy.
            agents = list()
            environments = list()
            densities = list()
            get_best_shared = list()
            shared_steps = list()
            for pol_num in range(num_policies):
                # Get the policy entry and a unique name.
                pe = policies[pol_num]
                unique_name = str(pol_num)

                # Extract important fields.
                policy = pe[1]
                policy_config = pe[2]
                policy_config['num_models'] = num_models
                policy_config['min_q'] = min_q
                policy_config['max_q'] = max_q
                policy_config['action_space'] = action_space

                current_env = env.clone(unique_name)
                environments.append(current_env)
                agent = QLearningAgent(sess, unique_name, current_env, policy,
                                       policy_config)
                agents.append(agent)
                shared_steps.append(policy_config['shared_steps'])
                densities.append([agent.ref_complete_densities])
                if 'shared_learning' in policy_config and policy_config['shared_learning']:
                    get_best_shared.append(agent.get_best_heads)

            # Init variables.
            init = tf.global_variables_initializer()
            sess.run(init)
            feed_dict = {}
            for agent in agents:
                feed_dict[agent.use_best] = True

            # Retrieve the learn operations.
            update_and_receive_rewards = [agent.q_tensor_update for agent in agents]
            reset_ops = [envs.reset_op for envs in environments]
            cum_rew_ops = [envs.cum_rewards for envs in environments]

            # Create training rewards.
            tr_rewards = np.zeros((num_episodes, num_policies, num_models))

            # Iterate over episodes.
            for episode in range(num_episodes):
                # Reset all environments.
                sess.run(reset_ops)

                # For each agent sample a new head.
                state_dict = {}
                for k in range(num_policies):
                    agents[k].sample_head()
                    state_dict[agents[k].use_best] = False

                # Repeat this for the number of steps.
                for k in range(num_steps):
                    for m in range(num_policies):
                        # Trigger shared-head selection at each policy's own
                        # interval (index by policy m, not step k).
                        if shared_steps[m] > 0 and k % shared_steps[m] == 0:
                            sess.run(agents[m].get_best_heads)

                    # Receive rewards and add.
                    sess.run(update_and_receive_rewards, feed_dict=state_dict)

                # Copy values.
                tr_rewards[episode, :, :] = sess.run(cum_rew_ops)

            return tr_rewards
# Get the policy entry and a unique name.
pe = policies[pol_num]
unique_name = str(pol_num)

# Extract important fields.
policy = pe[1]
policy_config = pe[2]
policy_config['num_models'] = num_models
policy_config['min_q'] = min_q
policy_config['max_q'] = max_q
policy_config['action_space'] = action_space

current_env = env.clone(unique_name)
environments.append(current_env)
agent = QLearningAgent(sess, unique_name, current_env, policy, policy_config)
agents.append(agent)

if plot_models > 0 and pol_num in record_indices:
    # Set up densities.
    if 'pseudo_count_type' in policy_config and policy_config['pseudo_count_type']:
        num_densities = 2
        densities.append([
            agent.cb_complete_densities,
            agent.ref_complete_densities
        ])
    else:
        num_densities = 1