def main(local_dir):
    """Main loop based on `rllib.examples.saving_experiences`."""
    # pylint: disable=too-many-locals
    batch_builder = SampleBatchBuilder()
    writer = JsonWriter(local_dir)

    env = _industrial_benchmark_maker({"max_episode_steps": 1000})
    policy = IBBehaviorPolicy(env.observation_space, env.action_space, {})

    for eps_id in trange(100):
        obs = env.reset()
        prev_action = np.zeros_like(env.action_space.sample())
        prev_reward = 0
        done = False
        time = 0
        while not done:
            action, _, _ = policy.compute_single_action(obs, [])
            new_obs, rew, done, info = env.step(action)
            batch_builder.add_values(
                t=time,
                eps_id=eps_id,
                agent_index=0,
                obs=obs,
                actions=action,
                action_prob=1.0,  # put the true action probability here
                rewards=rew,
                prev_actions=prev_action,
                prev_rewards=prev_reward,
                dones=done,
                infos=info,
                new_obs=new_obs,
            )
            obs = new_obs
            prev_action = action
            prev_reward = rew
            time += 1
        writer.write(batch_builder.build_and_reset())

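# A hedged sanity check for the output written by main() above: read the
# experiences back with RLlib's JsonReader (it lives alongside JsonWriter in
# ray.rllib.offline) and print the columns that SampleBatchBuilder recorded.
# The helper name `inspect_output` is illustrative, not part of the original.
from ray.rllib.offline.json_reader import JsonReader


def inspect_output(local_dir, num_batches=1):
    reader = JsonReader(local_dir)
    for _ in range(num_batches):
        batch = reader.next()  # returns a SampleBatch
        print("columns:", list(batch.keys()))
        print("timesteps in this batch:", batch.count)

# e.g. call inspect_output(local_dir) after main(local_dir) has finished.
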
def action_space_sample(self):
    dist = np.random.randint(10)
    change_topics = np.random.choice(N_TOPICS, dist, replace=False)
    action = self.observation[len(CONTEXT_ATTRIBUTES):].copy()
    for i in change_topics:
        action[i] = 0 if action[i] else 1
    return action


if __name__ == "__main__":
    args = parser.parse_args()
    batch_builder = SampleBatchBuilder()
    output_path = os.path.join(ray.utils.get_user_temp_dir(), "demo-out")
    writer = JsonWriter(output_path)
    print("OUTPUT IN {}".format(output_path))

    env = NewsWorld()
    prep = get_preprocessor(env.observation_space)(env.observation_space)
    print("The preprocessor is", prep)

    for eps_id in range(args.stop):
        obs = env.reset()
        prev_action = np.zeros_like(env.action_space_sample())
        prev_reward = 0
        done = False
        t = 0
        while not done:
            action = env.action_space_sample()

import gym
import numpy as np

from ray.rllib.models.preprocessors import get_preprocessor
from ray.rllib.evaluation.sample_batch_builder import SampleBatchBuilder
from ray.rllib.offline.json_writer import JsonWriter

if __name__ == "__main__":
    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter("/tmp/demo-out")

    # You normally wouldn't want to manually create sample batches if a
    # simulator is available, but let's do it anyways for example purposes:
    env = gym.make("Acrobot-v1")

    # RLlib uses preprocessors to implement transforms such as one-hot encoding
    # and flattening of tuple and dict observations. For Acrobot a no-op
    # preprocessor is used, but this may be relevant for more complex envs.
    prep = get_preprocessor(env.observation_space)(env.observation_space)
    print("The preprocessor is", prep)

    for eps_id in range(20):
        obs = env.reset()
        prev_action = np.zeros_like(env.action_space.sample())
        prev_reward = 0
        done = False
        t = 0
        while not done:
            # import ipdb; ipdb.set_trace()  # leftover debug breakpoint, disabled
            action = env.action_space.sample()

import pickle

import numpy as np

from ray.rllib.evaluation.sample_batch_builder import SampleBatchBuilder
from ray.rllib.offline.json_writer import JsonWriter

batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
writer = JsonWriter("")

actions = [
    'strength_1', 'strength_2', 'flexibility_1', 'flexibility_2', 'rest'
]
action_to_int_converter = {actions[i]: i for i in range(len(actions))}

#
with open('episode_actions.pkl', 'rb') as file:
    episode_actions = pickle.load(file)
with open('episode_states.pkl', 'rb') as file:
    episode_states = pickle.load(file)

n_episodes = len(episode_actions)
states0 = episode_states[0]
actions0 = episode_actions[0]
# import ipdb; ipdb.set_trace()

# save in batches
for episode_idx in range(n_episodes):
    obs = np.array([0, 0, 0])
    prev_action = None
    prev_reward = None
    done = False
    n_actions = len(episode_actions[episode_idx])

"""Simple example of writing experiences to a file using JsonWriter.""" # __sphinx_doc_begin__ import gym import numpy as np import os import ray.utils from ray.rllib.models.preprocessors import get_preprocessor from ray.rllib.evaluation.sample_batch_builder import SampleBatchBuilder from ray.rllib.offline.json_writer import JsonWriter if __name__ == "__main__": batch_builder = SampleBatchBuilder() # or MultiAgentSampleBatchBuilder writer = JsonWriter(os.path.join(ray.utils.get_user_temp_dir(), "demo-out")) # You normally wouldn't want to manually create sample batches if a # simulator is available, but let's do it anyways for example purposes: env = gym.make("CartPole-v0") # RLlib uses preprocessors to implement transforms such as one-hot encoding # and flattening of tuple and dict observations. For CartPole a no-op # preprocessor is used, but this may be relevant for more complex envs. prep = get_preprocessor(env.observation_space)(env.observation_space) print("The preprocessor is", prep) for eps_id in range(100): obs = env.reset() prev_action = np.zeros_like(env.action_space.sample()) prev_reward = 0
    'R': np.diag(([arcsec2rad**2] * 2 + [1e3**2])),
    'alpha': 0.0001,
    'beta': 2.,
    'kappa': 3 - 6,
    'fx': fx,
    'hx': hx,
    'mean_z': mean_z,
    'residual_z': residual_z,
    'msqrt': robust_cholesky,
    'orbits': sample_orbits,
}

if __name__ == "__main__":
    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter(
        os.path.join(ray.utils.get_user_temp_dir(),
                     "agent_visible_random_10RSOs-out"))

    # You normally wouldn't want to manually create sample batches if a
    # simulator is available, but let's do it anyways for example purposes:
    env = gym.make('ssa_tasker_simple-v2', **{'config': env_config})

    # RLlib uses preprocessors to implement transforms such as one-hot encoding
    # and flattening of tuple and dict observations. For CartPole a no-op
    # preprocessor is used, but this may be relevant for more complex envs.
    prep = get_preprocessor(env.observation_space)(env.observation_space)
    print("The preprocessor is", prep)

    for eps_id in tqdm(range(2084)):
        # obs = env.reset()
        prev_action = np.zeros_like(env.action_space.sample())

ray.init()
cls = get_agent_class(args.run)
agent = cls(env=args.env)
agent.restore(args.checkpoint)

# TODO: Refactor this...
env = agent.workers.local_worker().env
if args.target_env:
    env_creator = _global_registry.get(ENV_CREATOR, args.target_env)
    target_env = env_creator(env.config)
else:
    target_env = None

batch_builder = SampleBatchBuilder()
writer = JsonWriter(args.out)

for states_path in args.states:
    with open(states_path, 'r') as states_file:
        states = parse_states_file(states_file, env, args.skip_count)
    trajectory = annotated_trajectory(states, agent, env, target_env)

    prev_action = None
    for (t, (obs, action, new_obs)) in enumerate(trajectory):
        batch_builder.add_values(
            t=t,
            eps_id=0,
            agent_index=0,
            obs=obs,
            actions=action,
            action_prob=1.0,  # TODO: put the true action probability here
            rewards=0,

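# A hedged way to fill in the TODO above ("put the true action probability
# here"): ask the restored agent's policy for the log-likelihood of the
# recorded action. This assumes an RLlib version (roughly >= 0.8.5) where
# Policy.compute_log_likelihoods() exists and that `obs` is already in the
# policy's preprocessed format; neither is guaranteed by the snippet above,
# and `recorded_action_prob` is an illustrative helper, not original code.
import numpy as np


def recorded_action_prob(agent, obs, action):
    """Probability the restored agent assigns to a recorded (obs, action) pair."""
    policy = agent.get_policy()
    logp = policy.compute_log_likelihoods(
        actions=np.asarray([action]),
        obs_batch=np.asarray([obs]),
    )[0]
    return float(np.exp(logp))

# The result could replace the hard-coded `action_prob=1.0` in add_values().
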
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

"""Simple example of writing experiences to a file using JsonWriter."""

# __sphinx_doc_begin__
import gym
import numpy as np

from ray.rllib.evaluation.sample_batch_builder import SampleBatchBuilder
from ray.rllib.offline.json_writer import JsonWriter

if __name__ == "__main__":
    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter("/tmp/demo-out")

    # You normally wouldn't want to manually create sample batches if a
    # simulator is available, but let's do it anyways for example purposes:
    env = gym.make("CartPole-v0")

    for eps_id in range(100):
        obs = env.reset()
        prev_action = np.zeros_like(env.action_space.sample())
        prev_reward = 0
        done = False
        t = 0
        while not done:
            action = env.action_space.sample()
            new_obs, rew, done, info = env.step(action)
            batch_builder.add_values(
                t=t,

def main():
    args = parser.parse_args()
    if args.save_path is None:
        save_path = os.path.join(args.data_path, 'rllib')
    else:
        save_path = args.save_path
    if args.env is None:
        env_list = []
        for env_spec in minerl.herobraine.envs.obfuscated_envs:
            env_list.append(env_spec.name)
    else:
        env_list = [args.env]
    register()
    for env_name in env_list:
        env = gym.make(env_name)
        env = env.MineRLObservationWrapper(env.MineRLActionWrapper(env))
        batch_builder = SampleBatchBuilder()
        writer = JsonWriter(os.path.join(save_path, env_name))
        prep = get_preprocessor(env.observation_space)(env.observation_space)
        env.close()
        data = minerl.data.make(env_name, data_dir=args.data_path)
        for trajectory_name in data.get_trajectory_names():
            t = 0
            prev_action = None
            prev_reward = 0
            done = False
            obs = None
            info = None
            for obs, action, reward, next_obs, done in data.load_data(
                    trajectory_name):
                obs = (obs['pov'], obs['vector'])
                next_obs = (next_obs['pov'], next_obs['vector'])
                action = action['vector']
                if prev_action is None:
                    prev_action = np.zeros_like(action)
                batch_builder.add_values(
                    t=t,
                    eps_id=trajectory_name,
                    agent_index=0,
                    obs=prep.transform(obs),
                    actions=action,
                    action_prob=1.0,  # put the true action probability here
                    rewards=reward,
                    prev_actions=prev_action,
                    prev_rewards=prev_reward,
                    dones=done,
                    infos=info,
                    new_obs=prep.transform(next_obs))
                prev_action = action
                prev_reward = reward
                t += 1
            writer.write(batch_builder.build_and_reset())

def write_jsons(
    environment,
    data_dir,
    env_config,
    save_path,
    overwrite=False,
    fail_safe=True,
    **kwargs,
):
    data_pipeline = minerl.data.make(environment, data_dir, **kwargs)
    env = MinerRLDataEnv(data_pipeline)
    env = wrap_env(env, env_config)

    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if len(os.listdir(save_path)) != 0:
        abs_save_path = os.path.abspath(save_path)
        if overwrite:
            print(f"Overwriting! {abs_save_path}")
            shutil.rmtree(abs_save_path)
        else:
            if fail_safe:
                print(f"Json data already exists at {abs_save_path}")
                return
            else:
                raise ValueError(
                    f"Directory {abs_save_path} not empty! "
                    f"Cannot overwrite existing data automatically, "
                    f"please delete old data if unused."
                )

    batch_builder = SampleBatchBuilder()
    writer = JsonWriter(save_path)
    prep = get_preprocessor(env.observation_space)(env.observation_space)

    for eps_id, trajectory_name in enumerate(env.trajectory_names):
        t = 0
        prev_action = None
        prev_reward = 0
        done = False
        try:
            obs = env.reset()
        except TypeError:
            continue
        while not done:
            new_obs, reward, done, info = env.step(env.action_space.sample())
            action = info["action"]
            action = env.reverse_action(action)
            if prev_action is None:
                prev_action = np.zeros_like(action)
            batch_builder.add_values(
                t=t,
                eps_id=eps_id,
                agent_index=0,
                obs=prep.transform(obs),
                actions=action,
                action_prob=1.0,  # put the true action probability here
                rewards=reward,
                prev_actions=prev_action,
                prev_rewards=prev_reward,
                dones=done,
                infos={"trajectory_name": trajectory_name},
                new_obs=prep.transform(new_obs),
            )
            obs = new_obs
            prev_action = action
            prev_reward = reward
            t += 1
        writer.write(batch_builder.build_and_reset())

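# A possible invocation of write_jsons() above. The environment id, data
# directory, and env_config contents are placeholders/assumptions -- adjust
# them to match the MineRL dataset that was actually downloaded locally.
write_jsons(
    environment="MineRLObtainDiamondVectorObf-v0",  # any MineRL env with downloaded data
    data_dir="/data/minerl",                        # placeholder for MINERL_DATA_ROOT
    env_config={},                                  # whatever wrap_env() expects
    save_path="/data/minerl_json/MineRLObtainDiamondVectorObf-v0",
    overwrite=False,
    fail_safe=True,
)
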
            return False

        if done:
            break
        if human_wants_restart:
            break
        while human_sets_pause:
            env.wrapped.render()
            time.sleep(0.1)
        time.sleep(0.1)

    print("timesteps %i reward %0.2f" % (total_timesteps, total_reward))
    writer.write(batch_builder.build_and_reset())


if __name__ == "__main__":
    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter(DEMO_DATA_DIR)

    env = MountainCar()

    # RLlib uses preprocessors to implement transforms such as one-hot encoding
    # and flattening of tuple and dict observations. For CartPole a no-op
    # preprocessor is used, but this may be relevant for more complex envs.
    prep = get_preprocessor(env.observation_space)(env.observation_space)
    print("The preprocessor is", prep)

    if not hasattr(env.action_space, "n"):
        raise Exception("Keyboard agent only supports discrete action spaces")
    ACTIONS = env.action_space.n
    SKIP_CONTROL = 0  # Use previous control decision SKIP_CONTROL times, that's how you
                      # can test what skip is still usable.

agents = [
    agent_visible_greedy, agent_visible_greedy_spoiled, agent_naive_random
]
rso_count = [10, 20, 40]
env_config['obs_returned'] = 'flatten'
env_config['reward_type'] = 'jones'
episodes = 10000

#!------------ Save experiences generated by the agent
for agent in agents:
    for m in rso_count:
        env_config['rso_count'] = m
        batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
        writer = JsonWriter('/home/ash/ray_results/ssa_experiences/' +
                            agent.__name__ + '/' +
                            str(env_config['rso_count']) + 'RSOs_' +
                            env_config['reward_type'] + '_' +
                            env_config['obs_returned'] + '_' +
                            str(episodes) + 'episodes')

        # You normally wouldn't want to manually create sample batches if a
        # simulator is available, but let's do it anyways for example purposes:
        env = gym.make('ssa_tasker_simple-v2', **{'config': env_config})

        # RLlib uses preprocessors to implement transforms such as one-hot encoding
        # and flattening of tuple and dict observations. For CartPole a no-op
        # preprocessor is used, but this may be relevant for more complex envs.
        prep = get_preprocessor(env.observation_space)(env.observation_space)
        print("The preprocessor is", prep)

        for eps_id in tqdm(range(episodes)):
            obs = env.reset()
            prev_action = np.zeros_like(env.action_space.sample())

def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o in ("--sleep-for-animation"):
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter("./out/")

    # Setting these 2 parameters to True can slow down training
    visuals = False
    sleep_for_animation = False
    if visuals:
        from flatland.utils.rendertools import RenderTool

    max_depth = 30
    tree_depth = 2
    trial_start = 100
    n_trials = 999
    start = 0

    columns = [
        'Agents', 'X_DIM', 'Y_DIM', 'TRIAL_NO', 'REWARD', 'NORMALIZED_REWARD',
        'DONE_RATIO', 'STEPS', 'ACTION_PROB'
    ]
    df_all_results = pd.DataFrame(columns=columns)

    for trials in range(trial_start, n_trials + 1):
        env_file = f"envs-100-999/envs/Level_{trials}.pkl"
        # env_file = f"../env_configs/round_1-small/Test_0/Level_{trials}.mpk"
        # file = f"../env_configs/actions-small/Test_0/Level_{trials}.mpk"
        file = f"envs-100-999/actions/envs/Level_{trials}.json"

        if not os.path.isfile(env_file) or not os.path.isfile(file):
            print("Missing file!", env_file, file)
            continue

        step = 0

        obs_builder_object = TreeObsForRailEnv(
            max_depth=tree_depth,
            predictor=ShortestPathPredictorForRailEnv(max_depth))

        env = RailEnv(
            width=1,
            height=1,
            rail_generator=rail_from_file(env_file),
            schedule_generator=schedule_from_file(env_file),
            malfunction_generator_and_process_data=malfunction_from_file(
                env_file),
            obs_builder_object=obs_builder_object)

        obs, info = env.reset(regenerate_rail=True,
                              regenerate_schedule=True,
                              activate_agents=False,
                              random_seed=1001)

        with open(file, "r") as files:
            expert_actions = json.load(files)

        n_agents = env.get_num_agents()
        x_dim, y_dim = env.width, env.height

        agent_obs = [None] * n_agents
        agent_obs_buffer = [None] * n_agents
        done = dict()
        done["__all__"] = False

        if imitate:
            agent_action_buffer = list(expert_actions[step].values())
        else:
            # , p=[0.2, 0, 0.5])  # [0] * n_agents
            agent_action_buffer = np.random.choice(5, n_agents, replace=True)
        update_values = [False] * n_agents

        max_steps = int(4 * 2 * (20 + env.height + env.width))

        action_size = 5  # 3

        # And some variables to keep track of the progress
        action_dict = dict()
        scores_window = deque(maxlen=100)
        reward_window = deque(maxlen=100)
        done_window = deque(maxlen=100)
        action_prob = [0] * action_size

        # agent = Agent(state_size, action_size)

        if visuals:
            env_renderer = RenderTool(env, gl="PILSVG")
            env_renderer.render_env(show=True, frames=True,
                                    show_observations=True)

        for a in range(n_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        agent_action_buffer = np.zeros(n_agents)
        # prev_action = np.zeros_like(envs.action_space.sample())
        prev_reward = np.zeros(n_agents)

        for step in range(max_steps):
            for a in range(n_agents):
                if info['action_required'][a]:
                    if imitate:
                        if step < len(expert_actions):
                            action = expert_actions[step][str(a)]
                        else:
                            action = 0
                    else:
                        action = 0

                    action_prob[action] += 1
                    update_values[a] = True
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            next_obs, all_rewards, done, info = env.step(action_dict)

            for a in range(n_agents):
                if next_obs[a] is not None:
                    agent_obs[a] = normalize_observation(next_obs[a],
                                                         tree_depth,
                                                         observation_radius=10)

                # Only update the values when we are done or when an action
                # was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    start += 1
                    batch_builder.add_values(
                        t=step,
                        eps_id=trials,
                        agent_index=0,
                        obs=agent_obs_buffer[a],
                        actions=action_dict[a],
                        action_prob=1.0,  # put the true action probability
                        rewards=all_rewards[a],
                        prev_actions=agent_action_buffer[a],
                        prev_rewards=prev_reward[a],
                        dones=done[a],
                        infos=info['action_required'][a],
                        new_obs=agent_obs[a])

                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                    prev_reward[a] = all_rewards[a]

                score += all_rewards[a]  # / envs.get_num_agents()

            if visuals:
                env_renderer.render_env(show=True, frames=True,
                                        show_observations=True)
                if sleep_for_animation:
                    time.sleep(0.5)

            if done["__all__"] or step > max_steps:
                writer.write(batch_builder.build_and_reset())
                break

            # Collect information about training
            if step % 100 == 0:
                tasks_finished = 0
                for current_agent in env.agents:
                    if current_agent.status == RailAgentStatus.DONE_REMOVED:
                        tasks_finished += 1
                print(
                    '\rTrial No {} Training {} Agents on ({},{}).\t Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'
                    .format(
                        trials, env.get_num_agents(), x_dim, y_dim, step,
                        score, score / (max_steps + n_agents),
                        100 * np.mean(tasks_finished /
                                      max(1, env.get_num_agents()))),
                    end=" ")

        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        reward_window.append(score)
        scores_window.append(score / (max_steps + n_agents))

        data = [[
            n_agents, x_dim, y_dim, trials,
            np.mean(reward_window),
            np.mean(scores_window),
            100 * np.mean(done_window), step,
            action_prob / np.sum(action_prob)
        ]]

        df_cur = pd.DataFrame(data, columns=columns)
        df_all_results = pd.concat([df_all_results, df_cur])

        if imitate:
            df_all_results.to_csv(
                f'TreeImitationLearning_DQN_TrainingResults.csv', index=False)

        print(
            '\rTrial No {} Training {} Agents on ({},{}).\t Total Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'
            .format(trials, env.get_num_agents(), x_dim, y_dim, step,
                    np.mean(reward_window), np.mean(scores_window),
                    100 * np.mean(done_window)))

        if visuals:
            env_renderer.close_window()
        gc.collect()

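# The expert transitions written to "./out/" by the script above could, in
# principle, be used for imitation learning with RLlib's MARWIL trainer
# (beta=0.0 reduces it to behavior cloning). This is only a sketch under
# assumptions: an RLlib ~1.x API, and a `make_flatland_env` factory -- a
# placeholder that is NOT defined above -- which wraps RailEnv as an RLlib
# env so the trainer can infer observation and action spaces.
import ray
from ray import tune
from ray.rllib.agents.marwil import MARWILTrainer

ray.init()
tune.register_env("flatland_imitation",
                  lambda cfg: make_flatland_env(cfg))  # placeholder factory (assumption)

trainer = MARWILTrainer(
    env="flatland_imitation",
    config={
        "input": "./out/",       # directory written by the JsonWriter above
        "beta": 0.0,             # 0.0 = plain behavior cloning
        "input_evaluation": [],  # skip off-policy estimation
    },
)
for _ in range(5):
    print(trainer.train()["info"])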