def main(_):
    for map_name in env_names:
        if rl_algo == 'ddpg':
            from agent.ddpg import DDPGAgent
            from networks.acnetwork_q_seperated import ActorNet, CriticNet
            from utils.memory import SequentialMemory

            actor = ActorNet()
            critic = CriticNet()
            memory = SequentialMemory(limit=arglist.DDPG.memory_limit)
            learner = DDPGAgent(actor, critic, memory)
        elif rl_algo == 'ppo':
            from agent.ppo import PPOAgent
            from networks.acnetwork_v_seperated import ActorNet, CriticNet
            from utils.memory import EpisodeMemory

            actor = ActorNet()
            critic = CriticNet()
            memory = EpisodeMemory(limit=arglist.PPO.memory_limit,
                                   action_shape=arglist.action_shape,
                                   observation_shape=arglist.observation_shape)
            learner = PPOAgent(actor, critic, memory)
        else:
            raise NotImplementedError()

        preprocess = Preprocess()
        game = MiniGame(map_name, learner, preprocess, nb_episodes=10000)
        game.run_ddpg()
    return 0
def __init__(self, agent_id: int, enforce_env_name: str = None):
    """A Chiefinvestigator can assign an investigator to inspect the model and produce high-level analysis.

    Args:
        agent_id: ID of the agent that will be analyzed.
        enforce_env_name: Name of a gym environment to enforce over the agent's original
            environment. Defaults to None, in which case the agent's own environment is used.
    """
    self.agent = PPOAgent.from_agent_state(agent_id, from_iteration='best')
    super().__init__(self.agent.policy, self.agent.distribution, self.agent.preprocessor)

    self.env = self.agent.env
    if enforce_env_name is not None:
        print(f"Enforcing environment {enforce_env_name} over the agent's original environment. If you want "
              f"to use the same environment as the original agent anyway, there is no need to specify it in "
              f"the constructor!")
        self.env = gym.make(enforce_env_name)

    self.agent.preprocessor = CombiWrapper([
        StateNormalizationWrapper(self.agent.state_dim),
        RewardNormalizationWrapper()
    ])  # dirty fix, TODO remove soon

    self.weights = self.get_layer_weights('policy_recurrent_layer')
    self.n_hidden = self.weights[1].shape[0]
    self._get_rnn_type()
    self.sub_model_from = build_sub_model_from(self.network, "beta_action_head")
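# A minimal usage sketch, assuming this constructor belongs to the
# Chiefinvestigator class named in its docstring (the agent ID is a placeholder
# for a real saved agent state):
chief = Chiefinvestigator(agent_id=1580042580)
print(chief.n_hidden)          # number of units in the recurrent policy layer
print(chief.weights[1].shape)  # one of the weight tensors of 'policy_recurrent_layer'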
def evaluate():
    """Evaluate an agent."""
    if request.method == "POST":
        try:
            agent = PPOAgent.from_agent_state(request.json['id'])
            evaluation_stats, _ = agent.evaluate(10, save=True)
            return {"results": evaluation_stats._asdict()}
        except Exception as e:
            return {"success": repr(e)}

    return {"success": "success"}
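# A hedged client-side sketch for calling this endpoint, assuming it is exposed
# as a POST route (e.g. "/evaluate") on a locally running Flask server; the URL
# and agent id below are placeholders:
import requests

response = requests.post("http://localhost:5000/evaluate", json={"id": 1580042580})
print(response.json())  # {"results": {...}} on success, {"success": "<error repr>"} on failure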
import os
import sys

from agent.ppo import PPOAgent  # import path as used in the sibling test script

if len(sys.argv) < 4:
    print("Usage: python train.py [stock] [window] [episodes]")
    exit()

stock_name, window_size, episode_count = sys.argv[1], int(sys.argv[2]), int(sys.argv[3])

model_file = None
if len(sys.argv) == 5:
    model_file = sys.argv[4]

if __name__ == '__main__':
    actor_model_file = None
    critic_model_file = None
    if model_file is not None:
        model_file_path = os.path.dirname(os.path.abspath(model_file))
        base_filename = os.path.basename(os.path.abspath(model_file))
        base_filename = os.path.splitext(base_filename)[0]
        # the episode number is encoded after the last underscore in the checkpoint file name
        index = base_filename.rfind("_")
        episode_to_load = base_filename[index + 1:]
        critic_model_file = os.path.join(
            model_file_path, "model_critic_" + stock_name + "_" + episode_to_load + ".h5")
        actor_model_file = os.path.join(
            model_file_path, "model_actor_" + stock_name + "_" + episode_to_load + ".h5")

    agent = PPOAgent(StockPredict(window_size, stock_name, episode_count),
                     actor_model_file, critic_model_file)
    agent.run(episode_count)
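# Example invocations (stock name and checkpoint file are placeholders):
#   python train.py AAPL 10 1000
#   python train.py AAPL 10 1000 models/model_actor_AAPL_500.h5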
def test():
    parser = argparse.ArgumentParser(description='Test an agent in the ViZDoom environment.')
    parser.add_argument('agent_path', help='path to the agent checkpoint')
    parser.add_argument('--show_game', dest='show_game', default=False, action='store_true',
                        help='whether to show the game while the agent is playing')
    parser.add_argument('--record', dest='record', default=False, action='store_true',
                        help='whether to record the agent playing')
    parser.add_argument('--output_path', dest='output_path', help='output path for the replay')
    parser.add_argument('--cuda', dest='cuda', default=False, action='store_true',
                        help='whether to use cuda')
    parser.add_argument('--n_games', dest='n_games', default=1, type=int,
                        help='number of games to play')
    args = parser.parse_args()

    env_params, progress_params, agent_params = CheckpointMonitor.load(args.agent_path)
    game = initialize_vizdoom(env_params["map_name"], args.show_game)
    actions = env_params["env"]["actions"]
    in_channels = env_params["env"]["state_dim"][0] * env_params["env"]["frames_per_state"]

    if env_params["agent"]["nn"] == 'deepmind_cnn':
        feature_net = CNN(in_channels)
    elif env_params["agent"]["nn"] == 'capsnet':
        feature_net = CapsNet(in_channels)

    if env_params["agent"]["alg"] == 'ppo':
        policy = ActorCriticPolicy(feature_net, len(actions))
        agent = PPOAgent(policy, None, None, cuda=args.cuda)
    elif env_params["agent"]["alg"] == 'a2c':
        policy = ActorCriticPolicy(feature_net, len(actions))
        agent = A2CAgent(policy, None, cuda=args.cuda)
    elif env_params["agent"]["alg"] == 'dqn':
        q_net = QNetwork(feature_net, len(actions))
        agent = DQNAgent(q_net, q_net, None, None, None, cuda=args.cuda)

    agent.load(agent_params)
    checkpoint_monitor = CheckpointMonitor(env_params, agent)
    generator = TrajectoryGenerator(game, 0, 0, agent,
                                    param_schedules=progress_params.get("schedules", None),
                                    monitors=[checkpoint_monitor, env_params["progress_monitor"]],
                                    **env_params["env"])
    mean, std, max_score, min_score, frames = generator.test(args.n_games, args.record)
    print("Score: %.1f +/- %.1f, max: %.1f, min: %.1f" % (mean, std, max_score, min_score))

    if args.record:
        save_recording(frames, args.output_path)
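# Example invocation, assuming this function is wired up as the entry point of a
# test script (the checkpoint path is a placeholder):
#   python test.py checkpoints/agent_100.pt --show_game --record --output_path replays/ --n_games 5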
import json
import os
from typing import List

import matplotlib.pyplot as plt
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from scipy.signal import savgol_filter

from agent.ppo import PPOAgent
from utilities.const import QUALITATIVE_COLOR_PALETTE, PATH_TO_EXPERIMENTS

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

AGENT_ID = 1587117437

os.chdir("../../")
with open(f"{PATH_TO_EXPERIMENTS}/{AGENT_ID}/progress.json", "r") as f:
    data = json.load(f)

with open(f"{PATH_TO_EXPERIMENTS}/{AGENT_ID}/meta.json", "r") as f:
    meta = json.load(f)

agent = PPOAgent.from_agent_state(AGENT_ID, "b", path_modifier="")

mean_rewards = data["rewards"]["mean"]
mean_rewards_smooth = savgol_filter(mean_rewards, 51, 3)
std_rewards = data["rewards"]["stdev"]

axs: List[Axes]
fig: Figure = plt.figure(figsize=(12, 4))
grid = plt.GridSpec(1, 3)

progression_ax = fig.add_subplot(grid[:2])
progression_ax.set_xlim(0, len(mean_rewards))
if (meta["environment"]["reward_threshold"] != "None"
        and meta["environment"]["reward_threshold"] is not None):
    progression_ax.axhline(meta["environment"]["reward_threshold"], ls="--", color="grey")
progression_ax.plot(mean_rewards,
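# The savgol_filter call above fits a third-order polynomial over a sliding
# 51-sample window (Savitzky-Golay smoothing). A minimal self-contained
# illustration of the same call on synthetic data:
import numpy as np
from scipy.signal import savgol_filter

noisy = np.sin(np.linspace(0, 4 * np.pi, 200)) + np.random.normal(0, 0.3, 200)
smooth = savgol_filter(noisy, window_length=51, polyorder=3)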
#!/usr/bin/env python
"""Example script on loading and inspecting an agent."""
import os

from agent.ppo import PPOAgent
from analysis.investigation import Investigator

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

os.chdir("../")
agent = PPOAgent.from_agent_state(1580042580)
inv = Investigator.from_agent(agent)

# render the agent at different steps
inv.render_episode(agent.env)
import argparse
import os
import statistics
import time

from agent.ppo import PPOAgent
# BASE_SAVE_PATH is assumed to live in the project's constants module,
# alongside PATH_TO_EXPERIMENTS used in the sibling scripts
from utilities.const import BASE_SAVE_PATH

parser = argparse.ArgumentParser(description="Evaluate a trained agent.")
parser.add_argument("id",  # the agent id is an optional positional argument
                    type=int,
                    nargs="?",
                    help="id of the agent, defaults to newest",
                    default=None)
parser.add_argument("-n", type=int, help="number of evaluation episodes", default=10)
args = parser.parse_args()

if args.id is None:
    ids = map(int, os.listdir(BASE_SAVE_PATH))
    args.id = max(ids)

start = time.time()
agent = PPOAgent.from_agent_state(args.id, "b")
print(f"Agent {args.id} successfully loaded.")

stats, _ = agent.evaluate(args.n)

average_reward = round(statistics.mean(stats.episode_rewards), 2)
average_length = round(statistics.mean(stats.episode_lengths), 2)
std_reward = round(statistics.stdev(stats.episode_rewards), 2)
std_length = round(statistics.stdev(stats.episode_lengths), 2)

print(f"Evaluated agent on {args.n} x {agent.env_name} and achieved an average reward of {average_reward} "
      f"[std: {std_reward}; between ({min(stats.episode_rewards)}, {max(stats.episode_rewards)})].\n"
      f"An episode on average took {average_length} steps [std: {std_length}; "
      f"between ({min(stats.episode_lengths)}, {max(stats.episode_lengths)})].\n"
      f"This took me {round(time.time() - start, 2)}s.")
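# Example invocations, assuming this snippet is saved as evaluate.py (the id is
# a placeholder):
#   python evaluate.py 1580042580 -n 20
#   python evaluate.py                 # falls back to the newest saved agent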
#!/usr/bin/env python
"""Example script on loading an agent and rendering episodes."""
import os

from agent.ppo import PPOAgent
from analysis.investigation import Investigator

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

os.chdir("../")
AGENT_ID = 1580042580

latest_agent = PPOAgent.from_agent_state(AGENT_ID)
persistent_env = latest_agent.env

# iterate over every save of the agent made during training to see the evolution of its behaviour
for iteration in PPOAgent.get_saved_iterations(AGENT_ID):
    # load the agent and wrap an investigator around it
    agent = PPOAgent.from_agent_state(AGENT_ID, from_iteration=iteration)
    inv = Investigator.from_agent(agent)

    # render a randomly initialized episode
    inv.render_episode(persistent_env)

    # just print a line to make the output more readable
    print()
def run_experiment(environment, settings: dict, verbose=True, init_ray=True, use_monitor=False) -> PPOAgent:
    """Run an experiment with the given settings."""

    # sanity checks and warnings for given parameters
    if settings["preload"] is not None and settings["load_from"] is not None:
        raise InconsistentArgumentError("You asked to load both from a pretrained component and from another "
                                        "agent state. This cannot be resolved.")

    # set up the environment, then extract and report information about it
    env = gym.make(environment)
    state_dim, number_of_actions = env_extract_dims(env)
    env_action_space_type = "continuous" if isinstance(env.action_space, Box) else "discrete"
    env_observation_space_type = "continuous" if isinstance(env.observation_space, Box) else "discrete"
    env_name = env.unwrapped.spec.id

    if env.spec.max_episode_steps is not None and env.spec.max_episode_steps > settings["horizon"] \
            and not settings["eval"]:
        logging.warning("Careful! Your horizon is shorter than the environment's maximum episode length; "
                        "this will most likely skew stats heavily.")

    # choose and make the policy distribution
    if settings["distribution"] is None:
        settings["distribution"] = "categorical" if env_action_space_type == "discrete" else "gaussian"
    distribution = get_distribution_by_short_name(settings["distribution"])(env)

    # set the appropriate model-building function
    if "ShadowHand" in environment or settings["architecture"] == "shadow":
        if settings["model"] == "ffn":
            print("Cannot use ffn with the shadow architecture. Defaulting to GRU.")
            settings["model"] = "gru"

        if env.visual_input:
            build_models = get_model_builder(model="shadow", model_type=settings["model"],
                                             shared=settings["shared"])
        else:
            build_models = build_blind_shadow_brain_v1
    else:
        build_models = get_model_builder(model=settings["architecture"], model_type=settings["model"],
                                         shared=settings["shared"])

    # make the preprocessor
    preprocessor = CombiWrapper(
        [StateNormalizationWrapper(state_dim) if not settings["no_state_norming"] else SkipWrapper(),
         RewardNormalizationWrapper() if not settings["no_reward_norming"] else SkipWrapper()])

    # announce the experiment
    bc, ec, wn = COLORS["HEADER"], COLORS["ENDC"], COLORS["WARNING"]
    if verbose:
        print(f"-----------------------------------------\n"
              f"{wn}Learning the Task{ec}: {bc}{env_name}{ec}\n"
              f"{bc}{state_dim}{ec}-dimensional states ({bc}{env_observation_space_type}{ec}) "
              f"and {bc}{number_of_actions}{ec} actions ({bc}{env_action_space_type}{ec}).\n"
              f"Config: {settings['config']}\n"
              f"Model: {build_models.__name__}\n"
              f"Distribution: {settings['distribution']}\n"
              f"-----------------------------------------\n")
        print(f"{wn}HyperParameters{ec}: {settings}\n")

    if settings["cpu"]:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    if settings["load_from"] is not None:
        if verbose:
            print(f"{wn}Loading{ec} from state {settings['load_from']}")
        agent = PPOAgent.from_agent_state(settings["load_from"])
    else:
        # set up the agent and a reporting module
        agent = PPOAgent(build_models, env,
                         horizon=settings["horizon"],
                         workers=settings["workers"],
                         learning_rate=settings["lr_pi"],
                         lr_schedule=settings["lr_schedule"],
                         discount=settings["discount"],
                         clip=settings["clip"],
                         c_entropy=settings["c_entropy"],
                         c_value=settings["c_value"],
                         lam=settings["lam"],
                         gradient_clipping=settings["grad_norm"],
                         clip_values=settings["clip_values"],
                         tbptt_length=settings["tbptt"],
                         distribution=distribution,
                         preprocessor=preprocessor,
                         pretrained_components=None if settings["preload"] is None else [settings["preload"]],
                         debug=settings["debug"])
        print(f"{wn}Created agent{ec} with ID {bc}{agent.agent_id}{ec}")

    if tf.test.is_gpu_available():
        agent.set_gpu(not settings["cpu"])
    else:
        agent.set_gpu(False)

    monitor = None
    if use_monitor:
        monitor = Monitor(agent, env,
                          frequency=settings["monitor_frequency"],
                          gif_every=settings["gif_every"],
                          iterations=settings["iterations"],
                          config_name=settings["config"])

    redis_auth = None if settings["redis_ip"] is None else [settings["redis_ip"], settings["redis_pw"]]
    agent.drill(n=settings["iterations"],
                epochs=settings["epochs"],
                batch_size=settings["batch_size"],
                monitor=monitor,
                export=settings["export_file"],
                save_every=settings["save_every"],
                separate_eval=settings["eval"],
                stop_early=settings["stop_early"],
                parallel=not settings["sequential"],
                ray_is_initialized=not init_ray,
                radical_evaluation=settings["radical_evaluation"],
                redis_auth=redis_auth)

    agent.save_agent_state()
    env.close()
    return agent
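# A hedged sketch of calling run_experiment. The settings dict must supply every
# key the function reads ("horizon", "workers", "lr_pi", "iterations", ...);
# default_settings() below is a hypothetical helper standing in for however the
# project assembles that dict from its config layer, and the environment name is
# just a standard gym example.
settings = default_settings()
settings.update({"iterations": 100, "workers": 8, "eval": False})
agent = run_experiment("LunarLanderContinuous-v2", settings, verbose=True)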
import argparse
import os
import sys

from agent.ppo import PPOAgent
from test_environment import TestEnvironment

parser = argparse.ArgumentParser(description='test with ppo.')
parser.add_argument('--stock_name', dest='stock_name', required=True, help='name of stock')
parser.add_argument('--window', dest='window', default=10, type=int,
                    help='number of candles in window')
parser.add_argument('--model_file', dest='model_file', required=True,
                    help='checkpoint file relative to models directory')
args = parser.parse_args()

agent = PPOAgent(TestEnvironment(args.window, args.stock_name, 1000), args.model_file)
agent.test()
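# Example invocation (stock name and checkpoint file are placeholders):
#   python test.py --stock_name AAPL --window 10 --model_file model_actor_AAPL_500.h5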
def train():
    parser = argparse.ArgumentParser(description='Train an agent in the ViZDoom environment.')
    parser.add_argument('map_name', help='path to the map config')
    parser.add_argument('--output_path', dest='output_path',
                        help='output path for agent checkpoints')
    parser.add_argument('--save_interval', dest='save_interval', default=10, type=int,
                        help='interval, measured in epochs, between each agent checkpoint')
    parser.add_argument('--cuda', dest='cuda', default=False, action='store_true',
                        help='whether to use cuda')
    parser.add_argument('--log_interval', dest='log_interval', default=10, type=int,
                        help='interval between each progress update log')
    parser.add_argument('--score_buffer_size', dest='score_buffer_size', default=50, type=int,
                        help='the number of most recent scores that will be kept to compute statistics')
    parser.add_argument('--n_epochs', dest='n_epochs', default=1000, type=int,
                        help='number of epochs')
    parser.add_argument('--epoch_len', dest='epoch_len', default=1024, type=int,
                        help='the length of an epoch')
    parser.add_argument('--lr', dest='lr', default=2.5e-4, type=float, help='learning rate')
    parser.add_argument('--lr_decay', dest='decay_lr', default=False, action='store_true',
                        help='whether to decay the learning rate each epoch')
    parser.add_argument('--gamma', dest='gamma', default=0.99, type=float, help='discount factor')
    parser.add_argument('--batch_size', dest='batch_size', default=32, type=int, help='batch size')
    parser.add_argument('--alg', dest='alg', default='ppo', choices=['ppo', 'dqn', 'a2c'],
                        help='the algorithm the agent will use')
    parser.add_argument('--nn', dest='nn', default='deepmind_cnn', choices=['deepmind_cnn', 'capsnet'],
                        help='neural network that the agent will use as its feature network')
    parser.add_argument('--frame_skip', dest='frame_skip', default=4, type=int,
                        help='number of frames to skip each action')
    parser.add_argument('--frames_per_state', dest='frames_per_state', default=4, type=int,
                        help='number of frames to stack for every state')
    parser.add_argument('--state_w', dest='state_w', default=108, type=int,
                        help='target state width to resize each frame to')
    parser.add_argument('--state_h', dest='state_h', default=60, type=int,
                        help='target state height to resize each frame to')
    parser.add_argument('--state_rgb', dest='rgb', default=False, action='store_true',
                        help='whether to use rgb or gray frames')
    parser.add_argument('--shape_rewards', dest='shape_rewards', default=False, action='store_true',
                        help='whether to use a reward shaping function specified for the selected map')
    parser.add_argument('--use_default_actions_for_map', dest='use_default_actions', default=False,
                        action='store_true',
                        help='whether to use a default set of actions specified for the selected map')
    parser.add_argument('--ppo_lambda', dest='lam', default=0.95, type=float,
                        help='lambda value for GAE')
    parser.add_argument('--ppo_eps', dest='eps', default=0.1, type=float,
                        help='clipping parameter for PPO')
    parser.add_argument('--ppo_decay_params', dest='ppo_decay', default=False, action='store_true',
                        help='whether to linearly decay the PPO learning rate and epsilon each epoch')
    parser.add_argument('--ppo_ent_coeff', dest='ent_coeff', default=0.01, type=float,
                        help='entropy coefficient for PPO')
    parser.add_argument('--ppo_value_coeff', dest='value_coeff', default=1.0, type=float,
                        help='value coefficient for PPO')
    parser.add_argument('--ppo_opt_epochs', dest='opt_epochs', default=4, type=int,
                        help='number of optimization epochs for PPO')
    parser.add_argument('--dqn_use_ddqn', dest='ddqn', default=False, action='store_true',
                        help='whether to use ddqn instead of dqn')
    parser.add_argument('--dqn_dueling', dest='dueling', default=False, action='store_true',
                        help='whether to use a dueling architecture in dqn')
    parser.add_argument('--dqn_min_eps', dest='min_eps', default=0.01, type=float,
                        help='minimum value of epsilon for dqn')
    parser.add_argument('--dqn_mem_size', dest='memory_size', default=100000, type=int,
                        help='replay memory size for dqn')
    parser.add_argument('--dqn_init_size', dest='init_size', default=10000, type=int,
                        help='number of timesteps before dqn starts learning')
    parser.add_argument('--dqn_q_update_interval', dest='q_update_interval', default=1, type=int,
                        help='the interval between updates of the q function')
    parser.add_argument('--dqn_target_update_interval', dest='target_update_interval', default=1000,
                        type=int, help='the interval between updates of the target q function')
    args = parser.parse_args()

    game = initialize_vizdoom(args.map_name)
    if args.use_default_actions:
        actions = default_actions_for_map(game, args.map_name)
    else:
        actions = all_actions(game)
    reward_fn = default_reward_shaping(args.map_name) if args.shape_rewards else None
    in_channels = args.frames_per_state * (3 if args.rgb else 1)

    if args.nn == 'deepmind_cnn':
        feature_net = CNN(in_channels)
    elif args.nn == 'capsnet':
        feature_net = CapsNet(in_channels)

    if args.alg == 'ppo':
        policy = ActorCriticPolicy(feature_net, len(actions))
        optimizer = torch.optim.Adam(policy.parameters(), lr=args.lr)
        eps_sched = LinearSchedule("eps", args.eps, 1, args.n_epochs,
                                   end_val=1.0 if not args.ppo_decay else 0.0)
        lr_sched = LRWrapper(optimizer,
                             LinearSchedule("lr", args.lr, 1, args.n_epochs,
                                            end_val=1.0 if not args.ppo_decay else 0.0))
        schedules = [lr_sched, eps_sched]
        agent = PPOAgent(policy, optimizer, eps_sched, cuda=args.cuda,
                         n_timesteps=args.epoch_len, batch_size=args.batch_size,
                         opt_epochs=args.opt_epochs, gamma=args.gamma, lam=args.lam,
                         entropy_coeff=args.ent_coeff, value_coeff=args.value_coeff)
    elif args.alg == 'a2c':
        policy = ActorCriticPolicy(feature_net, len(actions))
        optimizer = torch.optim.Adam(policy.parameters(), lr=args.lr)
        lr_sched = LRWrapper(optimizer,
                             LinearSchedule("lr", args.lr, 1, args.n_epochs,
                                            end_val=1.0 if not args.decay_lr else 0.0))
        schedules = [lr_sched]
        agent = A2CAgent(policy, optimizer, args.cuda, args.gamma, args.epoch_len)
    elif args.alg == 'dqn':
        q = QNetwork(feature_net, len(actions))
        tq = QNetwork(feature_net, len(actions))
        optimizer = torch.optim.Adam(q.parameters(), lr=args.lr)
        memory = ReplayMemory(args.memory_size)
        eps_sched = LinearSchedule("eps", 1, 1, args.n_epochs, end_val=args.min_eps)
        lr_sched = LRWrapper(optimizer,
                             LinearSchedule("lr", args.lr, 1, args.n_epochs,
                                            end_val=1.0 if not args.decay_lr else 0.0))
        schedules = [lr_sched, eps_sched]
        agent = DQNAgent(q, tq, optimizer, memory, eps_sched, cuda=args.cuda,
                         init_steps=args.init_size, q_update_interval=args.q_update_interval,
                         target_update_interval=args.target_update_interval, ddqn=args.ddqn,
                         gamma=args.gamma, batch_size=args.batch_size)

    progress_monitor = ProgressMonitor(args.score_buffer_size, monitor_interval=args.log_interval)
    env_params = {
        "env": {
            "frame_skip": args.frame_skip,
            "frames_per_state": args.frames_per_state,
            "state_dim": (3 if args.rgb else 1, args.state_h, args.state_w),
            "actions": actions
        },
        "agent": {
            "alg": args.alg,
            "nn": args.nn
        },
        "save_path": args.output_path,
        "save_interval": args.save_interval,
        "progress_monitor": progress_monitor,
        "map_name": args.map_name
    }

    if args.output_path:
        checkpoint_monitor = CheckpointMonitor(env_params, agent)
        monitors = [checkpoint_monitor, progress_monitor]
    else:
        monitors = [progress_monitor]

    generator = TrajectoryGenerator(game, args.n_epochs, args.epoch_len, agent,
                                    shape_reward_fn=reward_fn, monitors=monitors,
                                    param_schedules=schedules, **env_params["env"])
    generator.run()
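# Example invocation, assuming this function is wired up as the entry point of a
# training script (the map config path is a placeholder):
#   python train.py maps/basic.cfg --alg ppo --nn deepmind_cnn --n_epochs 500 --cuda --output_path checkpoints/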