def make_ddqn_agent(obs_space_dim, action_space_dim): gamma = 1 obs_low = np.array([-1] * obs_space_dim) obs_high = np.array([1] * obs_space_dim) ac_low = np.array([-1] * action_space_dim) ac_high = np.array([1] * action_space_dim) obsSpace = gym.spaces.Box(obs_low, obs_high) actSpace = gym.spaces.Box(ac_low, ac_high) qFunc = q_functions.FCQuadraticStateQFunction(obsSpace.low.size, actSpace.low.size, n_hidden_channels=50, n_hidden_layers=2, action_space=actSpace) optimizer = chainer.optimizers.Adam(eps=1e-2) optimizer.setup(qFunc) # Use AdditiveOU for exploration ou_sigma = (actSpace.high - actSpace.low) * 0.25 explorer = explorers.AdditiveOU(sigma=ou_sigma) # DQN uses Experience Replay. # Specify a replay buffer and its capacity. replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10**6) phi = lambda x: x.astype(np.float32, copy=False) agent = chainerrl.agents.DoubleDQN(qFunc, optimizer, replay_buffer, gamma, explorer, replay_start_size=500, update_interval=1, target_update_interval=100, phi=phi) return agent
def build_agent() -> DoubleDQN: # observation: # friction on each somite (#somite) # tension on each somite except for both ends (#somite - 2) # cos(somite phases), sin(somites phases) (#oscillator x 2) # cos(gripper phases), sin(gripper phases) (#gripper x 2) obs_size = config.somites + (config.somites - 2) + config.oscillators*2 + config.grippers*2 # actions: feedbacks to somite oscillators, feedbacks to gripper oscillators action_size = config.oscillators + config.grippers action_space = spaces.Box(low=-F_OUTPUT_BOUND, high=F_OUTPUT_BOUND, shape=(action_size)) q_func = q_functions.FCQuadraticStateQFunction( obs_size, action_size, n_hidden_channels=6, n_hidden_layers=2, action_space=action_space) opt = optimizers.Adam() opt.setup(q_func) explorer = explorers.AdditiveOU(sigma=OU_SIGMA) rbuf = replay_buffer.ReplayBuffer(capacity=5 * 10 ** 5) phi = lambda x: x.astype(np.float32) agent = DoubleDQN(q_func, opt, rbuf, gamma=GAMMA, explorer=explorer, phi=lambda x: x.astype(np.float32), gpu=GPU, replay_start_size=10000) return agent
def make_q_func(self, env): n_dim_action = env.action_space.low.size n_dim_obs = env.observation_space.low.size return q_functions.FCQuadraticStateQFunction( n_input_channels=n_dim_obs, n_dim_action=n_dim_action, n_hidden_channels=20, n_hidden_layers=2, action_space=env.action_space)
def dqn_q_values_and_neuronal_net(self, args, action_space, obs_size, obs_space): """ learning process """ if isinstance(action_space, spaces.Box): action_size = action_space.low.size # Use NAF to apply DQN to continuous action spaces q_func = q_functions.FCQuadraticStateQFunction( obs_size, action_size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, action_space=action_space) # Use the Ornstein-Uhlenbeck process for exploration ou_sigma = (action_space.high - action_space.low) * 0.2 explorer = explorers.AdditiveOU(sigma=ou_sigma) else: n_actions = action_space.n # print("n_actions: ", n_actions) q_func = q_functions.FCStateQFunctionWithDiscreteAction( obs_size, n_actions, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers) # print("q_func ", q_func) # Use epsilon-greedy for exploration explorer = explorers.LinearDecayEpsilonGreedy( args.start_epsilon, args.end_epsilon, args.final_exploration_steps, action_space.sample) # print("explorer: ", explorer) if args.noisy_net_sigma is not None: links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() # print("obs_space.low : ", obs_space.shape) chainerrl.misc.draw_computational_graph( [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])], os.path.join(args.outdir, 'model')) opt = optimizers.Adam() opt.setup(q_func) rbuf_capacity = 5 * 10**5 if args.minibatch_size is None: args.minibatch_size = 32 if args.prioritized_replay: betasteps = (args.steps - args.replay_start_size) \ // args.update_interval rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity, betasteps=betasteps) else: rbuf = replay_buffer.ReplayBuffer(rbuf_capacity) return q_func, opt, rbuf, explorer
def create_state_q_function_for_env(env): assert isinstance(env.observation_space, gym.spaces.Box) ndim_obs = env.observation_space.low.size if isinstance(env.action_space, gym.spaces.Discrete): return q_functions.FCStateQFunctionWithDiscreteAction( ndim_obs=ndim_obs, n_actions=env.action_space.n, n_hidden_channels=200, n_hidden_layers=2) elif isinstance(env.action_space, gym.spaces.Box): return q_functions.FCQuadraticStateQFunction( n_input_channels=ndim_obs, n_dim_action=env.action_space.low.size, n_hidden_channels=200, n_hidden_layers=2, action_space=env.action_space) else: raise NotImplementedError()
def main(): import logging logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--env', type=str, default='Pendulum-v0') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--final-exploration-steps', type=int, default=10**4) parser.add_argument('--start-epsilon', type=float, default=1.0) parser.add_argument('--end-epsilon', type=float, default=0.1) parser.add_argument('--noisy-net-sigma', type=float, default=None) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default=None) parser.add_argument('--steps', type=int, default=10**5) parser.add_argument('--prioritized-replay', action='store_true') parser.add_argument('--replay-start-size', type=int, default=1000) parser.add_argument('--target-update-interval', type=int, default=10**2) parser.add_argument('--target-update-method', type=str, default='hard') parser.add_argument('--soft-update-tau', type=float, default=1e-2) parser.add_argument('--update-interval', type=int, default=1) parser.add_argument('--eval-n-runs', type=int, default=100) parser.add_argument('--eval-interval', type=int, default=10**4) parser.add_argument('--n-hidden-channels', type=int, default=100) parser.add_argument('--n-hidden-layers', type=int, default=2) parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--minibatch-size', type=int, default=None) parser.add_argument('--render-train', action='store_true') parser.add_argument('--render-eval', action='store_true') parser.add_argument('--monitor', action='store_true') parser.add_argument('--reward-scale-factor', type=float, default=1e-3) args = parser.parse_args() # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu, )) args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv) print('Output files are saved in {}'.format(args.outdir)) def clip_action_filter(a): return np.clip(a, action_space.low, action_space.high) def make_env(test): env = gym.make(args.env) # Use different random seeds for train and test envs env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = chainerrl.wrappers.Monitor(env, args.outdir) if isinstance(env.action_space, spaces.Box): misc.env_modifiers.make_action_filtered(env, clip_action_filter) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if ((args.render_eval and test) or (args.render_train and not test)): env = chainerrl.wrappers.Render(env) return env env = make_env(test=False) timestep_limit = env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = env.observation_space obs_size = obs_space.low.size action_space = env.action_space if isinstance(action_space, spaces.Box): action_size = action_space.low.size # Use NAF to apply DQN to continuous action spaces q_func = q_functions.FCQuadraticStateQFunction( obs_size, action_size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, action_space=action_space) # Use the Ornstein-Uhlenbeck process for exploration ou_sigma = (action_space.high - action_space.low) * 0.2 explorer = explorers.AdditiveOU(sigma=ou_sigma) else: n_actions = action_space.n q_func = q_functions.FCStateQFunctionWithDiscreteAction( obs_size, n_actions, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers) # Use epsilon-greedy for exploration explorer = explorers.LinearDecayEpsilonGreedy( args.start_epsilon, args.end_epsilon, args.final_exploration_steps, action_space.sample) if args.noisy_net_sigma is not None: links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() # Draw the computational graph and save it in the output directory. chainerrl.misc.draw_computational_graph( [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])], os.path.join(args.outdir, 'model')) opt = optimizers.Adam() opt.setup(q_func) rbuf_capacity = 5 * 10**5 if args.minibatch_size is None: args.minibatch_size = 32 if args.prioritized_replay: betasteps = (args.steps - args.replay_start_size) \ // args.update_interval rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity, betasteps=betasteps) else: rbuf = replay_buffer.ReplayBuffer(rbuf_capacity) agent = DQN( q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, minibatch_size=args.minibatch_size, target_update_method=args.target_update_method, soft_update_tau=args.soft_update_tau, ) if args.load: agent.load(args.load) eval_env = make_env(test=True) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, eval_env=eval_env, train_max_episode_len=timestep_limit)
def main(): import logging logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('--outdir', type=str, default='dqn_out') parser.add_argument('--env', type=str, default='Pendulum-v0') parser.add_argument('--seed', type=int, default=None) parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--final-exploration-steps', type=int, default=10**4) parser.add_argument('--start-epsilon', type=float, default=1.0) parser.add_argument('--end-epsilon', type=float, default=0.1) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default=None) parser.add_argument('--steps', type=int, default=10**5) parser.add_argument('--prioritized-replay', action='store_true') parser.add_argument('--episodic-replay', action='store_true') parser.add_argument('--replay-start-size', type=int, default=1000) parser.add_argument('--target-update-interval', type=int, default=10**2) parser.add_argument('--target-update-method', type=str, default='hard') parser.add_argument('--soft-update-tau', type=float, default=1e-2) parser.add_argument('--update-interval', type=int, default=1) parser.add_argument('--eval-n-runs', type=int, default=100) parser.add_argument('--eval-interval', type=int, default=10**4) parser.add_argument('--n-hidden-channels', type=int, default=100) parser.add_argument('--n-hidden-layers', type=int, default=2) parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--minibatch-size', type=int, default=None) parser.add_argument('--render-train', action='store_true') parser.add_argument('--render-eval', action='store_true') parser.add_argument('--monitor', action='store_true') parser.add_argument('--reward-scale-factor', type=float, default=1e-3) args = parser.parse_args() args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv) print('Output files are saved in {}'.format(args.outdir)) if args.seed is not None: misc.set_random_seed(args.seed) def clip_action_filter(a): return np.clip(a, action_space.low, action_space.high) def make_env(for_eval): env = gym.make(args.env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) if isinstance(env.action_space, spaces.Box): misc.env_modifiers.make_action_filtered(env, clip_action_filter) if not for_eval: misc.env_modifiers.make_reward_filtered( env, lambda x: x * args.reward_scale_factor) if ((args.render_eval and for_eval) or (args.render_train and not for_eval)): misc.env_modifiers.make_rendered(env) return env env = make_env(for_eval=False) timestep_limit = env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = env.observation_space obs_size = obs_space.low.size action_space = env.action_space if isinstance(action_space, spaces.Box): action_size = action_space.low.size # Use NAF to apply DQN to continuous action spaces q_func = q_functions.FCQuadraticStateQFunction( obs_size, action_size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, action_space=action_space) # Use the Ornstein-Uhlenbeck process for exploration ou_sigma = (action_space.high - action_space.low) * 0.2 explorer = explorers.AdditiveOU(sigma=ou_sigma) else: n_actions = action_space.n q_func = q_functions.FCStateQFunctionWithDiscreteAction( obs_size, n_actions, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers) # Use epsilon-greedy for exploration explorer = explorers.LinearDecayEpsilonGreedy( args.start_epsilon, args.end_epsilon, args.final_exploration_steps, action_space.sample) # Draw the computational graph and save it in the output directory. chainerrl.misc.draw_computational_graph( [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])], os.path.join(args.outdir, 'model')) opt = optimizers.Adam() opt.setup(q_func) rbuf_capacity = 5 * 10**5 if args.episodic_replay: if args.minibatch_size is None: args.minibatch_size = 4 if args.prioritized_replay: betasteps = (args.steps - args.replay_start_size) \ // args.update_interval rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer( rbuf_capacity, betasteps=betasteps) else: rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity) else: if args.minibatch_size is None: args.minibatch_size = 32 if args.prioritized_replay: betasteps = (args.steps - args.replay_start_size) \ // args.update_interval rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity, betasteps=betasteps) else: rbuf = replay_buffer.ReplayBuffer(rbuf_capacity) def phi(obs): return obs.astype(np.float32) agent = DQN(q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, phi=phi, minibatch_size=args.minibatch_size, target_update_method=args.target_update_method, soft_update_tau=args.soft_update_tau, episodic_update=args.episodic_replay, episodic_update_len=16) if args.load: agent.load(args.load) eval_env = make_env(for_eval=True) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_runs=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, eval_env=eval_env, max_episode_len=timestep_limit)
def main(): parser = argparse.ArgumentParser() parser.add_argument('--outdir', type=str, default='/tmp/chainerRL_results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--final-exploration-steps', type=int, default=10**4) parser.add_argument('--start-epsilon', type=float, default=1.0) parser.add_argument('--end-epsilon', type=float, default=0.1) parser.add_argument('--noisy-net-sigma', type=float, default=None) parser.add_argument('--evaluate', action='store_true', default=False, help="Run evaluation mode") parser.add_argument('--load', type=str, default=None, help="Load saved_model") parser.add_argument('--steps', type=int, default=4 * 10**6) parser.add_argument('--prioritized-replay', action='store_true') parser.add_argument('--replay-start-size', type=int, default=1000) parser.add_argument('--target-update-interval', type=int, default=5 * 10**2) parser.add_argument('--target-update-method', type=str, default='hard') parser.add_argument('--soft-update-tau', type=float, default=1e-2) parser.add_argument('--update-interval', type=int, default=1) parser.add_argument('--eval-n-runs', type=int, default=1) parser.add_argument('--eval-interval', type=int, default=1e4, help="After how many steps to evaluate the agent." "(-1 -> always)") parser.add_argument('--n-hidden-channels', type=int, default=20) parser.add_argument('--n-hidden-layers', type=int, default=20) parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--minibatch-size', type=int, default=None) parser.add_argument('--render-train', action='store_true') parser.add_argument('--render-eval', action='store_true') parser.add_argument('--reward-scale-factor', type=float, default=1) parser.add_argument('--time-step-limit', type=int, default=1e5) parser.add_argument('--outdir-time-suffix', choices=['empty', 'none', 'time'], default='empty', type=str.lower) parser.add_argument('--checkpoint_frequency', type=int, default=1e3, help="Nuber of steps to checkpoint after") parser.add_argument('--verbose', '-v', action='store_true', help='Use debug log-level') args = parser.parse_args() import logging logging.basicConfig( level=logging.INFO if not args.verbose else logging.DEBUG) # Set a random seed used in ChainerRL ALSO SETS NUMPY SEED! misc.set_random_seed(args.seed) if args.outdir and not args.load: outdir_suffix_dict = { 'none': '', 'empty': '', 'time': '%Y%m%dT%H%M%S.%f' } args.outdir = experiments.prepare_output_dir( args, args.outdir, argv=sys.argv, time_format=outdir_suffix_dict[args.outdir_time_suffix]) elif args.load: if args.load.endswith(os.path.sep): args.load = args.load[:-1] args.outdir = os.path.dirname(args.load) count = 0 fn = os.path.join(args.outdir.format(count), 'scores_{:>03d}') while os.path.exists(fn.format(count)): count += 1 os.rename(os.path.join(args.outdir, 'scores.txt'), fn.format(count)) if os.path.exists(os.path.join(args.outdir, 'best')): os.rename(os.path.join(args.outdir, 'best'), os.path.join(args.outdir, 'best_{:>03d}'.format(count))) logging.info('Output files are saved in {}'.format(args.outdir)) def clip_action_filter(a): return np.clip(a, action_space.low, action_space.high) def make_env(test): HOST = '' # The server's hostname or IP address PORT = 54321 # The port used by the server if test: # Just such that eval and train env don't have the same port PORT += 1 # TODO don't hardcode env params # TODO if we use this solution (i.e. write port to file and read it with FD) we would have to make sure that # outdir doesn't append time strings. Otherwise it will get hard to use on the cluster env = FDEnvSelHeur(host=HOST, port=PORT, num_heuristics=2, config_dir=args.outdir) # Use different random seeds for train and test envs env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if isinstance(env.action_space, spaces.Box): misc.env_modifiers.make_action_filtered(env, clip_action_filter) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if ((args.render_eval and test) or (args.render_train and not test)): env = chainerrl.wrappers.Render(env) return env env = make_env(test=False) # state = env.reset() # while True: # for x in [1,1,1,1,0,0,0,0]: # state, reward, done, _ = env.step(x) # print(x) # if done: # break timestep_limit = args.time_step_limit obs_space = env.observation_space obs_size = obs_space.low.size action_space = env.action_space if isinstance(action_space, spaces.Box): # Usefull if we want to control action_size = action_space.low.size # other continous parameters # Use NAF to apply DQN to continuous action spaces q_func = q_functions.FCQuadraticStateQFunction( obs_size, action_size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, action_space=action_space) # Use the Ornstein-Uhlenbeck process for exploration ou_sigma = (action_space.high - action_space.low) * 0.2 explorer = explorers.AdditiveOU(sigma=ou_sigma) else: n_actions = action_space.n q_func = q_functions.FCStateQFunctionWithDiscreteAction( obs_size, n_actions, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers) # q_func = FCDuelingDQN( # obs_size, n_actions) # Use epsilon-greedy for exploration explorer = explorers.LinearDecayEpsilonGreedy( args.start_epsilon, args.end_epsilon, args.final_exploration_steps, action_space.sample) if args.noisy_net_sigma is not None: links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() # Draw the computational graph and save it in the output directory. if not args.load: chainerrl.misc.draw_computational_graph( [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])], os.path.join(args.outdir, 'model')) opt = optimizers.Adam(eps=1e-2) logging.info('Optimizer: %s', str(opt)) opt.setup(q_func) opt.add_hook(GradientClipping(5)) rbuf_capacity = 5 * 10**5 if args.minibatch_size is None: args.minibatch_size = 32 # args.minibatch_size = 16 if args.prioritized_replay: betasteps = (args.steps - args.replay_start_size) \ // args.update_interval rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity, betasteps=betasteps) else: rbuf = replay_buffer.ReplayBuffer(rbuf_capacity) agent = DDQN( q_func, opt, rbuf, gamma=args.gamma, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, minibatch_size=args.minibatch_size, target_update_method=args.target_update_method, soft_update_tau=args.soft_update_tau, ) t_offset = 0 if args.load: # Continue training model or load for evaluation agent.load(args.load) rbuf.load(os.path.join(args.load, 'replay_buffer.pkl')) try: t_offset = int(os.path.basename(args.load).split('_')[0]) except TypeError: with open(os.path.join(args.load, 't.txt'), 'r') as fh: data = fh.readlines() t_offset = int(data[0]) except ValueError: t_offset = 0 eval_env = make_env(test=False) if args.evaluate: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: criterion = 'steps' # can be made an argument if we support any other form of checkpointing l = logging.getLogger('Checkpoint_Hook') def checkpoint(env, agent, step): if criterion == 'steps': if step % args.checkpoint_frequency == 0: save_agent_and_replay_buffer( agent, step, args.outdir, suffix='_chkpt', logger=l, chckptfrq=args.checkpoint_frequency) else: # TODO seems to checkpoint given wall_time we would have to modify the environment such that it tracks # time or number of episodes raise NotImplementedError hooks = [checkpoint] experiments.train_agent(agent=agent, env=env, steps=args.steps, outdir=args.outdir, step_hooks=hooks, step_offset=t_offset)
def main(args): import logging logging.basicConfig(level=logging.INFO, filename='log') if(type(args) is list): args=make_args(args) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu,)) if not os.path.exists(args.outdir): os.makedirs(args.outdir) print('Output files are saved in {}'.format(args.outdir)) def clip_action_filter(a): return np.clip(a, action_space.low, action_space.high) def make_env(test): env = gym.make(args.env) # Use different random seeds for train and test envs env_seed = 2 ** 32 - 1 - args.seed if test else args.seed env.seed(env_seed) env = chainerrl.wrappers.CastObservationToFloat32(env) if isinstance(env.action_space, spaces.Box): misc.env_modifiers.make_action_filtered(env, clip_action_filter) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if ((args.render_eval and test) or (args.render_train and not test)): env = chainerrl.wrappers.Render(env) return env env = make_env(test=False) timestep_limit = env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = env.observation_space obs_size = obs_space.low.size action_space = env.action_space if isinstance(action_space, spaces.Box): print("Use NAF to apply DQN to continuous action spaces") action_size = action_space.low.size # Use NAF to apply DQN to continuous action spaces q_func = q_functions.FCQuadraticStateQFunction( obs_size, action_size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, action_space=action_space) # Use the Ornstein-Uhlenbeck process for exploration ou_sigma = (action_space.high - action_space.low) * 0.2 explorer = explorers.AdditiveOU(sigma=ou_sigma) else: print("not continuous action spaces") n_actions = action_space.n q_func = q_functions.FCStateQFunctionWithDiscreteAction( obs_size, n_actions, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers) # Use epsilon-greedy for exploration explorer = explorers.LinearDecayEpsilonGreedy( args.start_epsilon, args.end_epsilon, args.final_exploration_steps, action_space.sample) if args.noisy_net_sigma is not None: links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() # Draw the computational graph and save it in the output directory. chainerrl.misc.draw_computational_graph( [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])], os.path.join(args.outdir, 'model')) opt = optimizers.Adam() opt.setup(q_func) rbuf_capacity = 5 * 10 ** 5 if args.minibatch_size is None: args.minibatch_size = 32 if args.prioritized_replay: betasteps = (args.steps - args.replay_start_size) \ // args.update_interval rbuf = replay_buffer.PrioritizedReplayBuffer( rbuf_capacity, betasteps=betasteps) else: rbuf = replay_buffer.ReplayBuffer(rbuf_capacity) agent = DoubleDQN(q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, minibatch_size=args.minibatch_size, target_update_method=args.target_update_method, soft_update_tau=args.soft_update_tau, ) if args.load_agent: agent.load(args.load_agent) eval_env = make_env(test=True) if (args.mode=='train'): experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, eval_env=eval_env, step_offset=args.step_offset, checkpoint_freq=args.checkpoint_freq, train_max_episode_len=timestep_limit, log_type=args.log_type ) elif (args.mode=='check'): from matplotlib import animation import matplotlib.pyplot as plt frames = [] for i in range(3): obs = env.reset() done = False R = 0 t = 0 while not done and t < 200: frames.append(env.render(mode = 'rgb_array')) action = agent.act(obs) obs, r, done, _ = env.step(action) R += r t += 1 print('test episode:', i, 'R:', R) agent.stop_episode() env.close() from IPython.display import HTML plt.figure(figsize=(frames[0].shape[1]/72.0, frames[0].shape[0]/72.0),dpi=72) patch = plt.imshow(frames[0]) plt.axis('off') def animate(i): patch.set_data(frames[i]) anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames),interval=50) anim.save(args.save_mp4) return anim
def chokoDQN(env, args=None): args = args or [] if (type(args) is list): args = make_args(args) obs_space = env.observation_space obs_size = obs_space.low.size * args.stack_k action_space = env.action_space if isinstance(action_space, spaces.Box): action_size = action_space.low.size q_func = q_functions.FCQuadraticStateQFunction( obs_size, action_size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, action_space=action_space) # Use the Ornstein-Uhlenbeck process for exploration ou_sigma = (action_space.high - action_space.low) * 0.2 explorer = explorers.AdditiveOU(sigma=ou_sigma) else: n_actions = action_space.n q_func = q_functions.FCStateQFunctionWithDiscreteAction( obs_size, n_actions, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers) # Use epsilon-greedy for exploration explorer = explorers.LinearDecayEpsilonGreedy( args.start_epsilon, args.end_epsilon, args.final_exploration_steps, action_space.sample) if args.noisy_net_sigma is not None: links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() opt = optimizers.Adam() opt.setup(q_func) rbuf_capacity = 5 * 10**5 if args.minibatch_size is None: args.minibatch_size = 32 if args.prioritized_replay: betasteps = (args.steps - args.replay_start_size) \ // args.update_interval rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity, betasteps=betasteps) else: rbuf = replay_buffer.ReplayBuffer(rbuf_capacity) agent = DQN( q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, minibatch_size=args.minibatch_size, target_update_method=args.target_update_method, soft_update_tau=args.soft_update_tau, ) return agent
def main(args): import logging logging.basicConfig(level=logging.INFO, filename='log') if(type(args) is list): args=make_args(args) if not os.path.exists(args.outdir): os.makedirs(args.outdir) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu,)) print('Output files are saved in {}'.format(args.outdir)) def clip_action_filter(a): return np.clip(a, action_space.low, action_space.high) def make_env(test): env = gym.make(args.env) # Use different random seeds for train and test envs env_seed = 2 ** 32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = chainerrl.wrappers.Monitor(env, args.outdir) if isinstance(env.action_space, spaces.Box): misc.env_modifiers.make_action_filtered(env, clip_action_filter) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if ((args.render_eval and test) or (args.render_train and not test)): env = chainerrl.wrappers.Render(env) return env env = make_env(test=False) timestep_limit = env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = env.observation_space obs_size = obs_space.low.size action_space = env.action_space if isinstance(action_space, spaces.Box): print("Use NAF to apply DQN to continuous action spaces") action_size = action_space.low.size # Use NAF to apply DQN to continuous action spaces q_func = q_functions.FCQuadraticStateQFunction( obs_size, action_size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, action_space=action_space) # Use the Ornstein-Uhlenbeck process for exploration ou_sigma = (action_space.high - action_space.low) * 0.2 explorer = explorers.AdditiveOU(sigma=ou_sigma) else: print("not continuous action spaces") n_actions = action_space.n q_func = q_functions.FCStateQFunctionWithDiscreteAction( obs_size, n_actions, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers) # Use epsilon-greedy for exploration explorer = explorers.LinearDecayEpsilonGreedy( args.start_epsilon, args.end_epsilon, args.final_exploration_steps, action_space.sample) if args.noisy_net_sigma is not None: links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() # Draw the computational graph and save it in the output directory. chainerrl.misc.draw_computational_graph( [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])], os.path.join(args.outdir, 'model')) opt = optimizers.Adam() opt.setup(q_func) rbuf_capacity = 5 * 10 ** 5 if args.minibatch_size is None: args.minibatch_size = 32 if args.prioritized_replay: betasteps = (args.steps - args.replay_start_size) \ // args.update_interval rbuf = replay_buffer.PrioritizedReplayBuffer( rbuf_capacity, betasteps=betasteps) else: rbuf = replay_buffer.ReplayBuffer(rbuf_capacity) agent = DQN(q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, minibatch_size=args.minibatch_size, target_update_method=args.target_update_method, soft_update_tau=args.soft_update_tau, ) if args.load_agent: agent.load(args.load_agent) eval_env = make_env(test=True) if (args.mode=='train'): experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, eval_env=eval_env, step_offset=args.step_offset, checkpoint_freq=args.checkpoint_freq, train_max_episode_len=args.max_episode_len, log_type=args.log_type ) elif (args.mode=='check'): return tools.make_video.check(env=env,agent=agent,save_mp4=args.save_mp4) elif (args.mode=='growth'): return tools.make_video.growth(env=env,agent=agent,outdir=args.outdir,max_num=args.max_episode_len,save_mp4=args.save_mp4)