def __init__(self, registry, env_creator, config, logdir):
    env = env_creator(config["env_config"])
    env = wrap_dqn(registry, env, config["model"])
    self.env = env
    self.config = config

    tf_config = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=tf_config)
    self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

    # Create the schedule for exploration starting from 1.
    self.exploration = LinearSchedule(
        schedule_timesteps=int(
            config["exploration_fraction"] *
            config["schedule_max_timesteps"]),
        initial_p=1.0,
        final_p=config["exploration_final_eps"])

    # Initialize the parameters and copy them to the target network.
    self.sess.run(tf.global_variables_initializer())
    self.dqn_graph.update_target(self.sess)
    self.global_timestep = 0
    self.local_timestep = 0

    # Note that this encompasses both the Q and target network
    self.variables = ray.experimental.TensorFlowVariables(
        tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

    self.episode_rewards = [0.0]
    self.episode_lengths = [0.0]
    self.saved_mean_reward = None
    self.obs = self.env.reset()
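# For reference, a minimal sketch of the annealing behavior assumed of
# LinearSchedule above: the value moves linearly from `initial_p` to
# `final_p` over `schedule_timesteps` steps, then stays at `final_p`.
# This is an illustrative re-implementation, not the library class.
class LinearScheduleSketch(object):
    def __init__(self, schedule_timesteps, initial_p, final_p):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule completed, clipped to [0, 1].
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# E.g., schedule_timesteps=1000, initial_p=1.0, final_p=0.02 gives
# value(0) == 1.0, value(500) == 0.51, value(1000) == 0.02.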
def __init__(self, env_creator, config, logdir):
    env = env_creator()
    env = wrap_dqn(env, config["model"])
    self.env = env
    self.config = config

    tf_config = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=tf_config)
    self.dqn_graph = models.DQNGraph(env, config, logdir)

    # Create the replay buffer
    if config["prioritized_replay"]:
        self.replay_buffer = PrioritizedReplayBuffer(
            config["buffer_size"],
            alpha=config["prioritized_replay_alpha"])
        prioritized_replay_beta_iters = \
            config["prioritized_replay_beta_iters"]
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = \
                config["schedule_max_timesteps"]
        self.beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=config["prioritized_replay_beta0"],
            final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(config["buffer_size"])
        self.beta_schedule = None

    # Create the schedule for exploration starting from 1.
    self.exploration = LinearSchedule(
        schedule_timesteps=int(
            config["exploration_fraction"] *
            config["schedule_max_timesteps"]),
        initial_p=1.0,
        final_p=config["exploration_final_eps"])

    # Initialize the parameters and copy them to the target network.
    self.sess.run(tf.global_variables_initializer())
    self.dqn_graph.update_target(self.sess)

    self.set_weights_time = RunningStat(())
    self.sample_time = RunningStat(())
    self.grad_time = RunningStat(())

    # Note that workers don't need target vars to be synced
    self.variables = ray.experimental.TensorFlowVariables(
        tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

    self.episode_rewards = [0.0]
    self.episode_lengths = [0.0]
    self.saved_mean_reward = None
    self.obs = self.env.reset()
    self.file_writer = tf.summary.FileWriter(logdir, self.sess.graph)
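# A sketch of how the beta schedule above is typically consumed when
# sampling: beta (the importance-sampling correction) anneals toward
# 1.0 as training progresses. The sample()/tuple layout follows the
# common prioritized-replay API and is an assumption for illustration.
def sample_from_buffer_sketch(replay_buffer, beta_schedule, batch_size, t):
    if beta_schedule is not None:
        beta = beta_schedule.value(t)
        (obses_t, actions, rewards, obses_tp1, dones,
         weights, batch_indexes) = replay_buffer.sample(batch_size, beta)
    else:
        obses_t, actions, rewards, obses_tp1, dones = \
            replay_buffer.sample(batch_size)
        weights, batch_indexes = np.ones_like(rewards), None
    return (obses_t, actions, rewards, obses_tp1, dones,
            weights, batch_indexes)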
def __init__(self, registry, env_creator, config, logdir, worker_index): env = env_creator(config["env_config"]) env = wrap_dqn(registry, env, config["model"], config["random_starts"]) self.env = env self.config = config # when env.action_space is of Box type, e.g., Pendulum-v0 # action_space.low is [-2.0], high is [2.0] # take action by calling, e.g., env.step([3.5]) if not isinstance(env.action_space, Box): raise UnsupportedSpaceException( "Action space {} is not supported for DDPG.".format( env.action_space)) tf_config = tf.ConfigProto(**config["tf_session_args"]) self.sess = tf.Session(config=tf_config) self.ddpg_graph = models.DDPGGraph(registry, env, config, logdir) # Use either a different `eps` per worker, or a linear schedule. if config["per_worker_exploration"]: assert config["num_workers"] > 1, "This requires multiple workers" self.exploration = ConstantSchedule( config["noise_scale"] * 0.4 ** (1 + worker_index / float(config["num_workers"] - 1) * 7)) else: self.exploration = LinearSchedule( schedule_timesteps=int(config["exploration_fraction"] * config["schedule_max_timesteps"]), initial_p=config["noise_scale"] * 1.0, final_p=config["noise_scale"] * config["exploration_final_eps"]) # Initialize the parameters and copy them to the target network. self.sess.run(tf.global_variables_initializer()) # hard instead of soft self.ddpg_graph.update_target(self.sess, 1.0) self.global_timestep = 0 self.local_timestep = 0 # Note that this encompasses both the policy and Q-value networks and # their corresponding target networks self.variables = ray.experimental.TensorFlowVariables( tf.group(self.ddpg_graph.q_tp0, self.ddpg_graph.q_tp1), self.sess) self.episode_rewards = [0.0] self.episode_lengths = [0.0] self.saved_mean_reward = None self.obs = self.env.reset()
def __init__(self, registry, env_creator, config, logdir, worker_index): env = env_creator(config["env_config"]) env = wrap_dqn(registry, env, config["model"], config["random_starts"]) self.env = env self.config = config if not isinstance(env.action_space, Box): raise UnsupportedSpaceException( "Action space {} is not supported for DDPG.".format( env.action_space)) tf_config = tf.ConfigProto(**config["tf_session_args"]) self.sess = tf.Session(config=tf_config) self.ddpg_graph = models.DDPGGraph(registry, env, config, logdir) # Initialize the parameters and copy them to the target network. self.sess.run(tf.global_variables_initializer()) self.ddpg_graph.copy_target(self.sess) self.global_timestep = 0 self.local_timestep = 0 nb_actions = env.action_space.shape[-1] stddev = config["exploration_noise"] self.exploration_noise = OUNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) self.action_range = (-1., 1.) # Note that this encompasses both the Q and target network self.variables = ray.experimental.TensorFlowVariables( tf.group(self.ddpg_graph.critic_loss, self.ddpg_graph.action_loss), self.sess) self.max_action = env.action_space.high self.episode_rewards = [0.0] self.episode_lengths = [0.0] self.saved_mean_reward = None # Technically not needed when not remote self.obs_filter = get_filter(config["observation_filter"], env.observation_space.shape) self.rew_filter = get_filter(config["reward_filter"], ()) self.filters = { "obs_filter": self.obs_filter, "rew_filter": self.rew_filter } self.obs = self.env.reset()
def __init__(self, registry, env_creator, config, logdir, worker_index): env = env_creator(config["env_config"]) env = wrap_dqn(registry, env, config["model"], config["random_starts"]) self.env = env self.config = config if not isinstance(env.action_space, Discrete): raise UnsupportedSpaceException( "Action space {} is not supported for DQN.".format( env.action_space)) tf_config = tf.ConfigProto(**config["tf_session_args"]) self.sess = tf.Session(config=tf_config) self.dqn_graph = models.DQNGraph(registry, env, config, logdir) # Use either a different `eps` per worker, or a linear schedule. if config["per_worker_exploration"]: assert config["num_workers"] > 1, "This requires multiple workers" self.exploration = ConstantSchedule( 0.4 ** ( 1 + worker_index / float(config["num_workers"] - 1) * 7)) else: self.exploration = LinearSchedule( schedule_timesteps=int( config["exploration_fraction"] * config["schedule_max_timesteps"]), initial_p=1.0, final_p=config["exploration_final_eps"]) # Initialize the parameters and copy them to the target network. self.sess.run(tf.global_variables_initializer()) self.dqn_graph.update_target(self.sess) self.global_timestep = 0 self.local_timestep = 0 # Note that this encompasses both the Q and target network self.variables = ray.experimental.TensorFlowVariables( tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess) self.episode_rewards = [0.0] self.episode_lengths = [0.0] self.saved_mean_reward = None self.obs = self.env.reset()
if not args.env: if not args.config.get("env"): parser.error("the following arguments are required: --env") args.env = args.config.get("env") ray.init() cls = get_agent_class(args.run) agent = cls(env=args.env, config=args.config) agent.restore(args.checkpoint) num_steps = int(args.steps) if args.run == "DQN": env = gym.make(args.env) env = wrap_dqn(env, args.config.get("model", {})) else: env = ModelCatalog.get_preprocessor_as_wrapper(gym.make(args.env)) if args.out is not None: rollouts = [] steps = 0 while steps < (num_steps or steps + 1): if args.out is not None: rollout = [] state = env.reset() done = False reward_total = 0.0 while not done and steps < (num_steps or steps + 1): action = agent.compute_action(state) next_state, reward, done, _ = env.step(action) reward_total += reward
if not args.env: if not args.config.get("env"): parser.error("the following arguments are required: --env") args.env = args.config.get("env") ray.init() cls = get_agent_class(args.run) agent = cls(env=args.env, config=args.config) agent.restore(args.checkpoint) num_steps = int(args.steps) if args.run == "DQN": env = gym.make(args.env) env = wrap_dqn(get_registry(), env, args.config.get("model", {})) else: env = ModelCatalog.get_preprocessor_as_wrapper(get_registry(), gym.make(args.env)) if args.out is not None: rollouts = [] steps = 0 while steps < (num_steps or steps + 1): if args.out is not None: rollout = [] state = env.reset() done = False reward_total = 0.0 while not done and steps < (num_steps or steps + 1): action = agent.compute_action(state) next_state, reward, done, _ = env.step(action)