def setup_model(self): with SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DeepQ cannot output a gym.spaces.Box action space." assert issubclass(self.policy, DeepQPolicy), "Error: the input policy for the DeepQ model must be " \ "an instance of DeepQPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(graph=self.graph) optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate) self.act, self._train_step, self.update_target, _ = deepq.build_train( q_func=self.policy, ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess) self.params = find_trainable_variables("deepq") # Initialize the parameters and copy them to the target network. tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.summary = tf.summary.merge_all()
def setup_model(self): with SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrap in functool.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate) self.act, self._train_step, self.update_target, self.step_model = deepq.build_train( q_func=partial(self.policy, **self.policy_kwargs), ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess, full_tensorboard_log=self.full_tensorboard_log, double_q=self.double_q) self.proba_step = self.step_model.proba_step self.params = tf_util.get_trainable_vars("deepq") # Initialize the parameters and copy them to the target network. tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.summary = tf.summary.merge_all()
def setup_model(self):
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.sess = tf_util.make_session(graph=self.graph)

        # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/deepq/build_graph.py
        self.act, self.train_step, self.update_target, self.step_model = deepq.build_train(
            q_func=self.policy,
            ob_space=self.env.observation_space,
            ac_space=self.env.action_space,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.learning_rate),
            gamma=self.gamma,
            # grad_norm_clipping=1,
            sess=self.sess)
        self.params = find_trainable_variables('deepq')

        tf_util.initialize(self.sess)
        self.update_target(sess=self.sess)

        self.summary = tf.summary.merge_all()
def setup_model(self): with SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrap in functool.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(graph=self.graph) optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate) # optimizer = tf.contrib.opt.NadamOptimizer(learning_rate=self.learning_rate) # optimizer = tf.train.MomentumOptimizer(learning_rate=1e-3, momentum=0.9, use_nesterov=True) self.act, self._train_step, self.update_target, self.step_model = deepq.build_train( q_func=self.policy, ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess) self.proba_step = self.step_model.proba_step self.params = find_trainable_variables("deepq") # Initialize the parameters and copy them to the target network. tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.summary = tf.summary.merge_all()
def setup_model(self):
    with SetVerbosity(self.verbose):
        assert isinstance(self.action_space, gym.spaces.Discrete), \
            "Error: DeepQ cannot output a {} action space, only spaces.Discrete is supported."\
            .format(self.action_space)

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(graph=self.graph)

            # Capture the observation space outside the closure so that the env object
            # is not serialized by cloudpickle when serializing make_obs_ph
            observation_space = self.observation_space

            def make_obs_ph(name):
                """
                makes the observation placeholder

                :param name: (str) the placeholder name
                :return: (TensorFlow Tensor) the placeholder
                """
                return ObservationInput(observation_space, name=name)

            self.act, self._train_step, self.update_target, _ = deepq.build_train(
                make_obs_ph=make_obs_ph,
                q_func=self.policy,
                num_actions=self.action_space.n,
                optimizer=tf.train.AdamOptimizer(learning_rate=self.learning_rate),
                gamma=self.gamma,
                grad_norm_clipping=10,
                param_noise=self.param_noise)
            self.params = find_trainable_variables("deepq")

            # Initialize the parameters and copy them to the target network.
            tf_util.initialize(self.sess)
            self.update_target(sess=self.sess)
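A minimal sketch of why make_obs_ph closes over a local observation_space rather than self: cloudpickle serializes every object a closure references, so capturing self (and through it the env) would drag the whole environment into the pickled act function. Imports assume OpenAI-baselines-style modules; the env below is illustrative only.

import cloudpickle
import gym
from baselines.deepq.utils import ObservationInput

env = gym.make("CartPole-v0")
observation_space = env.observation_space  # capture only the space, not the env

def make_obs_ph(name):
    return ObservationInput(observation_space, name=name)

blob = cloudpickle.dumps(make_obs_ph)  # stays small; no env inside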
def main(args):
    """
    Train a DQN agent on the CartPole env

    :param args: (Parsed Arguments) the input arguments
    """
    with tf_utils.make_session(8) as sess:
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, _ = deepq.build_train(
            q_func=CustomPolicy,
            ob_space=env.observation_space,
            ac_space=env.action_space,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            sess=sess)
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

        # Initialize the parameters and copy them to the target network.
        tf_utils.initialize()
        update_target()

        episode_rewards = [0.0]
        obs = env.reset()
        for step in itertools.count():
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(step))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)

            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            is_solved = step > 100 and mean_100ep_reward >= 200

            if args.no_render and step > args.max_timesteps:
                break

            if is_solved:
                if args.no_render:
                    break
                # Show off the result
                env.render()
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if step > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
                # Update target network periodically.
                if step % 1000 == 0:
                    update_target()

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", step)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(step)))
                logger.dump_tabular()
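main() reads only args.no_render and args.max_timesteps from its argument; a plausible entry point (the flag spellings are assumptions, the attribute names are not):

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train a DQN agent on CartPole")
    # argparse maps --max-timesteps to args.max_timesteps, which main() reads
    parser.add_argument("--max-timesteps", type=int, default=int(1e5),
                        help="hard step limit when running without rendering")
    parser.add_argument("--no-render", action="store_true", default=False,
                        help="skip rendering and stop once solved or out of steps")
    main(parser.parse_args())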
def exec(self):
    """
    Train a DQN agent on the CartPole env
    """
    with tf_utils.make_session(8) as sess:
        # Use the provided environment
        env = self.env
        # Create all the functions necessary to train the model
        act, train, update_target, _ = deepq.build_train(
            q_func=CustomPolicy,
            ob_space=env.observation_space,
            ac_space=env.action_space,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            sess=sess,
            double_q=False,
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Exploration epsilon is managed by linear_decay / rb_decay_epsilon below,
        # not by a fixed schedule object.
        solved_yet = False
        is_solved = False
        steps_so_far = 0

        # Initialize the parameters and copy them to the target network.
        tf_utils.initialize()
        update_target()

        episode_rewards = [0.0]
        obs = env.reset()
        for i in trange(self.episode_count):
            step = 0
            done = False
            while not done:
                step += 1
                steps_so_far += 1
                if not self.mode_rbed:
                    self.linear_decay(step=steps_so_far)
                # Take action and update exploration to the newest value
                action = act(obs[None], update_eps=self.epsilon)[0]
                new_obs, rew, done, _ = env.step(action)
                # Store transition in the replay buffer.
                replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                episode_rewards[-1] += rew
                if done:
                    obs = env.reset()
                    last_reward = episode_rewards[-1]
                    if self.mode_rbed:
                        self.rb_decay_epsilon(current_reward=last_reward)

                    if len(episode_rewards[-101:-1]) == 0:
                        mean_100ep_reward = sum(episode_rewards) / 100
                    else:
                        mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                    # is_solved = step > 100 and mean_100ep_reward >= self.env_target

                    # Log epsilon, episode reward, and the 100-episode average
                    self.ex.log_scalar(self.VAL_EPSILON, self.epsilon)
                    self.ex.log_scalar(self.VAL_REWARD, last_reward)
                    self.ex.log_scalar(self.VAL_AVG100, mean_100ep_reward)

                    # Log the episode at which the env was first solved
                    if mean_100ep_reward >= self.env_target and not solved_yet:
                        solved_yet = True
                        self.ex.log_scalar(self.VAL_SOLVEDAT, i)

                    # For the next episode
                    episode_rewards.append(0)

                # Do not train further once solved, keeping consistent with the original scheme
                if not solved_yet:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if steps_so_far > 1000:
                        obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                        train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
                    # Update target network periodically.
                    if steps_so_far % 1000 == 0:
                        update_target()
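exec() calls self.linear_decay and self.rb_decay_epsilon without defining them; the sketch below is an assumption about their shape, not the author's code. linear_decay mirrors the LinearSchedule of the earlier snippet, while reward-based epsilon decay (RBED) lowers epsilon only when the agent clears a rising reward threshold. Every attribute name beyond self.epsilon is hypothetical.

def linear_decay(self, step, schedule_timesteps=10000, initial_p=1.0, final_p=0.02):
    # Anneal epsilon linearly from initial_p to final_p over schedule_timesteps.
    fraction = min(float(step) / schedule_timesteps, 1.0)
    self.epsilon = initial_p + fraction * (final_p - initial_p)

def rb_decay_epsilon(self, current_reward):
    # RBED: decay epsilon only when an episode reward beats the current
    # threshold, then raise the threshold (attribute names are assumptions).
    if current_reward >= self.reward_threshold:
        self.epsilon = max(self.epsilon - self.epsilon_step, self.min_epsilon)
        self.reward_threshold += self.threshold_step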
def setup_model(self): with SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrap in functool.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(graph=self.graph) optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) self.act, self._train_step, self.update_target, self.step_model = deepq.build_train( q_func=self.policy, ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess ) self.proba_step = self.step_model.proba_step self.params = find_trainable_variables("deepq") # Initialize the parameters and copy them to the target network. tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.summary = tf.summary.merge_all() # TODO metric self.model = tf.keras.models.Sequential([ tf.keras.layers.Flatten(input_shape=self.observation_space.shape), tf.keras.layers.Dense(256, activation=tf.nn.relu), tf.keras.layers.Dense(1, activation=tf.nn.relu) ]) self.model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error']) def mtr_train_naive(obses_beg, obses_step, obses_fin, dist): data = np.concatenate([obses_beg, obses_fin], axis=1) self.model.fit(x=data, y=dist, verbose=0) def mtr_train_step(obses_beg, obses_step, obses_fin, dist): data_step = np.concatenate([obses_step, obses_fin], axis=1) pred = self.model.predict(data_step, verbose=0).flatten() + 1 data = np.concatenate([obses_beg, obses_fin], axis=1) y = np.minimum(dist, pred*self.mtr_weight + dist*(1-self.mtr_weight)) # print(obses_beg[0], obses_step[0], obses_fin[0], dist[0], pred[0], y[0]) self.model.fit(x=data, y=y, verbose=0) def mtr_predict(data): return self.model.predict(data, verbose=0) self.mtr_train = mtr_train_step self.mtr_predict = mtr_predict