def getRewards(self):
    rews = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()
    ret = float('nan')
    if len(rews) > 100:
        ret = np.mean(rews[-100:])
    return ret
def explore(self):
    print('Process: %d has PID: %d' % (self.procId, os.getpid()))
    #
    # For the first run, just set up a random action.
    self.lastObs = self.env.reset()
    obs, reward, done, info = self.env.step(0)
    self.lastObs = obs
    self.retFrame.copy_(torch.from_numpy(obs))
    self.reward.copy_(torch.from_numpy(np.atleast_1d(reward)))
    self.done.copy_(torch.from_numpy(np.atleast_1d(done).astype(np.uint8)))
    self.action.copy_(torch.from_numpy(np.atleast_1d(0)))
    self.meanRewards.copy_(torch.from_numpy(np.atleast_1d(float('nan'))))
    self.nEps.copy_(torch.from_numpy(np.atleast_1d(0)))
    #
    # Notify that shared memory is ready.
    self.barrier.wait()
    minEp = 100 // self.cfg.numEnv
    #
    # Loop and do work.
    while True:
        #
        # Wait for actions.
        step = self.com.recv()
        action = self.actionVec.clone().numpy().astype(np.int64)[self.procId]
        obs, reward, done, info = self.env.step(action)
        #
        # Reset at episode boundaries.
        if done:
            obs = self.env.reset()
        #
        # Compute the running mean reward over the last minEp episodes.
        lastRew = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()
        mean_episode_reward = float('nan')
        if len(lastRew) > minEp:
            mean_episode_reward = np.mean(lastRew[-minEp:])
        #
        # Store effects in shared memory.
        self.lastObs = obs
        self.retFrame.copy_(torch.from_numpy(self.lastObs))
        self.reward.copy_(torch.from_numpy(np.atleast_1d(reward)))
        self.done.copy_(torch.from_numpy(np.atleast_1d(done).astype(np.uint8)))
        self.action.copy_(torch.from_numpy(np.atleast_1d(action)))
        self.meanRewards.copy_(torch.from_numpy(np.atleast_1d(mean_episode_reward)))
        self.nEps.copy_(torch.from_numpy(np.atleast_1d(len(lastRew))))
        #
        # Notify that shared memory is ready.
        self.barrier.wait()
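# A stripped-down sketch (a generic pattern, not this project's actual API) of
# the shared-tensor + barrier handshake that explore() relies on: the worker
# writes into a pre-allocated shared tensor, then both sides meet at the
# barrier before the consumer reads. Assumes a POSIX fork start method; the
# names _worker and _example_shared_memory_handshake are illustrative only.
import torch.multiprocessing as mp

def _worker(frame, barrier):
    # Worker fills the pre-allocated shared tensor, then signals readiness.
    frame.copy_(torch.ones_like(frame))
    barrier.wait()

def _example_shared_memory_handshake():
    frame = torch.zeros(4)
    frame.share_memory_()  # back the tensor with shared memory
    barrier = mp.Barrier(2)
    p = mp.Process(target=_worker, args=(frame, barrier))
    p.start()
    barrier.wait()  # explore() waits the same way after each step
    assert frame.sum().item() == 4.0
    p.join()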
def log_progress(self):
    episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()
    if len(episode_rewards) > 0:
        self.mean_episode_reward = np.mean(episode_rewards[-100:])
    if len(episode_rewards) > 100:
        self.best_mean_episode_reward = max(self.best_mean_episode_reward,
                                            self.mean_episode_reward)
    if self.t % self.log_every_n_steps == 0 and self.model_initialized:
        logz.log_tabular("TimeStep", self.t)
        logz.log_tabular("MeanReturn", self.mean_episode_reward)
        logz.log_tabular("BestMeanReturn", max(self.best_mean_episode_reward,
                                               self.mean_episode_reward))
        logz.log_tabular("Episodes", len(episode_rewards))
        logz.log_tabular("Exploration", self.exploration.value(self.t))
        logz.log_tabular("LearningRate", self.optimizer_spec.lr_lambda(self.t))
        logz.log_tabular("Time", (time.time() - self.start_time) / 60.)
        logz.dump_tabular()
        logz.save_pytorch_model(self.q_net)
def stopping_criterion(env, t):
    # Note that here t is the number of steps of the wrapped env,
    # which is different from the number of steps in the underlying env.
    return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps
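# A minimal usage sketch for a stopping criterion like the one above, mirroring
# the training loops elsewhere in this file. It assumes env is wrapped in a gym
# Monitor (so get_wrapper_by_name resolves) and that num_timesteps is bound in
# the enclosing scope, as it is above; run_until_done is a hypothetical name.
def run_until_done(env, stopping_criterion):
    last_obs = env.reset()
    for t in itertools.count():
        if stopping_criterion is not None and stopping_criterion(env, t):
            break
        last_obs, reward, done, info = env.step(env.action_space.sample())
        if done:
            last_obs = env.reset()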
def learn(env,
          q_func,
          optimizer_spec,
          session,
          exploration=dqn_utils.LinearSchedule(1000000, 0.1),
          stopping_criterion=None,
          replay_buffer_size=1000000,
          batch_size=32,
          gamma=0.99,
          learning_starts=50000,
          learning_freq=4,
          frame_history_len=4,
          target_update_freq=10000,
          grad_norm_clipping=10):
    """Run the Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            img_in: tf.Tensor
                tensorflow tensor representing the input image
            num_actions: int
                number of actions
            scope: str
                scope in which all the model related variables
                should be created
            reuse: bool
                whether previously created variables should be reused.
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate
        schedule for the optimizer
    session: tf.Session
        tensorflow session to use.
    exploration: rl_algs.deepq.utils.schedules.Schedule
        schedule for probability of choosing random action.
    stopping_criterion: (env, t) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    grad_norm_clipping: float or None
        If not None gradients' norms are clipped to this value.
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_shape = env.observation_space.shape
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_shape = (img_h, img_w, frame_history_len * img_c)
    num_actions = env.action_space.n

    # set up placeholders
    # placeholder for current observation (or state)
    obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    # placeholder for current action
    act_t_ph = tf.placeholder(tf.int32, [None])
    # placeholder for current reward
    rew_t_ph = tf.placeholder(tf.float32, [None])
    # placeholder for next observation (or state)
    obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    # placeholder for end of episode mask
    # this value is 1 if the next state corresponds to the end of an episode,
    # in which case there is no Q-value at the next state; at the end of an
    # episode, only the current state reward contributes to the target, not
    # the next state Q-value (i.e. target is just rew_t_ph, not
    # rew_t_ph + gamma * q_tp1)
    done_mask_ph = tf.placeholder(tf.float32, [None])

    # casting to float on GPU ensures lower data transfer times.
    obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0
    obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0

    # Here, you should fill in your own code to compute the Bellman error.
    # This requires evaluating the current and next Q-values and constructing
    # the corresponding error.
    # TensorFlow will differentiate this error for you, you just need to pass
    # it to the optimizer. See assignment text for details.
    # Your code should produce one scalar-valued tensor: total_error
    # This will be passed to the optimizer in the provided code below.
    # Your code should also produce two collections of variables:
    #     q_func_vars
    #     target_q_func_vars
    # These should hold all of the variables of the Q-function network and
    # target network, respectively. A convenient way to get these is to make
    # use of TF's "scope" feature. For example, you can create your Q-function
    # network with the scope "q_func" like this:
    #     <something> = q_func(obs_t_float, num_actions, scope="q_func", reuse=False)
    # And then you can obtain the variables like this:
    #     q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func')
    # Older versions of TensorFlow may require using "VARIABLES" instead of
    # "GLOBAL_VARIABLES".
    ######
    # YOUR CODE HERE
    ######
    Qfunc = q_func(obs_t_float, num_actions, scope='Qfunc')
    q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Qfunc')
    Qtarget = q_func(obs_tp1_float, num_actions, scope='Qtarget')
    target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='Qtarget')

    # total_error = (y - Q)^2 where y = r + gamma * max_a' Qtarget(s_{t+1}, a').
    # The max must be taken per batch element over the action axis.
    q_tp1 = tf.reduce_max(Qtarget, axis=1)
    y = rew_t_ph + (1. - done_mask_ph) * (gamma * q_tp1)
    # Select Q(s_t, a_t) for the actions actually taken via a one-hot mask.
    actions_onehot = tf.one_hot(act_t_ph, num_actions, dtype=tf.float32)
    Qfunc_action_t = tf.reduce_sum(tf.multiply(Qfunc, actions_onehot), axis=1)
    total_error = tf.reduce_sum(tf.square(y - Qfunc_action_t))
    # ---------------

    # construct optimization op (with gradient clipping)
    learning_rate = tf.placeholder(tf.float32, (), name="learning_rate")
    optimizer = optimizer_spec.constructor(learning_rate=learning_rate,
                                           **optimizer_spec.kwargs)
    train_fn = minimize_and_clip(optimizer,
                                 total_error,
                                 var_list=q_func_vars,
                                 clip_val=grad_norm_clipping)

    # update_target_fn will be called periodically to copy the Q network to
    # the target Q network
    update_target_fn = []
    for var, var_target in zip(
            sorted(q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_fn.append(var_target.assign(var))
    update_target_fn = tf.group(*update_target_fn)

    # construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)
    print('## replay_buffer: size={}, frame_history_len={}'.format(
        replay_buffer_size, frame_history_len))

    ###############
    # RUN ENV     #
    ###############
    model_initialized = False
    num_param_updates = 0
    mean_episode_reward = float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    episode_count = 1

    # itertools.count(n) generates an infinite iterator starting from n.
    for t in itertools.count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env, t):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition. Specifically, last_obs must point to the new latest
        # observation.
        # Useful functions you'll need to call:
        #     obs, reward, done, info = env.step(action)
        #         this steps the environment forward one step
        #     obs = env.reset()
        #         this resets the environment if you reached an episode
        #         boundary. Don't forget to call env.reset() to get a new
        #         observation if done is true!!
        # Note that you cannot use "last_obs" directly as input into your
        # network, since it needs to be processed to include context from
        # previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation that
        # you pushed into the buffer and compute the corresponding input that
        # should be given to a Q network by appending some previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model may
        # not yet have been initialized (but of course, the first step might
        # as well be random, since you haven't trained your net...)
        #####
        # YOUR CODE HERE
        #####
        last_obs_indx = replay_buffer.store_frame(last_obs)
        p_random = exploration.value(t)  # sample exploration probability
        if np.random.rand() < p_random \
                or not model_initialized \
                or not replay_buffer.can_sample(batch_size) \
                or t <= learning_starts:
            action = env.action_space.sample()  # random action
        else:
            # greedy action from the current Q network
            otph = replay_buffer.encode_recent_observation()[np.newaxis, ...]
            q_values = session.run(Qfunc, feed_dict={obs_t_ph: otph})
            action = np.argmax(q_values)
        obstp1, reward, done, info = env.step(action)
        replay_buffer.store_effect(last_obs_indx, action, reward, done)
        episode_count += 1
        if done:
            obstp1 = env.reset()
            print('## done {} with reward={}'.format(episode_count, reward))
            episode_count = 0
        last_obs = obstp1

        # at this point, the environment should have been advanced one step
        # (and reset if done was true), and last_obs should point to the new
        # latest observation

        ### 3. Perform experience replay and train the network.
        # note that this is only done if the replay buffer contains enough
        # samples for us to learn something useful -- until then, the model
        # will not be initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four
            # steps:
            # 3.a:
            # use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you
            # sample should consist of current observations, current actions,
            # rewards, next observations, and done indicator).
            bobs, baction, brew, bnobs, bdone = replay_buffer.sample(batch_size)
            # 3.b:
            # initialize the model if it has not been initialized yet; to do
            # that, call
            #     initialize_interdependent_variables(session, tf.global_variables(), {
            #         obs_t_ph: obs_t_batch,
            #         obs_tp1_ph: obs_tp1_batch,
            #     })
            # where obs_t_batch and obs_tp1_batch are the batches of
            # observations at the current and next time step. The boolean
            # variable model_initialized indicates whether or not the model
            # has been initialized. Remember that you have to update the
            # target network too (see 3.d)!
            # defined in dqn_utils.py
            if not model_initialized:
                initialize_interdependent_variables(session,
                                                    tf.global_variables(), {
                                                        obs_t_ph: bobs,
                                                        obs_tp1_ph: bnobs
                                                    })
                session.run(update_target_fn)
                model_initialized = True
                print(' 3.b initialize_interdependent_variables().')
            # 3.c:
            # train the model. To do this, you'll need to use the train_fn and
            # total_error ops that were created earlier: total_error is what
            # you created to compute the total Bellman error in a batch, and
            # train_fn will actually perform a gradient step and update the
            # network parameters to reduce total_error. When calling
            # session.run on these you'll need to populate the following
            # placeholders:
            #     obs_t_ph
            #     act_t_ph
            #     rew_t_ph
            #     obs_tp1_ph
            #     done_mask_ph (this is needed for computing total_error)
            #     learning_rate -- you can get this from
            #         optimizer_spec.lr_schedule.value(t) (this is needed by
            #         the optimizer to choose the learning rate)
            lr = optimizer_spec.lr_schedule.value(t)
            feed_dict = {
                obs_t_ph: bobs,
                act_t_ph: baction,
                rew_t_ph: brew,
                obs_tp1_ph: bnobs,
                done_mask_ph: bdone,
                learning_rate: lr
            }
            session.run(train_fn, feed_dict=feed_dict)
            # 3.d:
            # periodically update the target network by calling
            #     session.run(update_target_fn)
            # you should update every target_update_freq steps, and you may
            # find the variable num_param_updates useful for this (it was
            # initialized to 0)
            #####
            # YOUR CODE HERE
            #####
            num_param_updates += 1
            if num_param_updates % target_update_freq == 0:
                session.run(update_target_fn)
                print('## Qtarget updated (after {} parameter updates).'.format(
                    num_param_updates))

        ### 4. Log progress
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)
        if t % LOG_EVERY_N_STEPS == 0 and model_initialized:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            print("learning_rate %f" % optimizer_spec.lr_schedule.value(t))
            sys.stdout.flush()
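# A self-contained numpy check (a sketch with made-up values, not part of the
# original code) of the Bellman-target arithmetic used in learn() above:
# y = r + (1 - done) * gamma * max_a' Q_target(s', a'), with the one-hot trick
# selecting Q(s, a) for the actions actually taken.
def _example_bellman_target():
    gamma = 0.99
    rew_t = np.array([1.0, 0.0])
    done_mask = np.array([0.0, 1.0])    # second transition ends an episode
    q_tp1 = np.array([[0.5, 2.0, 1.0],  # Q_target(s', a') per action
                      [3.0, 0.1, 0.2]])
    y = rew_t + (1.0 - done_mask) * gamma * q_tp1.max(axis=1)
    # y == [1 + 0.99 * 2.0, 0.0]: a terminal target is just the reward.

    act_t = np.array([1, 0])            # actions taken
    q_t = np.array([[0.4, 1.9, 0.3],
                    [2.5, 0.0, 0.1]])
    actions_onehot = np.eye(3)[act_t]
    q_t_selected = (q_t * actions_onehot).sum(axis=1)  # Q(s, a) per sample
    total_error = np.square(y - q_t_selected).sum()
    return total_error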
def getNumEps(self):
    return len(get_wrapper_by_name(self.env, "Monitor").get_episode_rewards())
def learn(env,
          q_func,
          initialize_model: Callable[[Tuple, int], Dict],
          batch_size=32,
          exploration=LinearSchedule(1000000, 0.1),
          frame_history_len: int = 4,
          gamma: float = 0.99,
          learning_starts=50000,
          lr_schedule=LinearSchedule(1000000, 0.1),
          learning_freq=4,
          replay_buffer_size: int = 1000000,
          start_time=time.time(),
          stopping_criterion: Callable[[wrappers.Monitor, int], bool] = None,
          target_update_freq=10000,
          checkpoint_dir='./checkpoints',
          grad_norm_clipping=10):
    """Train a two-layer neural network.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Structured after github.com/alvinwan/deep-q-learning

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            img_in: tf.Tensor
                tensorflow tensor representing the input image
            num_actions: int
                number of actions
            scope: str
                scope in which all the model related variables
                should be created
            reuse: bool
                whether previously created variables should be reused.
    exploration: rl_algs.deepq.utils.schedules.Schedule
        schedule for probability of choosing random action.
    stopping_criterion: (env, t) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    lr_schedule: rl_algs.deepq.utils.schedules.Schedule
        schedule for learning rate.
    frame_history_len: int
        How many past frames to include as input to the model.
    start_time: datetime
        The time of training start
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    img_h, img_w, img_c = env.observation_space.shape
    input_shape = (img_h, img_w, frame_history_len * img_c)
    num_actions = env.action_space.n

    # construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    def update_target_func(model_curr: Dict, model_target: Dict):
        # Copy the current network's parameters into the target network,
        # cloning the arrays so the two models never share storage.
        for key, value in model_curr.items():
            model_target[key] = value.copy()

    def train_func(obs_t: np.ndarray, act_t: np.ndarray, rew_t: np.ndarray,
                   obs_tp1: np.ndarray, done_mask: np.ndarray,
                   learning_rate: float, model_curr: Dict,
                   model_target: Dict) -> Dict:
        """Train function, minimizing loss per q-learning objective.

        This assumes the q_function is a one-layer fc neural network, where
        the loss function is squared error.
        """
        curr_q = q_func(obs_t, model_curr)
        target_q = q_func(obs_tp1, model_target)
        actions = one_hot(act_t, num_actions)
        q_target_max = np.max(target_q, axis=1)
        q_target_val = rew_t + gamma * (1. - done_mask) * q_target_max
        q_candidate_val = np.sum(curr_q * actions, axis=1)
        loss = np.sum((q_target_val - q_candidate_val)**2)  # for reference
        d = obs_t.shape[1] * obs_t.shape[2] * obs_t.shape[3]
        obs_t = obs_t.reshape((-1, d))
        # dL/dW for the linear model: -2 * (y - q_sa) * x, scattered into the
        # columns of the actions actually taken.
        loss_gradient = -2 * (q_target_val - q_candidate_val)
        x_loss_gradient = obs_t.T * loss_gradient
        gradient = x_loss_gradient.dot(actions)
        clipped_gradient = clip_by_norm(gradient, grad_norm_clipping)
        # Gradient descent: step against the loss gradient.
        model_curr['W0'] -= learning_rate * clipped_gradient
        return model_curr

    ###########
    # RUN ENV #
    ###########
    model_initialized = False
    num_param_updates = 0
    mean_episode_reward = float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    learning_rate = lr_schedule.value(0)
    model_curr = {}
    model_target = {}
    run_id = str(start_time)[-5:].replace('.', '')
    os.makedirs(os.path.join(checkpoint_dir, run_id), exist_ok=True)
    for t in itertools.count():
        # 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env, t):
            break

        # 2. Step the env and store the transition
        t_obs_idx = replay_buffer.store_frame(last_obs)
        if np.random.random() < exploration.value(t) \
                or not model_initialized \
                or not replay_buffer.can_sample(batch_size):
            action = env.action_space.sample()
        else:
            r_obs = replay_buffer.encode_recent_observation()[np.newaxis, ...]
            curr_q_eval = q_func(r_obs, model_curr)
            action = np.argmax(curr_q_eval)
        last_obs, reward, done, info = env.step(action)
        replay_buffer.store_effect(t_obs_idx, action, reward, done)
        if done:
            last_obs = env.reset()

        # 3. Perform experience replay and train the network.
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            obs_t, act_t, rew_t, obs_tp1, done_mask = \
                replay_buffer.sample(batch_size)
            if not model_initialized:
                model_initialized = True
                model_curr = initialize_model(input_shape, num_actions)
                # Deep-copy the parameters so the target network does not
                # share arrays with the current network.
                model_target = {k: v.copy() for k, v in model_curr.items()}
            learning_rate = lr_schedule.value(t)
            model_curr = train_func(obs_t=obs_t,
                                    act_t=act_t,
                                    rew_t=rew_t,
                                    obs_tp1=obs_tp1,
                                    done_mask=done_mask,
                                    learning_rate=learning_rate,
                                    model_curr=model_curr,
                                    model_target=model_target)
            if t % target_update_freq == 0:
                update_target_func(model_curr, model_target)
                num_param_updates += 1

        # 4. Log progress
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)
        if t % LOG_EVERY_N_STEPS == 0 and model_initialized:
            if start_time is not None:
                print("Time %s s" % int(time.time() - start_time))
                start_time = time.time()
            print("Timestep %d" % t)
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            print("learning_rate %f" % learning_rate)
            sys.stdout.flush()
            scipy.io.savemat(
                os.path.join(checkpoint_dir, run_id, 'step-%d.mat' % t),
                model_curr)
    scipy.io.savemat(os.path.join(checkpoint_dir, run_id, 'step-final.mat'),
                     model_curr)
    return model_curr
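# A finite-difference sanity check (a sketch, not part of the original code)
# for the analytic gradient used in train_func above, assuming the linear
# model q(s) = s_flat.dot(W) that train_func implies. All names here are local
# to the example.
def _check_linear_q_gradient():
    rng = np.random.RandomState(0)
    batch, d, num_actions = 4, 6, 3
    X = rng.randn(batch, d)           # flattened observations
    W = rng.randn(d, num_actions)     # model weights ('W0' in train_func)
    act = rng.randint(num_actions, size=batch)
    y = rng.randn(batch)              # Bellman targets
    A = np.eye(num_actions)[act]      # one-hot actions

    def loss(W):
        q_sa = (X.dot(W) * A).sum(axis=1)
        return np.sum((y - q_sa)**2)

    # Analytic gradient, matching train_func: -2 * (y - q_sa), scattered into
    # the columns of the taken actions.
    q_sa = (X.dot(W) * A).sum(axis=1)
    analytic = (X.T * (-2.0 * (y - q_sa))).dot(A)

    # Central finite difference on a single entry of W.
    eps, i, j = 1e-5, 2, 1
    Wp, Wm = W.copy(), W.copy()
    Wp[i, j] += eps
    Wm[i, j] -= eps
    numeric = (loss(Wp) - loss(Wm)) / (2 * eps)
    assert abs(analytic[i, j] - numeric) < 1e-4
    # Because this is the gradient of the loss, descent must subtract it,
    # as train_func does.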
def main():
    arguments = docopt.docopt(__doc__)

    # Run training
    seed = int(str(time.time())[-5:])
    env = get_custom_env(arguments['--envid'], seed)
    n_episodes = int(arguments['--n_episodes'])
    save_path = arguments['--save_path']
    logdir = arguments['--logdir']
    os.makedirs(logdir, exist_ok=True)

    num_actions = env.action_space.n
    img_h, img_w, img_c = env.observation_space.shape
    frame_history_len = 4
    replay_buffer_size = 1000000
    input_shape = (img_h, img_w, frame_history_len * img_c)
    num_timesteps = 40000000

    def stopping_criterion(env, t):
        # Note that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env.
        return get_wrapper_by_name(env,
                                   "Monitor").get_total_steps() >= num_timesteps

    with get_session() as session:
        # set up placeholders
        # placeholder for current observation (or state)
        obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
        # casting to float on GPU ensures lower data transfer times.
        obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0
        curr_q = atari_model(obs_t_float, num_actions, scope='q_func')
        obs_sars = []

        saver = tf.train.Saver()
        saver.restore(session, save_path)
        print(' * Restored from', save_path)

        # construct the replay buffer
        replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)
        last_obs = env.reset()

        for i in range(n_episodes):
            episode_reward = 0
            j = 0
            while True:
                t_obs_idx = replay_buffer.store_frame(last_obs)
                r_obs = replay_buffer.encode_recent_observation()[np.newaxis, ...]
                curr_q_eval = session.run(curr_q, {obs_t_ph: r_obs})
                action = np.argmax(curr_q_eval)
                last_obs, reward, done, info = env.step(action)
                episode_reward += reward
                replay_buffer.store_effect(t_obs_idx, action, reward, done)
                obs_sars.append(
                    np.hstack((last_obs.reshape((1, -1)),
                               action.reshape((1, 1)),
                               np.array([reward]).reshape((1, 1)))))
                if done:
                    j += 1
                    last_obs = env.reset()
                    episode_rewards = get_wrapper_by_name(
                        env, 'Monitor').get_episode_rewards()
                    if episode_rewards:
                        episode_reward = episode_rewards[-1]
                    if episode_reward < 0:
                        print(' * Reward too low (%d)... resetting.' %
                              episode_reward)
                        obs_sars = []
                    else:
                        break
            print(' * Episode %d with reward %d' % (i, episode_reward))
            write_sar_log(obs_sars, logdir, episode_reward)
            obs_sars = []
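# A minimal sketch (illustrative shapes, not the real Atari dimensions) of how
# one row appended to obs_sars in main() can be unpacked again: the hstack
# layout is [flattened observation | action | reward].
def _example_sar_row_roundtrip():
    img_h, img_w, img_c = 4, 4, 1
    obs = np.zeros((img_h, img_w, img_c), dtype=np.uint8)
    action, reward = np.int64(2), 1.0
    row = np.hstack((obs.reshape((1, -1)),
                     action.reshape((1, 1)),
                     np.array([reward]).reshape((1, 1))))
    n = img_h * img_w * img_c
    obs_flat, act_col, rew_col = np.split(row, [n, n + 1], axis=1)
    assert obs_flat.reshape((img_h, img_w, img_c)).shape == obs.shape
    assert act_col.item() == action and rew_col.item() == reward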