def evaluate(self, env=None, num_episodes=None):
    """
    Evaluation with the same procedure as training.
    """
    # log our activity only if default call
    if num_episodes is None:
        self.logger.info("Evaluating...")

    # arguments defaults
    if num_episodes is None:
        num_episodes = self.config.num_episodes_test
    if env is None:
        env = self.env

    # replay memory to play
    replay_buffer = ReplayBuffer(self.config.buffer_size,
                                 self.config.state_history)
    rewards = []

    for i in range(num_episodes):
        total_reward = 0
        state = env.reset()
        state = state.reshape([1, -1, 1])
        while True:
            if self.config.render_test:
                env.render()

            # store last state in buffer
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            action = self.get_action(q_input)

            # perform action in env
            new_state, reward, done, info = env.step(action)

            # store in replay memory
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state
            state = state.reshape([1, -1, 1])

            # count reward
            total_reward += reward
            if done:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

    avg_reward = np.mean(rewards)
    sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

    if num_episodes >= 1:
        msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
            avg_reward, sigma_reward)
        self.logger.info(msg)

    return avg_reward
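# Side note (not part of the snippet above): the "+/-" value logged by
# evaluate() is the standard error of the mean episode reward,
# sqrt(var / n), which equals std / sqrt(n). A quick illustrative check:
import numpy as np

rewards = [1.0, 3.0, 2.0, 4.0]          # illustrative values only
n = len(rewards)
sigma_a = np.sqrt(np.var(rewards) / n)  # form used in evaluate()
sigma_b = np.std(rewards) / np.sqrt(n)  # equivalent standard-error form
assert np.isclose(sigma_a, sigma_b)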
def run_episode(
        env,
        q_func,
        replay_buffer_size=1000000,
        frame_history_len=4,
        game=None,
):
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    if len(env.observation_space.shape) == 1:
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    Q = q_func(input_arg, num_actions).type(dtype)
    Q.load_state_dict(torch.load("./models/PAL_{}.pth".format(game),
                                 map_location=lambda storage, loc: storage))

    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    all_obs = []
    last_obs = env.reset()

    for t in count():
        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()
        all_obs.append(recent_observations)

        torch_obs = torch.from_numpy(recent_observations).type(dtype).unsqueeze(0) / 255.0
        with torch.no_grad():
            Qvals = Q(torch_obs).data[0]
        max2val, max2idx = Qvals.topk(2)
        action = max2idx[0]

        obs, reward, done, _ = env.step(action)
        env.render()
        replay_buffer.store_effect(last_idx, action, reward, done)

        if done:
            break
        last_obs = obs

    return all_obs
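# run_episode() above assumes a few module-level names (gym, torch, count,
# dtype, ReplayBuffer) defined elsewhere in its repository. A minimal,
# hypothetical reconstruction of that setup (the original file may differ):
import gym
import torch
from itertools import count

# ReplayBuffer is assumed to come from the repo's replay-buffer utilities, e.g.:
# from dqn_utils import ReplayBuffer

# Use CUDA tensors when a GPU is available, plain float tensors otherwise.
USE_CUDA = torch.cuda.is_available()
dtype = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor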
def evaluate(self, env, num_episodes):
    replay_buffer = ReplayBuffer(self.FLAGS.state_hist, self.FLAGS.state_hist)
    rewards = []

    if num_episodes > 1:
        self.logger.info("Evaluating...")

    for i in range(num_episodes):
        total_reward = 0
        state = env.reset()
        while True:
            # Store last state in buffer
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            # Get greedy action
            action = self.network.get_best_action(q_input)[0]

            # Perform action in env
            new_state, reward, done, info = env.step(action)

            # Store in replay memory
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            # count reward
            total_reward += reward
            if done:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

    avg_reward = np.mean(rewards)
    sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

    if num_episodes > 1:
        msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
            avg_reward, sigma_reward)
        self.logger.info(msg)

    return avg_reward
def playPoint(expert, state):
    experts_replay_buffer = ReplayBuffer(
        config.buffer_size, config.state_history)
    counter = 0
    initial_action = -1
    while True:
        idx = experts_replay_buffer.store_frame(state)
        q_input = experts_replay_buffer.encode_recent_observation()
        action, _ = expert.get_best_action(q_input)
        if counter == 0:
            initial_action = action

        # perform action in env
        new_state, reward, done, info = env.step(action)

        # store in replay memory
        state = new_state
        experts_replay_buffer.store_effect(idx, action, reward, done)

        # count reward
        if abs(reward) == 1:
            break
        counter += 1

    print("PLAY POINT ENDED")
    return (config.gamma**counter) * reward, initial_action
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history with torch.no_grad(): return model(Variable(obs)).data.max(1)[1].cpu() else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function, i.e. build the model. ###### # YOUR CODE HERE Q = q_func(input_arg, num_actions) Q_target = q_func(input_arg, num_actions) ###### # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 for t in count(): ### 1. Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break ### 2. Step the env and store the transition # At this point, "last_obs" contains the latest observation that was # recorded from the simulator. Here, your code needs to store this # observation and its outcome (reward, next observation, etc.) into # the replay buffer while stepping the simulator forward one step. 
# At the end of this block of code, the simulator should have been # advanced one step, and the replay buffer should contain one more # transition. # Specifically, last_obs must point to the new latest observation. # Useful functions you'll need to call: # obs, reward, done, info = env.step(action) # this steps the environment forward one step # obs = env.reset() # this resets the environment if you reached an episode boundary. # Don't forget to call env.reset() to get a new observation if done # is true!! # Note that you cannot use "last_obs" directly as input # into your network, since it needs to be processed to include context # from previous frames. You should check out the replay buffer # implementation in dqn_utils.py to see what functionality the replay # buffer exposes. The replay buffer has a function called # encode_recent_observation that will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # Don't forget to include epsilon greedy exploration! # And remember that the first time you enter this loop, the model # may not yet have been initialized (but of course, the first step # might as well be random, since you haven't trained your net...) ##### idx = replay_buffer.store_frame(last_obs) encoded_obs = replay_buffer.encode_recent_observation() if (t > learning_starts): action = select_epilson_greedy_action(Q, encoded_obs, t) else: action = random.randrange(num_actions) obs, reward, done, _ = env.step(action) replay_buffer.store_effect(idx, action, reward, done) if (done): last_obs = env.reset() else: last_obs = obs ##### # at this point, the environment should have been advanced one step (and # reset if done was true), and last_obs should point to the new latest # observation ### 3. Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Here, you should perform training. Training consists of four steps: # 3.a: use the replay buffer to sample a batch of transitions (see the # replay buffer code for function definition, each batch that you sample # should consist of current observations, current actions, rewards, # next observations, and done indicator). # Note: Move the variables to the GPU if avialable # 3.b: fill in your own code to compute the Bellman error. This requires # evaluating the current and next Q-values and constructing the corresponding error. # Note: don't forget to clip the error between [-1,1], multiply is by -1 (since pytorch minimizes) and # maskout post terminal status Q-values (see ReplayBuffer code). # 3.c: train the model. To do this, use the bellman error you calculated perviously. # Pytorch will differentiate this error for you, to backward the error use the following API: # current.backward(d_error.data.unsqueeze(1)) # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error. # Your code should produce one scalar-valued tensor. # Note: don't forget to call optimizer.zero_grad() before the backward call and # optimizer.step() after the backward call. # 3.d: periodically update the target network by loading the current Q network weights into the # target_Q network. see state_dict() and load_state_dict() methods. 
# you should update every target_update_freq steps, and you may find the # variable num_param_updates useful for this (it was initialized to 0) ##### # YOUR CODE HERE # # Alpha (learning rate) from the q function update isn't present in our code -- its in OptimizerSpec in main. # Move to GPU if possible # done flag in loop ---- SKIPPED IF DONE IS TRUE # clipping the error between -1 and 1 -- OK # backward the error meaning? # Suggestion for changing parameters - change exploration scehdule (main) # # Q.cuda() obs_batch, act_batch, reward_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size=batch_size) states = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0) actions = Variable(torch.from_numpy(act_batch).long()) rewards = Variable(torch.from_numpy(reward_batch).float()) next_states = Variable( torch.from_numpy(next_obs_batch).type(dtype) / 255.0) not_dones = Variable(torch.from_numpy(1 - done_mask).type(dtype)) if USE_CUDA: states = states.cuda() actions = actions.cuda() rewards = rewards.cuda() next_states = next_states.cuda() Q.train() Q_target.eval() predicted_rewards = Q(states).gather(1, actions.unsqueeze(1)) #Q(s,a) next_max_Q = Q_target(next_states).detach().max(1)[ 0] #.unsqueeze(1) #Q_target(s,a) next_Q_values = not_dones * next_max_Q target_Q_values = rewards + (gamma * next_Q_values) #r + Q_target bellman_error = target_Q_values - predicted_rewards.squeeze(1) clipped_bellman_error = bellman_error.clamp(-1, 1) * (-1.0) optimizer.zero_grad() predicted_rewards.backward(clipped_bellman_error.data.unsqueeze(1)) optimizer.step() num_param_updates += +1 if (num_param_updates % target_update_freq == 0): Q_target.load_state_dict(Q.state_dict()) # for obs,act,reward,next_obs,done in zip(obs_batch,act_batch,reward_batch,next_obs_batch,done_mask): # if(done == 1.0): # continue # obs = Variable(torch.from_numpy(obs, ).type(dtype).unsqueeze(0) / 255.0, requires_grad=True) # next_obs = Variable(torch.from_numpy(next_obs).type(dtype).unsqueeze(0) / 255.0, requires_grad=False) # current_Q = Q(obs) # predicted_reward = Variable(current_Q[0][act].unsqueeze(0), requires_grad=True) # target_reward = Q_target(next_obs).data.max(1)[0] # loss = loss_fn(reward + gamma * target_reward, predicted_reward).clamp(-1, 1) * (-1.0) # optimizer.zero_grad() # # should be current.backward(d_error.data.unsqueeze(1)) # # but it crashes on misfitting dims # predicted_reward.backward(loss.data.unsqueeze(1)) # optimizer.step() ##### ### 4. Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open('statistics.pkl', 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % 'statistics.pkl')
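# Hypothetical driver for the dqn_learing() above, showing one way the
# optimizer_spec and exploration arguments might be constructed. Only the
# .constructor/.kwargs fields and the .value(t) method are used by the
# function; everything else here (names, hyperparameters) is illustrative.
from collections import namedtuple

import torch.optim as optim

# Two fields are enough here: the training loop only reads .constructor and .kwargs.
OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])


class ConstantSchedule(object):
    """Stand-in exploration schedule; dqn_learing only calls .value(t)."""

    def __init__(self, epsilon):
        self.epsilon = epsilon

    def value(self, t):
        return self.epsilon


def run_training(env, q_func):
    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=0.00025, alpha=0.95, eps=0.01),
    )
    exploration = ConstantSchedule(0.1)  # a real run would anneal epsilon over time
    dqn_learing(
        env,
        q_func,
        optimizer_spec=optimizer_spec,
        exploration=exploration,
        replay_buffer_size=100000,
        learning_starts=5000,
    )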
def train(self, exp_schedule, lr_schedule): """ Performs training of Q Args: exp_schedule: Exploration instance s.t. exp_schedule.get_action(best_action) returns an action lr_schedule: Schedule for learning rate """ # initialize replay buffer and variables replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history) rewards = deque(maxlen=self.config.num_episodes_test) max_q_values = deque(maxlen=1000) q_values = deque(maxlen=1000) self.init_averages() t = last_eval = 0 # time control of nb of steps scores_eval = [] # list of scores computed at iteration time scores_eval += [self.evaluate()] prog = Progbar(target=self.config.nsteps_train) # interact with environment while t < self.config.nsteps_train: total_reward = 0 #state = self.env.reset() state = self.env.env_reset() while True: # print t t += 1 last_eval += 1 # print total_reward #if self.config.render_train: self.env.render() # replay memory stuff idx = replay_buffer.store_frame(state) q_input = replay_buffer.encode_recent_observation() # chose action according to current Q and exploration best_action, q_values = self.get_best_action(q_input) action = exp_schedule.get_action(best_action) # store q values max_q_values.append(max(q_values)) q_values += list(q_values) # perform action in env #print t # TODO: log displays to tensorboard new_state, reward, done, info = self.env.env_step( action, state) #, display=(t % DISPLAY_FREQ == 0)) # store the transition replay_buffer.store_effect(idx, action, reward, done) state = new_state # perform a training step loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon) # print t > self.config.learning_start # print t % self.config.log_freq == 0 # print t % self.config.learning_freq == 0 #print rewards # logging stuff if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and (t % self.config.learning_freq == 0)): exp_schedule.update(t) lr_schedule.update(t) if len(rewards) > 0: self.update_averages(rewards, max_q_values, q_values, scores_eval) prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg R", self.avg_reward), ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon), ("Grads", grad_eval), ("Max Q", self.max_q), ("lr", lr_schedule.epsilon)]) elif (t < self.config.learning_start) and ( t % self.config.log_freq == 0): sys.stdout.write("\rPopulating the memory {}/{}...".format( t, self.config.learning_start)) sys.stdout.flush() # count reward total_reward += reward if done or t >= self.config.nsteps_train: break # updates to perform at the end of an episode rewards.append(total_reward) if (t > self.config.learning_start) and (last_eval > self.config.eval_freq): # evaluate our policy last_eval = 0 print("") scores_eval += [self.evaluate()] # last words self.logger.info("- Training done.") self.save() scores_eval += [self.evaluate()] export_plot(scores_eval, "Scores", self.config.plot_output)
class DQNAgent(BaseAgent): non_terminal_reward = 0 def __init__(self, env, config, exp_schedule, lr_schedule, is_training_agent, train_from_scratch=False, reward_after_somebody_died=False, logger=None): """ Initialize Q Network and env :param env: Game environment :param config: config(hyper-parameters) instance :param logger: logger instance from logging module :param exp_schedule: exploration strategy for epsilon :param lr_schedule: schedule for learning rate """ super(DQNAgent, self).__init__() # Variables initialized in _build self._states = None self._actions = None self._rewards = None self._next_states = None self._done_mask = None self._learning_rate = None self._q_values = None self._target_q_values = None self._next_q_values = None self._update_target_op = None self._loss = None self._train_op = None self._grad_norm = None # Variables initialized in init_agent self._session = None self._avg_reward_placeholder = None self._max_reward_placeholder = None self._std_reward_placeholder = None self._avg_q_placeholder = None self._max_q_placeholder = None self._std_q_placeholder = None # TODO: Commented due to lack of evaluate() # self._eval_reward_placeholder = None self._merged = None self._file_writer = None self._saver = None self._train_replay_buffer = None self._train_rewards = None self._train_max_q_values = None self._train_q_values = None self._avg_reward = None self._max_reward = None self._std_reward = None self._avg_q = None self._max_q = None self._std_q = None # TODO: Commented due to lack of evaluate() # self._eval_reward = None self._time_step = None self._progress_bar = None self._has_episode_started = None # Variables initialized in act. self._last_action = None self._last_idx = None self._enemy_count = None # Directory for training outputs if not os.path.exists(config.output_path): os.makedirs(config.output_path) self._logger = logger if logger is None: self._logger = get_logger(config.log_path) self._config = config self._env = env self._exp_schedule = exp_schedule self._lr_schedule = lr_schedule self._is_training_agent = is_training_agent self._train_from_scratch = train_from_scratch self._reward_after_somebody_died = reward_after_somebody_died self._total_reward = 0 # Build model. self._build() def init_agent(self, id_, game_type): super(DQNAgent, self).init_agent(id_, game_type) # Assume the graph has been constructed. # Create a tf Session and run initializer of variables. tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True self._session = tf.Session(config=tf_config) # Tensorboard self._add_summary() # Initialize all variables. init = tf.global_variables_initializer() self._session.run(init) # Synchronise q and target_q networks. self._session.run(self._update_target_op) # for saving networks weights self._saver = tf.train.Saver() # Initialize replay buffer and variables. self._train_replay_buffer = ReplayBuffer(self._config.buffer_size, self._config.state_history) self._train_rewards = deque(maxlen=self._config.num_episodes_test) self._train_max_q_values = deque(maxlen=1000) self._train_q_values = deque(maxlen=1000) self._init_averages() self._time_step = 0 self._progress_bar = Progbar(target=self._config.nsteps_train) self._has_episode_started = False if not self._train_from_scratch: self._load() def act(self, obs, action_space): state = obs['board'][:, :, None] if not self._is_training_agent: # Act greedily when testing. 
if self._has_episode_started: self._train_replay_buffer.store_effect( self._last_idx, self._last_action, 0, done=False ) self._last_idx = self._train_replay_buffer.store_frame(state) q_input = self._train_replay_buffer.encode_recent_observation() action = self._get_action(q_input) self._last_action = action return action if self._has_episode_started: reward = DQNAgent.non_terminal_reward if self._reward_after_somebody_died: if len(self._character.enemies) < self._enemy_count: reward = 1 self._train(reward, done=False) self._enemy_count = len(self._character.enemies) self._time_step += 1 # Replay buffer idx = self._train_replay_buffer.store_frame(state) q_input = self._train_replay_buffer.encode_recent_observation() # Choose action according to current Q and exploration best_action, self._train_q_values = self._get_best_action(q_input) action = self._exp_schedule.get_action(best_action) self._train_max_q_values.append(max(self._train_q_values)) self._train_q_values += list(self._train_q_values) self._last_action = action self._last_idx = idx if not self._has_episode_started: self._has_episode_started = True return action def episode_end(self, reward): """ Updates to perform at the end of an episode """ # Reset episode. self._has_episode_started = False if not self._is_training_agent: return self._train(reward, done=True) self._train_rewards.append(self._total_reward) # Reset total reward. self._total_reward = 0 # TODO: Commented due to lack of evaluate() and record() # if (t > self.config.learning_start) and (last_eval > self.config.eval_freq): # # evaluate our policy # last_eval = 0 # print("") # scores_eval += [self.evaluate()] # # if (t > self.config.learning_start) and self.config.record and (last_record > self.config.record_freq): # self.logger.info("Recording...") # last_record = 0 # self.record() def shutdown(self): """ Save trained results """ if not self._is_training_agent: return self._logger.info("- Training done.") self._save() # TODO: Commented due to lack of evaluate() # scores_eval += [self.evaluate()] # DQNAgent.export_plot(scores_eval, "Scores", self.config.plot_output) def _train(self, reward, done): # Store the transition. self._train_replay_buffer.store_effect( self._last_idx, self._last_action, reward, done=done ) # Perform a training step. loss_eval, grad_eval = self._train_step( self._time_step, self._train_replay_buffer, self._lr_schedule.epsilon ) # Logging if self._time_step > self._config.learning_start \ and self._time_step % self._config.log_freq == 0 \ and self._time_step % self._config.learning_freq == 0: self._update_averages(self._train_rewards, self._train_max_q_values, self._train_q_values) self._exp_schedule.update(self._time_step) self._lr_schedule.update(self._time_step) if len(self._train_rewards) > 0: self._progress_bar.update( self._time_step + 1, exact=[ ("Loss", loss_eval), ("Avg R", self._avg_reward), ("Max R", np.max(self._train_rewards)), ("eps", self._exp_schedule.epsilon), ("Grads", grad_eval), ("Max Q", self._max_q), ("lr", self._lr_schedule.epsilon) ] ) elif self._time_step < self._config.learning_start and self._time_step % self._config.log_freq == 0: sys.stdout.write("\rPopulating the memory {}/{}...".format(self._time_step, self._config.learning_start)) sys.stdout.flush() # Accumulate reward self._total_reward += reward def _build(self): """ Build model by adding all necessary variables. """ # Add placeholders. self._add_placeholders_op() # Compute Q values of state. 
states = self._process_state(self._states) self._q_values = self._get_q_values_op(states, scope='q', reuse=False) # Compute Q values of next state. next_states = self._process_state(self._next_states) self._target_q_values = self._get_q_values_op(next_states, scope='target_q', reuse=False) # for Double DQN self._next_q_values = self._get_q_values_op(next_states, scope='q', reuse=True) # Add update operator for target network. self._add_update_target_op('q', 'target_q') # Add square loss. self._add_loss_op(self._q_values, self._target_q_values, self._next_q_values) # Add optimizer for the main networks. self._add_optimizer_op('q') def _add_placeholders_op(self): """ Adds placeholders to the graph These placeholders are used as inputs by the rest of the model building and will be fed data during training. Note that when "None" is in a placeholder's shape, it's flexible (so we can use different batch sizes without rebuilding the model """ state_shape = list(self._env.observation_space.shape) self._states = tf.placeholder(tf.uint8, (None, 11, 11, self._config.state_history)) self._actions = tf.placeholder(tf.int32, (None,)) self._rewards = tf.placeholder(tf.float32, (None,)) self._next_states = tf.placeholder(tf.uint8, (None, 11, 11, self._config.state_history)) self._done_mask = tf.placeholder(tf.bool, (None,)) self._learning_rate = tf.placeholder(tf.float32, ()) def _process_state(self, state): """ Processing of state State placeholders are tf.uint8 for fast transfer to GPU Need to cast it to float32 for the rest of the tf graph. :param state: Node of tf graph of shape = (batch_size, height, width, nchannels) of type tf.uint8.if, values are between 0 and 255 -> 0 and 1 """ state = tf.cast(state, tf.float32) state /= self._config.high return state def _get_q_values_op(self, state, scope, reuse=False): """ Returns Q values for all actions :param state: (tf tensor) shape = (batch_size, img height, img width, nchannels) :param scope: (string) scope name, that specifies if target network or not :param reuse: (bool) reuse of variables in the scope :return out: (tf tensor) of shape = (batch_size, num_actions) """ num_actions = self._env.action_space.n out = state with tf.variable_scope(scope, reuse=reuse) as _: x = layers.conv2d(state, 32, 5, stride=2, padding='SAME') x = layers.conv2d(x, 64, 4, stride=2, padding='SAME') x = layers.conv2d(x, 64, 3, stride=1, padding='SAME') x = layers.flatten(x) x = layers.fully_connected(x, 512) out = layers.fully_connected(x, num_actions, activation_fn=None) return out def _add_update_target_op(self, q_scope, target_q_scope): """ update_target_op will be called periodically to copy Q network weights to target Q network Remember that in DQN, we maintain two identical Q networks with 2 different set of weights. In tensorflow, we distinguish them with two different scopes. One for the target network, one for the regular network. If you're not familiar with the scope mechanism in tensorflow, read the docs https://www.tensorflow.org/programmers_guide/variable_scope Periodically, we need to update all the weights of the Q network and assign them with the values from the regular network. Thus, what we need to do is to build a tf op, that, when called, will assign all variables in the target network scope with the values of the corresponding variables of the regular network scope. 
:param q_scope: (string) name of the scope of variables for q :param target_q_scope: (string) name of the scope of variables for the target network """ tar_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_q_scope) q_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=q_scope) self._update_target_op = tf.group(*[tf.assign(tar_vars[i], q_vars[i]) for i in range(len(tar_vars))]) def _add_loss_op(self, q, target_q, next_q): """ Sets the loss of a batch, self.loss is a scalar :param q: (tf tensor) shape = (batch_size, num_actions)(Q(s, a)) :param target_q: (tf tensor) shape = (batch_size, num_actions)(Q_target(s', a')) :param next_q: Q(s', a') for Double DQN """ num_actions = self._env.action_space.n not_done = 1 - tf.cast(self._done_mask, tf.float32) # Double DQN # need q_next(Q(s', a')), then find argmax in it max_a = tf.argmax(next_q, axis=1) q_max = tf.reduce_sum(target_q * tf.one_hot(max_a, num_actions), axis=1) q_samp = self._rewards + not_done * self._config.gamma * q_max # nature DQN q_s = tf.reduce_sum(q * tf.one_hot(self._actions, num_actions), axis=1) self._loss = tf.reduce_mean(tf.square(q_samp - q_s)) def _add_optimizer_op(self, scope): """ Set self.train_op and self.grad_norm """ optimizer = tf.train.AdamOptimizer(self._learning_rate) vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope) grads_and_vars = optimizer.compute_gradients(self._loss, vars) clip_grads_and_vars = None if self._config.grad_clip: clip_grads_and_vars = [(tf.clip_by_norm(gv[0], self._config.clip_val), gv[1]) for gv in grads_and_vars] self._train_op = optimizer.apply_gradients(clip_grads_and_vars) self._grad_norm = tf.global_norm(clip_grads_and_vars) def _add_summary(self): """ Tensorflow stuff """ # extra placeholders to log stuff from python self._avg_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="avg_reward") self._max_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="max_reward") self._std_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="std_reward") self._avg_q_placeholder = tf.placeholder(tf.float32, shape=(), name="avg_q") self._max_q_placeholder = tf.placeholder(tf.float32, shape=(), name="max_q") self._std_q_placeholder = tf.placeholder(tf.float32, shape=(), name="std_q") # TODO: Commented due to lack of evaluate() # self._eval_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="eval_reward") # add placeholders from the graph tf.summary.scalar("loss", self._loss) tf.summary.scalar("grads norm", self._grad_norm) # extra summaries from python -> placeholders tf.summary.scalar("Avg Reward", self._avg_reward_placeholder) tf.summary.scalar("Max Reward", self._max_reward_placeholder) tf.summary.scalar("Std Reward", self._std_reward_placeholder) tf.summary.scalar("Avg Q", self._avg_q_placeholder) tf.summary.scalar("Max Q", self._max_q_placeholder) tf.summary.scalar("Std Q", self._std_q_placeholder) # TODO: Commented due to lack of evaluate() # tf.summary.scalar("Eval Reward", self._eval_reward_placeholder) # logging self._merged = tf.summary.merge_all() self._file_writer = tf.summary.FileWriter(self._config.output_path, self._session.graph) def _init_averages(self): """ Define extra attributes for tensorboard. """ self._avg_reward = -21. self._max_reward = -21. self._std_reward = 0 self._avg_q = 0 self._max_q = 0 self._std_q = 0 # TODO: Commented due to lack of evaluate() # self._eval_reward = -21. 
def _get_action(self, obs): """ Returns action with some epsilon strategy :param obs: observation from gym """ if np.random.random() < self._config.soft_epsilon: return self._env.action_space.sample() else: return self._get_best_action(obs)[0] def _get_best_action(self, obs): """ Return best action :param obs: 4 consecutive observations from gym :return action: (int) :return action_values: (np array) q values for all actions """ action_values = self._session.run(self._q_values, feed_dict={self._states: [obs]})[0] return np.argmax(action_values), action_values def _train_step(self, t, replay_buffer, lr): """ Perform training step :param t: (int) nth step :param replay_buffer: buffer for sampling :param lr: (float) learning rate """ loss_eval, grad_eval = 0, 0 # Perform training step if t > self._config.learning_start and t % self._config.learning_freq == 0: loss_eval, grad_eval = self._update_step(t, replay_buffer, lr) # Occasionally update target network with q network if t % self._config.target_update_freq == 0: self._update_target_params() # Occasionally save the weights if t % self._config.saving_freq == 0: self._save() return loss_eval, grad_eval def _update_step(self, t, replay_buffer, lr): """ Performs an update of parameters by sampling from replay_buffer :param t: number of iteration (episode and move) :param replay_buffer: ReplayBuffer instance .sample() gives batches :param lr: (float) learning rate :return loss: (Q - Q_target) ^ 2 """ s_batch, a_batch, r_batch, sp_batch, done_mask_batch = replay_buffer.sample(self._config.batch_size) fd = { # Inputs self._states: s_batch, self._actions: a_batch, self._rewards: r_batch, self._next_states: sp_batch, self._done_mask: done_mask_batch, self._learning_rate: lr, # Extra info self._avg_reward_placeholder: self._avg_reward, self._max_reward_placeholder: self._max_reward, self._std_reward_placeholder: self._std_reward, self._avg_q_placeholder: self._avg_q, self._max_q_placeholder: self._max_q, self._std_q_placeholder: self._std_q, # TODO: Commented due to lack of evaluate() # self._eval_reward_placeholder: self.eval_reward, } loss_eval, grad_norm_eval, summary, _ = self._session.run( [self._loss, self._grad_norm, self._merged, self._train_op], feed_dict=fd ) # Tensorboard self._file_writer.add_summary(summary, t) return loss_eval, grad_norm_eval def _update_target_params(self): """ Update parameters of Q with parameters of Q """ self._session.run(self._update_target_op) def _load(self): """ Loads session """ ckpt = tf.train.get_checkpoint_state(self._config.model_output) self._saver.restore(self._session, ckpt.model_checkpoint_path) def _save(self): """ Saves session """ if not os.path.exists(self._config.model_output): os.makedirs(self._config.model_output) model_path = os.path.join(self._config.model_output, 'model.ckpt') self._saver.save(self._session, model_path) def _update_averages(self, rewards, max_q_values, q_values, scores_eval=None): """ Update the averages :param rewards: deque :param max_q_values: deque :param q_values: deque :param scores_eval: list """ self._avg_reward = np.mean(rewards) self._max_reward = np.max(rewards) self._std_reward = np.sqrt(np.var(rewards) / len(rewards)) self._max_q = np.mean(max_q_values) self._avg_q = np.mean(q_values) self._std_q = np.sqrt(np.var(q_values) / len(q_values)) # TODO: Commented due to lack of evaluate() # if len(scores_eval) > 0: # self.eval_reward = scores_eval[-1] @staticmethod def export_plot(y, y_label, filename): """ Export a plot in filename :param y: (list) of float / int to 
        plot
        :param y_label: (string) label for the y-axis
        :param filename: (string) directory
        """
        plt.figure()
        plt.plot(range(len(y)), y)
        plt.xlabel("Epoch")
        plt.ylabel(y_label)
        plt.savefig(filename)
        plt.close()
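# For reference, _add_loss_op in the agent above builds a Double DQN target:
# the online network (next_q) selects the argmax action at the next state and
# the target network (target_q) evaluates it. A minimal NumPy sketch of that
# target computation; names and shapes are illustrative, not the agent's API.
import numpy as np


def double_dqn_target(rewards, done_mask, next_q, target_q, gamma):
    """rewards, done_mask: (batch,); next_q, target_q: (batch, num_actions)."""
    not_done = 1.0 - done_mask.astype(np.float32)
    best_actions = np.argmax(next_q, axis=1)                      # argmax under online net
    q_max = target_q[np.arange(len(best_actions)), best_actions]  # evaluated by target net
    return rewards + not_done * gamma * q_max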
def dqn_learing( env, q_func, checkpoint_path, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000 ): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. 
dont save the history return model(Variable(obs, volatile=True)).data.max(1)[1].view(1,1) else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function Q = q_func(input_arg, num_actions).type(dtype) target_Q = q_func(input_arg, num_actions).type(dtype) # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) # optionally resume from a checkpoint if checkpoint_path: if os.path.isfile(checkpoint_path): print("=> loading checkpoint '{}'".format(checkpoint_path)) checkpoint = torch.load(checkpoint_path) Q.load_state_dict(checkpoint['model_state_dict']) target_Q.load_state_dict(checkpoint['target_state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}')".format(checkpoint_path)) else: print("=> no checkpoint found at '{}'".format(checkpoint_path)) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 SAVE_EVERY_N_STEPS = 1000 episode_reward = 0 episode_rewards = [] for t in count(): ### Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break ### Step the env and store the transition # Store lastest observation in replay memory and last_idx can be used to store action, reward, done last_idx = replay_buffer.store_frame(last_obs) # encode_recent_observation will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. recent_observations = replay_buffer.encode_recent_observation() # Choose random action if not yet start learning if t > learning_starts: action = select_epilson_greedy_action(Q, recent_observations, t)[0][0] else: action = random.randrange(num_actions) # Advance one step obs, reward, done, _ = env.step(action) print("reward: %f" % reward) # clip rewards between -1 and 1 reward = max(-1.0, min(reward, 1.0)) # Store other info in replay memory replay_buffer.store_effect(last_idx, action, reward, done) # Resets the environment when reaching an episode boundary. if done: episode_reward = 0 obs = env.reset() last_obs = obs ### Perform experience replay and train the network. 
# Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Use the replay buffer to sample a batch of transitions # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode, # in which case there is no Q-value at the next state; at the end of an # episode, only the current state reward contributes to the target obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size) # Convert numpy nd_array to torch variables for calculation obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0) act_batch = Variable(torch.from_numpy(act_batch).long()) rew_batch = Variable(torch.from_numpy(rew_batch)) next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: act_batch = act_batch.cuda() rew_batch = rew_batch.cuda() # Compute current Q value, q_func takes only state and output value for every state-action pair # We choose Q based on action taken. current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze() # squeeze the [batch_size x 1] Tensor to have a shape of batch_size # Compute next Q value based on which action gives max Q values # Detach variable from the current graph since we don't want gradients for next Q to propagated next_max_q = target_Q(next_obs_batch).detach().max(1)[0] next_Q_values = not_done_mask * next_max_q # Compute the target of the current Q values target_Q_values = rew_batch + (gamma * next_Q_values) # # Compute Bellman error # bellman_error = target_Q_values - current_Q_values # # clip the bellman error between [-1 , 1] # clipped_bellman_error = bellman_error.clamp(-1, 1) # # Note: clipped_bellman_delta * -1 will be right gradient # d_error = clipped_bellman_error * -1.0 # Compute Huber loss. Why not MSE? Because, Huber Loss is robust to noisy Q estimates compared to plain MSE. loss = F.smooth_l1_loss(current_Q_values, target_Q_values) # Clear previous gradients before backward pass optimizer.zero_grad() # run backward pass # current_Q_values.backward(d_error.data.unsqueeze(1)) loss.backward() # Clip the gradients to lie between -1 and +1 for params in Q.parameters(): params.grad.data.clamp_(-1, 1) # Perfom the update optimizer.step() num_param_updates += 1 # Periodically update the target network by Q network to target Q network if num_param_updates % target_update_freq == 0: target_Q.load_state_dict(Q.state_dict()) ### 4. 
Log progress and keep track of statistics episode_reward += reward # episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() episode_rewards.append(episode_reward) if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: print("Timestep %d" % (t,)) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open('statistics.pkl', 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % 'statistics.pkl') ### 5. Save a checkpoint if t % SAVE_EVERY_N_STEPS == 0 and t > learning_starts: save_checkpoint({ 'epoch': t + 1, 'model_state_dict': Q.state_dict(), 'target_state_dict': target_Q.state_dict(), 'optimizer' : optimizer.state_dict(), }, "checkpoints/checkpoint.%d.tar" % t)
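# save_checkpoint() is called above but not defined in this excerpt; a minimal
# sketch of such a helper (hypothetical, the repository's actual implementation
# may differ):
import os

import torch


def save_checkpoint(state, filename):
    """Persist a checkpoint dict (model/target/optimizer state) to disk."""
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    torch.save(state, filename)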
def dqn_learing( env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000 ): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.size # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history return torch.IntTensor([[model(Variable(obs)).data.max(1)[1].cpu()]]) else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function Q = q_func(input_arg, num_actions).type(dtype) target_Q = q_func(input_arg, num_actions).type(dtype) # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 writer = SummaryWriter() for t in count(): ### Step the env and store the transition # Store lastest observation in replay memory and last_idx can be used to store action, reward, done last_idx = replay_buffer.store_frame(last_obs) # encode_recent_observation will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. 
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action until learning starts
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done = env.step(action)
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q value; q_func takes only the state and outputs a value
            # for every state-action pair. We pick the Q value of the action taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))
            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for next Q to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)

            loss = F.smooth_l1_loss(current_Q_values, target_Q_values.unsqueeze(1))

            optimizer.zero_grad()
            loss.backward()

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically update the target network by Q network to target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_rewards = env.get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        if len(episode_rewards) > 0:
            writer.add_scalar('data/DQN/score', episode_rewards[-1], len(episode_rewards))
            writer.add_scalar('data/DQN/mean_score', mean_episode_reward, len(episode_rewards))
            if len(episode_rewards) > 100:
                writer.add_scalar('data/DQN/best_mean_score', best_mean_episode_reward, len(episode_rewards))

        # save the log
        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t,))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()
            torch.save(Q, 'DQN_net1029.pt')

    writer.close()
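# The update above uses the standard DQN target (max over target_Q). For a
# Double DQN variant (as in the TensorFlow agent earlier in this section),
# the target computation inside the training step could be replaced along
# these lines -- a sketch only, not part of the original training loop:
# select the greedy action with the online network ...
next_actions = Q(next_obs_batch).detach().max(1)[1]
# ... but evaluate it with the target network.
next_max_q = target_Q(next_obs_batch).detach().gather(
    1, next_actions.unsqueeze(1)).squeeze(1)
next_Q_values = not_done_mask * next_max_q
target_Q_values = rew_batch + (gamma * next_Q_values)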
experts.append(model)
# with model.graph.as_default():

print("LOADED ALL MODELS")

for i in range(len(experts)):
    guide = experts[i]
    guide_experience = [[]]
    num_points = 0
    state = env.reset()
    guide_replay_buffer = ReplayBuffer(
        config.buffer_size, config.state_history)
    while True:
        # store last state in buffer
        idx = guide_replay_buffer.store_frame(state)
        q_input = guide_replay_buffer.encode_recent_observation()
        action, _ = guide.get_best_action(q_input)

        # perform action in env
        new_state, reward, done, info = env.step(action)

        # store in replay memory
        guide_replay_buffer.store_effect(idx, action, reward, done)

        if len(guide_experience) <= num_points:
            guide_experience.append([])
        guide_experience[num_points].append((state, action, 0))
        state = new_state

        if abs(reward) == 1:
            cur_point_lis = guide_experience[num_points]
            for k in range(len(cur_point_lis)):
                index = int(len(cur_point_lis) - k - 1)
                if k == 0:
                    cur_point_lis[index] = (
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.gpu and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True # arguments LAMBDA = [1.0, 0.0, 1.0, 10e-5] # for [loss_dq, loss_n_dq, loss_jeq, loss_l2] CUDA_VISIBLE_DEVICES = 0 seed = args.seed train = args.train demo = args.demo task = args.task iteration = 3 convs = [(32, 7, 3), (64, 4, 2), (64, 3, 1)] non_pixel_layer = [64] in_feature = 7 * 7 * 64 hidden_actions = [128] hidden_value = [128] aggregator = "reduceLocalMean" dtype = torch.cuda.FloatTensor if torch.cuda.is_available( ) else torch.FloatTensor #if not train: # args.num_env_steps = 50000 base_kwargs = { 'non_pixel_layer': non_pixel_layer, 'convs': convs, 'frame_history_len': args.frame_history_len, 'in_feature': in_feature, 'hidden_actions': hidden_actions, 'hidden_value': hidden_value, 'aggregator': aggregator } # logger logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) # threads and device torch.set_num_threads(1) device = torch.device("cuda:0" if args.gpu else "cpu") print("device:", device) gpu = args.gpu if (gpu == True): print("current available gpu numbers: %d" % torch.cuda.device_count()) if torch.cuda.is_available(): torch.cuda.set_device(CUDA_VISIBLE_DEVICES) print("CUDA Device: %d" % torch.cuda.current_device()) # envs #envs = gym.make(task) #obs_space = env.observation_space #act_space = env.action_space #action_template = env.action_space.noop() env = gym.make(args.task) obs_space = env.observation_space act_space = env.action_space action_template = env.action_space.noop() # policy actor_critic = Policy(obs_space, act_space, base_kwargs=base_kwargs) actor_critic.to(device) # algorithm if args.algo == 'ppo': agent = PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=7e-4, eps=1e-5, max_grad_norm=args.max_grad_norm, ) else: raise NotImplementedError # storage replay_buffer = None if args.frame_history_len > 1: _, _, non_pixel_shape = parse_obs_space(obs_space) add_non_pixel = True if non_pixel_shape > 0 else False replay_buffer = ReplayBuffer(100000, args.frame_history_len, non_pixel_shape, add_non_pixel) rollouts = RolloutStorage(replay_buffer, args.frame_history_len, args.num_steps, args.num_processes, obs_space, act_space) obs = env.reset() #print("reset obs pov size: ",obs['pov'].shape) # obs: key: inventory.dirt... # (num_processes, size) pov, non_pixel_feature = get_obs_features(obs_space, obs) #pov, non_pixel_feature = multi_get_obs_features(obs) if args.frame_history_len > 1: last_stored_frame_idx = replay_buffer.store_frame( pov, non_pixel_feature) pov = replay_buffer.encode_recent_observation() / 255.0 # 12 h w pov = torch.from_numpy(pov.copy()).reshape(args.num_processes, *pov.shape) elif args.frame_history_len == 1: pov = pov.transpose(2, 0, 1) / 255.0 pov = torch.from_numpy(pov.copy()).reshape(args.num_processes, *pov.shape) else: raise NotImplementedError non_pixel_feature = (torch.tensor(non_pixel_feature) / 180.0).reshape( args.num_processes, -1) rollouts.obs[0].copy_(pov) rollouts.non_pixel_obs[0].copy_(non_pixel_feature) rollouts.to(device) # ? 
episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes print("Total steps: ", args.num_env_steps) ep = 0 ep_rewards = [] #mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') #total_rewards = [0 for i in range(args.num_processes)] total_rewards = 0 for j in range(num_updates): for step in range(args.num_steps): # num_steps = 5 # Sample actions with torch.no_grad(): # actor_critic.act output size # actions: torch.Tensor, not list value, actions, action_log_probs = actor_critic.act( rollouts.obs[step], rollouts.non_pixel_obs[step]) # value size: batch x 1 # actions size: torch.Tensor num_processes x num_branches # action_log_probs : torch.Tensor num_processes x num_branches #print(actions) actions_list = actions.squeeze().tolist() action = get_actions_continuous(actions_list, act_space, action_template) # step: #print(actions) obs, reward, done, infos = env.step(action) #print('.',end='') if args.num_env_steps <= 50000: env.render() pov, non_pixel_feature = get_obs_features(obs_space, obs) #pov, non_pixel_feature = multi_get_obs_features(obs) if args.frame_history_len > 1: last_stored_frame_idx = replay_buffer.store_frame( pov, non_pixel_feature) pov = replay_buffer.encode_recent_observation( ) / 255.0 # 12 h w pov = torch.from_numpy(pov.copy()).reshape( args.num_processes, *pov.shape) elif args.frame_history_len == 1: pov = pov.transpose(2, 0, 1) / 255.0 pov = torch.from_numpy(pov.copy()).reshape( args.num_processes, *pov.shape) else: raise NotImplementedError non_pixel_feature = (torch.tensor(non_pixel_feature) / 180.0).reshape(args.num_processes, -1) total_rewards += reward #for i in range(len(reward)): # total_rewards[i] += reward[i] reward = torch.tensor([reward]).reshape(args.num_processes, -1).type(dtype) # TODO: may not need bas_masks masks = torch.FloatTensor([[0.0] if done else [1.0]]) bad_masks = torch.FloatTensor([[1.0]]) if done: ep += 1 ep_rewards.append(total_rewards) best_mean_episode_reward = log(j, args.task, ep, np.array(ep_rewards), best_mean_episode_reward) obs = env.reset() pov, non_pixel_feature = get_obs_features(obs_space, obs) #pov, non_pixel_feature = multi_get_obs_features(obs) if args.frame_history_len > 1: last_stored_frame_idx = replay_buffer.store_frame( pov, non_pixel_feature) pov = replay_buffer.encode_recent_observation( ) / 255.0 # 12 h w pov = torch.from_numpy(pov.copy()).reshape( args.num_processes, *pov.shape) elif args.frame_history_len == 1: pov = pov.transpose(2, 0, 1) / 255.0 pov = torch.from_numpy(pov.copy()).reshape( args.num_processes, *pov.shape) else: raise NotImplementedError non_pixel_feature = (torch.tensor(non_pixel_feature) / 180.0).reshape(args.num_processes, -1) total_rewards = 0 # ? 
rollouts.insert(pov, non_pixel_feature, actions, action_log_probs, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.non_pixel_obs[-1]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) # TODO: minibathc = 32, 1 processes x 10 step should larger than 32 value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_model_dir != '': save_path = os.path.join(args.save_model_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save(actor_critic, os.path.join(save_path, args.task + ".pt")) if j % args.log_interval == 0 and len(ep_rewards) >= 0: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print("----------- Logs -------------") if len(ep_rewards) == 0: print( "Updates {}, num timesteps {}, FPS {} \nThe {}th training episodes," .format(j, total_num_steps, int(total_num_steps / (end - start)), len(ep_rewards))) else: print( "Updates {}, num timesteps {}, FPS {} \nThe {}th training episodes,\nmean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(ep_rewards), np.mean(ep_rewards), np.median(ep_rewards), np.min(ep_rewards), np.max(ep_rewards))) print("-----------------------Training ends-----------------------") env.close()
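The observation preprocessing in main() (store frame, encode_recent_observation, divide by 255, reshape, divide angles by 180) is repeated three times: at the initial reset, after every step, and after an episode reset. A minimal sketch of how it could be factored into one helper, under the same assumptions the loop already makes (pov is an HxWxC uint8 array, non-pixel features are degree-valued); the name preprocess_obs is hypothetical:

import torch

def preprocess_obs(obs, obs_space, replay_buffer, frame_history_len, num_processes):
    # Mirrors the inline preprocessing in main(); relies on the existing
    # get_obs_features helper and on the replay buffer's frame stacking.
    pov, non_pixel_feature = get_obs_features(obs_space, obs)
    if frame_history_len > 1:
        replay_buffer.store_frame(pov, non_pixel_feature)
        pov = replay_buffer.encode_recent_observation() / 255.0  # (hist*C, H, W)
    elif frame_history_len == 1:
        pov = pov.transpose(2, 0, 1) / 255.0  # HWC -> CHW
    else:
        raise NotImplementedError
    pov = torch.from_numpy(pov.copy()).reshape(num_processes, *pov.shape)
    non_pixel = (torch.tensor(non_pixel_feature) / 180.0).reshape(num_processes, -1)
    return pov, non_pixel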
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None,
                replay_buffer_size=1000000, batch_size=32, gamma=0.99,
                learning_starts=50000, learning_freq=4, frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.
    You can specify your own convnet using q_func.
    All schedules are w.r.t. total number of steps taken in the environment.
    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    ###############
    # BUILD MODEL #
    ###############
    # Set input_arg for the Q function according to the observation size
    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    # Get the number of actions available in the simulator
    num_actions = env.action_space.size

    # Construct an epsilon-greedy policy with the given exploration schedule:
    ## compare a randomly drawn sample against the exploration schedule and
    ## return a greedy or random action accordingly
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode,
            # i.e. don't save the history
            return torch.IntTensor(
                [[model(Variable(obs)).data.max(1)[1].cpu()]])
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct the Q network optimizer from optimizer_spec
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    #   RUN ENV   #
    ###############
    # Set initial values
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    # Monitor with TensorboardX
    writer = SummaryWriter()

    # t starts at 0 and grows by one every pass through the loop, so it tells
    # how many iterations have been executed
    for t in count():
        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx is used to
        # store the corresponding action, reward, and done flag
        last_idx = replay_buffer.store_frame(last_obs)

        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action until learning starts; once t exceeds
        # learning_starts, i.e. enough iterations have passed, take the action
        # from the learned Q instead
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0]
        else:
            action = random.randrange(num_actions)

        # Advance one step: take the action, record the resulting observation
        # (obs), reward, and done flag, and push them into the replay_buffer
        obs, reward, done = env.step(action)
        replay_buffer.store_effect(last_idx, action, reward, done)

        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        ## Learning runs once t > learning_starts, t matches the learning_freq
        ## period, and the buffer can supply a full batch
        if (t > learning_starts and t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)
            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q value, q_func takes only state and output value for every state-action pair
            # We choose Q based on action taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))

            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for next Q to propagated
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q

            # Compute the target of the current Q values, then optimize via backward
            target_Q_values = rew_batch + (gamma * next_Q_values)
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()

            # Perform the update and increment the update counter
            optimizer.step()
            num_param_updates += 1

            # Periodically update the target network by Q network to target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        # Track the episode reward; after more than 100 episodes, also track
        # the best mean alongside the mean
        episode_rewards = env.get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        # Log to Tensorboard
        if len(episode_rewards) > 0:
            writer.add_scalar('data/DQN/score', episode_rewards[-1], len(episode_rewards))
            writer.add_scalar('data/DQN/mean_score', mean_episode_reward, len(episode_rewards))
        if len(episode_rewards) > 100:
            writer.add_scalar('data/DQN/best_mean_score', best_mean_episode_reward, len(episode_rewards))

        # Print training progress
        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()
            torch.save(Q, 'DQN_net1029.pt')  ## save to an appropriate file

    writer.close()
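The loop above only consumes exploration.value(t); the Schedule class itself lives elsewhere (utils.schedule). A minimal sketch of the linear annealing schedule this API implies, given as an assumption rather than the project's actual implementation:

class LinearSchedule(object):
    """Anneal a value linearly from initial_p to final_p over schedule_timesteps."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Fraction of the schedule completed so far, capped at 1.0.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# e.g. anneal epsilon from 1.0 down to 0.1 over the first million steps
exploration = LinearSchedule(schedule_timesteps=1000000, final_p=0.1)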
def dqn_learn(env, q_func, optimizer_spec, exploration, stopping_criterion,
              replay_buffer_size, batch_size, gamma, learning_starts,
              learning_freq, frame_history_len, target_update_freq,
              grad_norm_clipping, double_q):
    """Implements DQN training
    Parameters
    ----------
    env : gym.Env
        OpenAI gym environment
    q_func : torch.nn.Module
        DQN that computes q-values for each action: (state) -> (q-value, action)
    optimizer_spec : OptimizerSpec
        parameters for the optimizer
    exploration : Schedule
        schedule for epsilon-greedy exploration
    stopping_criterion : func
        when to stop training: (env, num_timesteps) -> bool
    replay_buffer_size : int
        experience replay memory size
    batch_size : int
        batch size to sample from replay memory
    gamma : float
        discount factor
    learning_starts : int
        number of environment steps before starting the training process
    learning_freq : int
        number of environment steps between updating DQN weights
    frame_history_len : int
        number of previous frames to include as DQN input
    target_update_freq : int
        number of experience replay steps to update the target network
    grad_norm_clipping : float
        maximum size of gradients to clip to
    double_q : bool
        enable double DQN learning
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    def select_action(dqn, obs, t):
        """Implements epsilon-greedy exploration
        Parameters
        ----------
        dqn : torch.nn.Module
            DQN model
        obs : np.ndarray
            Stacked input frames to evaluate
        t : int
            Current time step
        Returns
        -------
        nd.array (1,1) action to take
        """
        threshold = exploration.value(t)
        if random.random() > threshold:
            # take optimal action
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # DQN returns (q-value, action)
            q_values = dqn(obs)
            # returns (max, argmax) of q-values (max q-value, action which produces max q-value)
            _, action = q_values.data.max(1)
        else:
            # take a random action
            action = torch.IntTensor([random.randrange(num_actions)])
        return action

    # get input sizes and num actions
    img_h, img_w, img_c = env.observation_space.shape
    in_channels = frame_history_len * img_c
    input_shape = (img_h, img_w, in_channels)
    num_actions = env.action_space.n

    # construct online and target DQNs
    online_DQN = q_func(in_channels=in_channels, num_actions=num_actions)
    target_DQN = q_func(in_channels=in_channels, num_actions=num_actions)

    # construct optimizer
    optimizer = optimizer_spec.constructor(online_DQN.parameters(), **optimizer_spec.kwargs)

    # construct replay memory
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    # initialize main loop variables
    num_param_updates = 0
    avg_episode_reward = float('-inf')
    best_avg_episode_reward = float('-inf')
    cumulative_avg_episode_reward = float('-inf')
    prev_obs = env.reset()

    # main training loop
    for t in count():
        # check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env, t):
            break

        # store transition and concatenate last frames
        last_idx = replay_buffer.store_frame(prev_obs)

        # stack previous frames into a tensor to give to DQN
        stacked_obs = replay_buffer.encode_recent_observation()

        # take random actions until we've officially started training
        if t > learning_starts:
            # select action according to epsilon-greedy
            action = select_action(online_DQN, stacked_obs, t)[0]
        else:
            # take a random action
            action = random.randrange(num_actions)

        # step environment
        obs, reward, done, _ = env.step(action)

        # clip reward to [-1, 1]
        reward = max(-1.0, min(reward, 1.0))

        # store effect of taking action in prev_obs into replay memory
        replay_buffer.store_effect(last_idx, action,
reward, done) # if game is finished, reset environment if done: obs = env.reset() prev_obs = obs # experience replay if t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample( batch_size): # sample batches obs_batch, action_batch, reward_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) obs_batch = torch.from_numpy(obs_batch).type(dtype) / 255.0 action_batch = torch.from_numpy(action_batch).long() reward_batch = torch.from_numpy(reward_batch) next_obs_batch = torch.from_numpy(next_obs_batch).type( dtype) / 255.0 not_done_mask = torch.from_numpy(1 - done_mask).type(dtype) if torch.cuda.is_available(): action_batch = action_batch.cuda() reward_batch = reward_batch.cuda() # Compute current q-values: Q(s, a) # Select q-values based on actions we would have taken for each state # shape: (BATCH_SIZE, 1) current_q_values = online_DQN(obs_batch).gather( 1, action_batch.unsqueeze(1)) # double DQN or vanilla DQN if double_q: # compute which actions to take according to online network: argmax_a Q(s', a) greedy_actions = online_DQN(next_obs_batch).detach().max(1)[1] # compute q-values of those actions using target network: Q_hat(s', argmax_a Q(s', a)) next_q_values = target_DQN(next_obs_batch).gather( 1, greedy_actions.unsqueeze(1)) else: # Compute next q-values using target network next_q_values = target_DQN(next_obs_batch).detach().max(1)[0] next_q_values = next_q_values.unsqueeze(1) # apply mask to retain q-values next_q_values = not_done_mask.unsqueeze(1) * next_q_values """ Compute the target q-values (BATCH_SIZE, 1) y_j = r_j + gamma * max_a' Q(s', a') for vanilla DQN y_j = r_j + gamma * Q_hat(s', argmax_a Q(s', a)) for double DQN """ target_q_values = reward_batch + (gamma * next_q_values) """ Use the huber loss instead of clipping the TD error. Huber loss intuitively means we assign a much larger loss where the error is large (quadratic) Smaller errors equate to smaller losses (linear) """ loss = F.smooth_l1_loss(current_q_values, target_q_values) # Clear previous gradients before backward pass optimizer.zero_grad() # run backward pass loss.backward() # clip gradients nn.utils.clip_grad_norm_(online_DQN.parameters(), grad_norm_clipping) # update weights of dqn optimizer.step() num_param_updates += 1 # update target network weights if num_param_updates % target_update_freq == 0: target_DQN.load_state_dict(online_DQN.state_dict()) # end experience replay # log progress so far by averaging last 100 episodes episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: avg_episode_reward = np.mean(episode_rewards[-100:]) cumulative_avg_episode_reward = np.mean(episode_rewards) if len(episode_rewards) > 100: best_avg_episode_reward = max(best_avg_episode_reward, avg_episode_reward) if t % LOG_FREQ == 0 and t > learning_starts: print('-' * 64) print('Timestep {}'.format(t)) print( 'Average reward (100 episodes): {}'.format(avg_episode_reward)) print('Best average reward: {}'.format(best_avg_episode_reward)) print('Cumulative average reward: {}'.format( cumulative_avg_episode_reward)) print('Episode {}'.format(len(episode_rewards))) print('Exploration {}'.format(exploration.value(t))) print('\n') sys.stdout.flush()
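dqn_learn only assumes that optimizer_spec exposes .constructor and .kwargs. A minimal sketch of a compatible OptimizerSpec, with illustrative RMSprop hyperparameters that are an assumption, not values taken from this code base:

from collections import namedtuple
import torch.optim as optim

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=0.00025, alpha=0.95, eps=0.01),
)
# dqn_learn then builds the optimizer as
# optimizer_spec.constructor(online_DQN.parameters(), **optimizer_spec.kwargs)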
def train(self, exp_schedule, lr_schedule): """ Performs training of Q Args: exp_schedule: Exploration instance s.t. exp_schedule.get_action(best_action) returns an action lr_schedule: Schedule for learning rate """ # initialize replay buffer and variables replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history) rewards = deque(maxlen=self.config.num_episodes_test) last_frames = deque(maxlen=4) max_q_values = deque(maxlen=1000) q_values = deque(maxlen=1000) self.init_averages() t = last_eval = last_record = 0 # time control of nb of steps scores_eval = [] # list of scores computed at iteration time scores_eval += [] embeddings = [] extractor = PongExtractor() prog = Progbar(target=self.config.nsteps_train) # interact with environment while t < 2000: total_reward = 0 state = self.env.reset() last_frame = state last_frames.append(state) while True: t += 1 last_eval += 1 last_record += 1 if self.config.render_train: self.env.render() feats = extractor.extract(np.squeeze(state)) # replay memory stuff idx = replay_buffer.store_frame(state) q_input = replay_buffer.encode_recent_observation() # chose action according to current Q and exploration best_action, q_values = self.get_best_action(q_input) embedding = self.sess.run(self.hidden, feed_dict={self.s: [q_input]})[0] # embedding = self.sess.run(self.q, feed_dict={self.s: [q_input]})[0] # print embedding.shape embeddings.append(embedding) action = best_action frame = np.squeeze(state) scipy.misc.imsave( 'embeddings/breakout/breakout{}.png'.format(t), frame) # store q values max_q_values.append(max(q_values)) q_values += list(q_values) # perform action in env new_state, reward, done, info = self.env.step(action) replay_buffer.store_effect(idx, action, reward, done) state = new_state total_reward += reward if done or t >= 2000: print total_reward, t break # updates to perform at the end of an episode rewards.append(total_reward) # last words print 'Saving embeddings' np.save(open('embeddings/breakout/breakout.npy', 'w'), np.vstack(embeddings))
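The function above only collects and saves the hidden-layer embeddings; one plausible downstream use (an assumption, not something this code does) is a 2-D t-SNE projection of the saved .npy file:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

emb = np.load("embeddings/breakout/breakout.npy")  # saved by train() above
xy = TSNE(n_components=2).fit_transform(emb)       # project to 2-D
plt.scatter(xy[:, 0], xy[:, 1], s=4)
plt.savefig("embeddings/breakout/tsne.png")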
def train(self, exp_schedule, lr_schedule): # Initialize replay buffer and variables replay_buffer = ReplayBuffer(self.FLAGS.buffer_size, self.FLAGS.state_hist) rewards = deque(maxlen=self.FLAGS.num_test) max_q_values = deque(maxlen=1000) q_values = deque(maxlen=1000) self.init_averages() t = 0 # time control of nb of steps loss_eval = grad_eval = 0 scores_eval = [] # list of scores computed at iteration time scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)] self.prog = Progbar(target=self.FLAGS.train_steps) # Train for # of train steps while t < self.FLAGS.train_steps: continual_crash = 0 try: total_reward = 0 ep_len = 0 state = self.env.reset() # Run for 1 episode and update the buffer while True: ep_len += 1 # replay memory stuff idx = replay_buffer.store_frame(state) q_input = replay_buffer.encode_recent_observation() # chose action according to current Q and exploration best_action, q_values = self.network.get_best_action( q_input) action = exp_schedule.get_action(best_action) # store q values max_q_values.append(max(q_values)) q_values += list(q_values) # perform action in env new_state, reward, done, info = self.env.step(action) # store the transition replay_buffer.store_effect(idx, action, reward, done) state = new_state # Count reward total_reward += reward # Stop at end of episode if done: break #Store episodic rewards if ep_len > 1: rewards.append(total_reward) # Learn using replay while True: t += 1 ep_len -= 1 # Make train step if necessary if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.learn_every == 0)): loss_eval, grad_eval = self.network.update_step( t, replay_buffer, lr_schedule.epsilon, self.summary) exp_schedule.update(t) lr_schedule.update(t) if (t % self.FLAGS.target_every == 0): self.network.update_target_params() # Update logs if necessary if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0) and (len(rewards) > 0)): self.update_averages(rewards, max_q_values, q_values, scores_eval) self.update_logs(t, loss_eval, rewards, exp_schedule.epsilon, grad_eval, lr_schedule.epsilon) # Update logs if necessary elif (t < self.FLAGS.learn_start) and ( t % self.FLAGS.log_every == 0): sys.stdout.write( "\rPopulating the memory {}/{}...".format( t, self.FLAGS.learn_start)) sys.stdout.flush() if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.check_every == 0)): # Evaluate current model scores_eval += [ self.evaluate(self.env, self.FLAGS.num_test) ] # Save current Model self.network.save() # Record video of current model if self.FLAGS.record: self.record() if ep_len <= 0 or t >= self.FLAGS.train_steps: break continual_crash = 0 except Exception as e: continual_crash += 1 self.logger.info(e) if continual_crash >= 10: self.logger.info("Crashed 10 times -- stopping u suck") raise e else: t -= 1 self.logger.info("Env crash, making new env") time.sleep(60) self.env = create_slither_env(self.FLAGS.state_type) self.env = Unvectorize(self.env) self.env.configure(fps=self.FLAGS.fps, remotes=self.FLAGS.remotes, start_timeout=15 * 60, vnc_driver='go', vnc_kwargs={ 'encoding': 'tight', 'compress_level': 0, 'fine_quality_level': 50 }) time.sleep(60) # End of training self.logger.info("- Training done.") self.network.save() scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)] export_plot(scores_eval, "Scores", self.FLAGS.plot_path)
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): """Run Deep Q-learning algorithm. You can specify your own conv-net using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of choseing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: print(env.observation_space.shape) img_h, img_w, img_c = env.observation_space.shape # input_arg = frame_history_len * img_c input_arg = frame_history_len * 1 num_actions = env.action_space.n print(env.action_space) print(f"({input_arg}): ({img_h}X{img_w}X{img_c})") # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. 
don’t save the history with torch.no_grad(): values = model(Variable(obs)) return values.data.max(1)[1].cpu().unsqueeze(dim=1) else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function Q = q_func(input_arg, num_actions).type(dtype) target_Q = q_func(input_arg, num_actions).type(dtype) # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() obs = cv.cvtColor(last_obs, cv.COLOR_BGR2GRAY) obs = cv.resize(obs, dsize=(obs.shape[1] // 2, obs.shape[0] // 2)) last_obs = obs[..., np.newaxis] print("Q model:") summary(Q, input_size=(input_arg, last_obs.shape[0], last_obs.shape[1])) print("Q-TARGET model:") summary(target_Q, input_size=(input_arg, last_obs.shape[0], last_obs.shape[1])) LOG_EVERY_N_STEPS = 10000 rewards = 0. out_count = 0 for t in count(): ### Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break if t % 1e3 == 0: if out_count == 0: stdout.write("|") out_count += 1 elif out_count % 10 == 0: stdout.write(f"{out_count}|") out_count += 1 elif out_count >= 50: stdout.write("=> \n") out_count = 0 else: stdout.write(".") out_count += 1 stdout.flush() ### Step the env and store the transition # Store lastest observation in replay memory and last_idx can be used to store action, reward, done last_idx = replay_buffer.store_frame(last_obs) # encode_recent_observation will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. recent_observations = replay_buffer.encode_recent_observation() # Choose random action if not yet start learning if t > learning_starts: values = select_epilson_greedy_action(Q, recent_observations, t) action = values[0, 0] else: action = random.randrange(num_actions) # Advance one step obs, reward, done, _ = env.step(action) rewards += reward # clip rewards between -1 and 1 reward = max(-1.0, min(reward, 1.0)) # Store other info in replay memory replay_buffer.store_effect(last_idx, action, reward, done) # Resets the environment when reaching an episode boundary. if done: obs = env.reset() print(len(episode_rewards), episode_rewards, rewards) rewards = 0. # print(obs.shape) # cv.imshow('now_color', obs) # cv.waitKey(1) obs = cv.cvtColor(obs, cv.COLOR_BGR2GRAY) obs = cv.resize(obs, dsize=(obs.shape[1] // 2, obs.shape[0] // 2)) obs = obs[..., np.newaxis] # cv.imshow('now', obs) # cv.waitKey(1) last_obs = obs ### Perform experience replay and train the network. 
# Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Use the replay buffer to sample a batch of transitions # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode, # in which case there is no Q-value at the next state; at the end of an # episode, only the current state reward contributes to the target obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) # Convert numpy nd_array to torch variables for calculation obs_batch = Variable( torch.from_numpy(obs_batch).type(dtype) / 255.0) act_batch = Variable(torch.from_numpy(act_batch).long()) rew_batch = Variable(torch.from_numpy(rew_batch)) next_obs_batch = Variable( torch.from_numpy(next_obs_batch).type(dtype) / 255.0) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: act_batch = act_batch.cuda() rew_batch = rew_batch.cuda() # Compute current Q value, q_func takes only state and output value for every state-action pair # We choose Q based on action taken. values = Q(obs_batch) current_Q_values = values.gather(1, act_batch.unsqueeze(1)).squeeze() # Compute next Q value based on which action gives max Q values # Detach variable from the current graph since we don't want gradients for next Q to propagated next_max_q = target_Q(next_obs_batch).detach().max(1)[0] next_Q_values = not_done_mask * next_max_q # Compute the target of the current Q values target_Q_values = rew_batch + (gamma * next_Q_values) # Compute Bellman error bellman_error = target_Q_values - current_Q_values # clip the bellman error between [-1 , 1] clipped_bellman_error = bellman_error.clamp(-1, 1) # Note: clipped_bellman_delta * -1 will be right gradient d_error = clipped_bellman_error * -1.0 # Clear previous gradients before backward pass optimizer.zero_grad() # run backward pass # current_Q_values.backward(d_error.data.unsqueeze(1)) current_Q_values.backward(d_error.data) # Perfom the update optimizer.step() num_param_updates += 1 # Periodically update the target network by Q network to target Q network if num_param_updates % target_update_freq == 0: target_Q.load_state_dict(Q.state_dict()) ### 4. Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open('statistics.pkl', 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % 'statistics.pkl')
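This variant backpropagates the clipped Bellman error by hand through current_Q_values.backward(d_error.data). Up to the mean-over-batch factor, that is the same gradient the Huber (smooth L1) loss with delta = 1 would produce, so an equivalent formulation (a sketch under that assumption, not the code's own choice) is:

import torch.nn.functional as F

# Same per-sample gradient as clipping the TD error to [-1, 1], except that
# smooth_l1_loss averages over the batch instead of summing.
loss = F.smooth_l1_loss(current_Q_values, target_Q_values.detach())
optimizer.zero_grad()
loss.backward()
optimizer.step()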
def train(self, exp_schedule, lr_schedule, choose_teacher_strategy=None): """ Performs training of Q Args: exp_schedule: Exploration instance s.t. exp_schedule.get_action(best_action) returns an action lr_schedule: Schedule for learning rate """ replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history) rewards = deque(maxlen=self.config.num_episodes_test) max_q_values = deque(maxlen=1000) q_values = deque(maxlen=1000) self.init_averages() t = last_eval = last_record = 0 # time control of nb of steps scores_eval = [] # list of scores computed at iteration time scores_eval += [self.evaluate()] prog = Progbar(target=self.config.nsteps_train) # interact with environment allsteps = [] while t < self.config.nsteps_train: total_reward = 0 state = self.env.reset() while True: if self.config.state_subspace is not None: out_of_bounds = False if self.config.state_subspace in [ 'ball_top_half', 'ball_bottom_half' ]: image = self.env.unwrapped._get_obs() ball_position = ball_half_screen_position(image) # check if ball is in top half but we're restricted to bottom half if ball_position == 1 and self.config.state_subspace == 'ball_bottom_half': out_of_bounds = True # check if ball is in bottom half but we're restricted to top half elif ball_position == 0 and self.config.state_subspace == 'ball_top_half': out_of_bounds = True else: raise NotImplementedError if out_of_bounds: # current state is outside of this agent's state subspace # perform action in env state, reward, done, info = self.env.step(action) t += 1 last_eval += 1 last_record += 1 if self.config.render_train: self.env.render() # replay memory stuff idx = replay_buffer.store_frame(state) q_input = replay_buffer.encode_recent_observation() # self.q_inputs.append(q_input) # chose action according to current Q and exploration best_action, q_values = self.get_best_action(q_input) action = exp_schedule.get_action(best_action) # store q values max_q_values.append(max(q_values)) q_values += list(q_values) # perform action in env new_state, reward, done, info = self.env.step(action) # store the transition replay_buffer.store_effect(idx, action, reward, done) state = new_state if choose_teacher_strategy is not None: # store the reward with the teacher choice strategy choose_teacher_strategy.store_reward(reward, q_input) # perform a training step loss_eval, grad_eval = self.train_step( t, replay_buffer, lr_schedule.epsilon, choose_teacher_strategy) # logging stuff if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and (t % self.config.learning_freq == 0)): self.update_averages(rewards, max_q_values, q_values, scores_eval) exp_schedule.update(t) lr_schedule.update(t) if choose_teacher_strategy is not None: choose_teacher_strategy.update_schedule(t) if len(rewards) > 0: exact = [("Loss", loss_eval), ("Avg R", self.avg_reward), ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon), ("Grads", grad_eval), ("Max Q", self.max_q), ("lr", lr_schedule.epsilon)] if choose_teacher_strategy is not None and hasattr( choose_teacher_strategy, 'eps_schedule'): exact.append( ("Choose teacher eps", choose_teacher_strategy.eps_schedule.epsilon)) prog.update(t + 1, exact=exact) elif ((t > self.config.learning_start) and (t % self.config.save_teacher_choice_freq == 0) and (choose_teacher_strategy is not None)): choose_teacher_strategy.save( self.config.teacher_choice_output_path) elif (t < self.config.learning_start) and ( t % self.config.log_freq == 0): sys.stdout.write("\rPopulating the memory {}/{}...".format( t, 
self.config.learning_start)) sys.stdout.flush() # count reward total_reward += reward if done or t >= self.config.nsteps_train: break # updates to perform at the end of an episode rewards.append(total_reward) if (t > self.config.learning_start) and (last_eval > self.config.eval_freq): # evaluate our policy last_eval = 0 print("") scores_eval += [self.evaluate()] if (t > self.config.learning_start) and self.config.record and ( last_record > self.config.record_freq): self.logger.info("Recording...") last_record = 0 self.record() # last words self.logger.info("- Training done.") self.save() scores_eval += [self.evaluate()] export_plot(scores_eval, "Scores", self.config.plot_output) if choose_teacher_strategy is not None: choose_teacher_strategy.save( self.config.teacher_choice_output_path)
def train(self, exp_schedule, lr_schedule): """ Performs training of Q Args: exp_schedule: Exploration instance s.t. exp_schedule.get_action(best_action) returns an action lr_schedule: Schedule for learning rate """ # initialize replay buffer and variables ############ # PLAYER 1 # ############ replay_buffer_p1 = ReplayBuffer(self.config.buffer_size, self.config.state_history) rewards_p1 = deque(maxlen=self.config.num_episodes_test) max_q_values_p1 = deque(maxlen=1000) q_values_p1 = deque(maxlen=1000) ############ # PLAYER 2 # ############ replay_buffer_p2 = ReplayBuffer(self.config.buffer_size, self.config.state_history) rewards_p2 = deque(maxlen=self.config.num_episodes_test) max_q_values_p2 = deque(maxlen=1000) q_values_p2 = deque(maxlen=1000) self.init_averages() t = last_eval = last_record = 0 # time control of nb of steps scores_eval = [] # list of scores computed at iteration time scores_eval += [self.evaluate()] prog = Progbar(target=self.config.nsteps_train) # interact with environment while t < self.config.nsteps_train: total_reward_p1 = 0 total_reward_p2 = 0 state = self.env.reset() while True: t += 1 last_eval += 1 last_record += 1 if self.config.render_train: self.env.render() # replay memory stuff idx_p1 = replay_buffer_p1.store_frame(state) # should get observation from last frame of p2 q_input_p1 = replay_buffer_p2.encode_recent_observation() ############ # PLAYER 1 # ############ # chose action according to current Q and exploration best_action, q_values = self.get_best_action(q_input) action = exp_schedule.get_action(best_action) # store q values max_q_values_p1.append(max(q_values)) q_values_p1 += list(q_values) # perform action in env new_state, reward, done, info = self.env.step(action) # store the transition replay_buffer_p1.store_effect(idx, action, reward, done) state = new_state # BEFORE MOVING TO PLAYER 2, need to check if terminal TODO ############ # PLAYER 2 # ############ idx_p2 = replay_buffer_p2.store_frame(state) q_input_p2 = replay_buffer_p1.encode_recent_observation() # TODO: need to flip the input board state print(q_input_p2) # chose action according to current Q and exploration best_action, q_values = self.get_best_action(q_input_p2) action = exp_schedule.get_action(best_action) # store q values max_q_values_p2.append(max(q_values)) q_values_p2 += list(q_values) # perform action in env new_state, reward, done, info = self.env.step(action) # store the transition replay_buffer_p2.store_effect(idx, action, reward, done) state = new_state # perform a training step # loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon) # logging stuff if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and (t % self.config.learning_freq == 0)): self.update_averages(rewards, max_q_values, q_values, scores_eval) exp_schedule.update(t) lr_schedule.update(t) if len(rewards) > 0: prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg R", self.avg_reward), ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon), ("Grads", grad_eval), ("Max Q", self.max_q), ("lr", lr_schedule.epsilon)]) elif (t < self.config.learning_start) and (t % self.config.log_freq == 0): sys.stdout.write("\rPopulating the memory {}/{}...".format(t, self.config.learning_start)) sys.stdout.flush() # count reward total_reward += reward if done or t >= self.config.nsteps_train: break # updates to perform at the end of an episode rewards.append(total_reward) if (t > self.config.learning_start) and (last_eval > self.config.eval_freq): # evaluate our policy last_eval = 0 
print("") scores_eval += [self.evaluate()] if (t > self.config.learning_start) and self.config.record and (last_record > self.config.record_freq): self.logger.info("Recording...") last_record =0 self.record() # last words self.logger.info("- Training done.") self.save() scores_eval += [self.evaluate()] export_plot(scores_eval, "Scores", self.config.plot_output)
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000, statistics_file_name="statistics.pkl"): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network statistics_file_name: str Where to store the statistics file """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete print("STATISTICS_FILE_NAME: {}".format(statistics_file_name)) ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type( torch_types.FloatTensor).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history with torch.no_grad(): return model(Variable(obs)).data.max(1)[1].cpu() else: return random.randrange(num_actions) # Initialize target q function and q function, i.e. build the model. ###### # YOUR CODE HERE policy_net = q_func(input_arg, num_actions).to(device).type( torch_types.FloatTensor) # Q target_net = q_func(input_arg, num_actions).to(device).type( torch_types.FloatTensor) # Q target target_net.load_state_dict( policy_net.state_dict()) # copies the state of policy Q into target ###### # Construct policy_net network optimizer function optimizer = optimizer_spec.constructor(policy_net.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 for t in count(): ### 1. 
Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break ### 2. Step the env and store the transition # At this point, "last_obs" contains the latest observation that was # recorded from the simulator. Here, your code needs to store this # observation and its outcome (reward, next observation, etc.) into # the replay buffer while stepping the simulator forward one step. # At the end of this block of code, the simulator should have been # advanced one step, and the replay buffer should contain one more # transition. # Specifically, last_obs must point to the new latest observation. # Useful functions you'll need to call: # obs, reward, done, info = env.step(action) # this steps the environment forward one step # obs = env.reset() # this resets the environment if you reached an episode boundary. # Don't forget to call env.reset() to get a new observation if done # is true!! # Note that you cannot use "last_obs" directly as input # into your network, since it needs to be processed to include context # from previous frames. You should check out the replay buffer # implementation in dqn_utils.py to see what functionality the replay # buffer exposes. The replay buffer has a function called # encode_recent_observation that will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # Don't forget to include epsilon greedy exploration! # And remember that the first time you enter this loop, the model # may not yet have been initialized (but of course, the first step # might as well be random, since you haven't trained your net...) ##### # YOUR CODE HERE stored_frame_idx = replay_buffer.store_frame(last_obs) last_obs_encoded = replay_buffer.encode_recent_observation() action = select_epilson_greedy_action(policy_net, last_obs_encoded, t) obs, reward, done, info = env.step(action) replay_buffer.store_effect(stored_frame_idx, action, reward, done) if done: obs = env.reset() last_obs = obs ##### # at this point, the environment should have been advanced one step (and # reset if done was true), and last_obs should point to the new latest # observation ### 3. Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Here, you should perform training. Training consists of four steps: # 3.a: use the replay buffer to sample a batch of transitions (see the # replay buffer code for function definition, each batch that you sample # should consist of current observations, current actions, rewards, # next observations, and done indicator). # Note: Move the variables to the GPU if avialable # 3.b: fill in your own code to compute the Bellman error. This requires # evaluating the current and next Q-values and constructing the corresponding error. # Note: don't forget to clip the error between [-1,1], multiply is by -1 (since pytorch minimizes) and # maskout post terminal status Q-values (see ReplayBuffer code). # 3.c: train the model. To do this, use the bellman error you calculated perviously. 
# Pytorch will differentiate this error for you, to backward the error use the following API: # current.backward(d_error.data.unsqueeze(1)) # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error. # Your code should produce one scalar-valued tensor. # Note: don't forget to call optimizer.zero_grad() before the backward call and # optimizer.step() after the backward call. # 3.d: periodically update the target network by loading the current Q network weights into the # target_Q network. see state_dict() and load_state_dict() methods. # you should update every target_update_freq steps, and you may find the # variable num_param_updates useful for this (it was initialized to 0) ##### # YOUR CODE HERE sample = replay_buffer.sample(batch_size) obs_batch, actions_batch, rewards_batch, next_obs_batch, done_mask = sample # convert batches to pytorch tensors: obs_batch = torch.from_numpy(obs_batch).to(device).type( torch_types.FloatTensor) / 255.0 next_obs_batch = torch.from_numpy(next_obs_batch).to(device).type( torch_types.FloatTensor) / 255.0 actions_batch = torch.from_numpy(actions_batch).to(device).type( torch_types.LongTensor) rewards_batch = torch.from_numpy(rewards_batch).to(device).type( torch_types.FloatTensor) non_final_mask = 1 - torch.from_numpy(done_mask).to(device).type( torch_types.FloatTensor) # inspired by https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html: # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken state_action_values = policy_net(obs_batch).gather( 1, actions_batch.unsqueeze(1)).squeeze(1) # Compute V(s_{t+1}) for all next states. next_state_values = target_net(next_obs_batch).max( 1)[0].detach() * non_final_mask # Compute the expected Q values expected_state_action_values = (next_state_values * gamma) + rewards_batch # Compute loss d_error = state_action_values - expected_state_action_values # = -bellman_error d_error.clamp_(-1, 1) # Optimize the model optimizer.zero_grad() state_action_values.backward(d_error) optimizer.step() num_param_updates += 1 # Periodically update target network: if num_param_updates % target_update_freq == 0: target_net.load_state_dict(policy_net.state_dict()) ##### ### 4. Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t >= learning_starts: print("Timestep %d" % (t, )) print(" mean reward (100 episodes) %f" % mean_episode_reward) print(" best mean reward %f" % best_mean_episode_reward) print(" episodes %d" % len(episode_rewards)) print(" exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open(statistics_file_name, 'wb') as f: pickle.dump(Statistic, f)
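The Statistic dict dumped above keeps the keys "mean_episode_rewards" and "best_mean_episode_rewards"; a small sketch (matplotlib assumed) for loading the pickle back and plotting the learning curve:

import pickle
import matplotlib.pyplot as plt

with open("statistics.pkl", "rb") as f:  # or the statistics_file_name passed in
    stats = pickle.load(f)

plt.plot(stats["mean_episode_rewards"], label="mean reward (last 100 episodes)")
plt.plot(stats["best_mean_episode_rewards"], label="best mean reward")
plt.xlabel("logging step")
plt.ylabel("episode reward")
plt.legend()
plt.savefig("dqn_learning_curve.png")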
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete if not os.path.isdir("./models"): os.mkdir("./models") if len(env.observation_space.shape) == 1: input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 with torch.no_grad(): ret = model(obs).data.max(1)[1].cpu() return ret else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function Q = q_func(input_arg, num_actions).type(dtype) target_Q = q_func(input_arg, num_actions).type(dtype) # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') save_best_mean_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 20000 SAVE_EVERY_N_STEPS = 2000000 AL_ALPHA = 0.7 for t in count(): if stopping_criterion is not None and stopping_criterion(env): break ### Step the env and store the transition last_idx = replay_buffer.store_frame(last_obs) recent_observations = replay_buffer.encode_recent_observation() # Choose random action if not yet start learning if t > learning_starts: action = select_epilson_greedy_action(Q, recent_observations, t)[0] else: action = random.randrange(num_actions) obs, reward, done, _ = env.step(action) reward = max(-1.0, min(reward, 1.0)) replay_buffer.store_effect(last_idx, action, reward, done) if done: obs = env.reset() last_obs = obs ### Perform experience replay and train the network. 
if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) obs_batch = Variable( torch.from_numpy(obs_batch).type(dtype) / 255.0) act_batch = Variable(torch.from_numpy(act_batch).long()) rew_batch = Variable(torch.from_numpy(rew_batch)) next_obs_batch = Variable( torch.from_numpy(next_obs_batch).type(dtype) / 255.0) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: act_batch = act_batch.cuda() rew_batch = rew_batch.cuda() cur_all_Q_values = Q(obs_batch) action_gap = cur_all_Q_values.max( dim=1)[0] * cur_all_Q_values.size(1) - cur_all_Q_values.sum( dim=1) Statistic["mean_action_gap"].append(action_gap.mean().item()) current_Q_values = cur_all_Q_values.gather( 1, act_batch.unsqueeze(1)).squeeze() next_target_Q_values = target_Q(next_obs_batch).detach() next_max_q = next_target_Q_values.max(1)[0] next_Q_values = not_done_mask * next_max_q target_Q_values = rew_batch + (gamma * next_Q_values) bellman_error = target_Q_values - current_Q_values cur_target_Q_values = target_Q(obs_batch).detach() cur_advantage = cur_target_Q_values.max( dim=1)[0] - cur_target_Q_values.gather( 1, act_batch.unsqueeze(1)).squeeze() next_advantage = next_target_Q_values.max( dim=1)[0] - next_target_Q_values.gather( 1, act_batch.unsqueeze(1)).squeeze() # Set up the error according to the operator you want al_error = bellman_error - AL_ALPHA * cur_advantage persistent_error = bellman_error - AL_ALPHA * next_advantage pal_error = torch.max(al_error, persistent_error) error = pal_error # use whichever you want clipped_bellman_error = error.clamp(-1, 1) d_error = clipped_bellman_error * -1.0 optimizer.zero_grad() current_Q_values.backward(d_error.data) optimizer.step() num_param_updates += 1 if num_param_updates % target_update_freq == 0: target_Q.load_state_dict(Q.state_dict()) ## Log Progress episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open('statistics.pkl', 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % './models/statistics.pkl') if save_best_mean_reward < best_mean_episode_reward: save_best_mean_reward = best_mean_episode_reward torch.save(Q.state_dict(), './models/best_model.pth') if t % SAVE_EVERY_N_STEPS == 0: torch.save(Q.state_dict(), './models/n_steps_%d.pth' % t)
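The three error terms formed above correspond to the Bellman, advantage-learning (AL), and persistent advantage-learning (PAL) operators. Written out as the code computes them, with \alpha = AL_ALPHA and \hat{Q} the target network:

\delta = r + \gamma\,(1-\text{done})\,\max_{a'} \hat{Q}(s',a') - Q(s,a)
\delta_{\mathrm{AL}} = \delta - \alpha\,[\max_{b} \hat{Q}(s,b) - \hat{Q}(s,a)]
\delta_{\mathrm{PAL}} = \max\bigl(\delta_{\mathrm{AL}},\; \delta - \alpha\,[\max_{b} \hat{Q}(s',b) - \hat{Q}(s',a)]\bigr)

Whichever of these is assigned to error is then clipped to [-1, 1] and backpropagated exactly like the plain Bellman error.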
t = 0 while t < 100000: t = t + 1 print(t) ### Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break ### Step the env and store the transition # Store lastest observation in replay memory and last_idx can be used to store action, reward, done last_idx = replay_buffer.store_frame(last_obs) # encode_recent_observation will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. recent_observations = replay_buffer.encode_recent_observation() # Choose random action if not yet start learning if t > learning_starts: action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0] else: action = random.randrange(num_actions) # Advance one step obs, reward, done, _ = env.step(action) # clip rewards between -1 and 1 reward = max(-1.0, min(reward, 1.0)) # Store other info in replay memory replay_buffer.store_effect(last_idx, action, reward, done) # Resets the environment when reaching an episode boundary. if done: obs = env.reset()
def dqn_learing( #env, q_func, optimizer_spec, exploration, #stopping_criterion=None, replay_buffer_size=1000, batch_size=32, gamma=0.99, learning_starts=1, learning_freq=4, frame_history_len=1, target_update_freq=10000): #our own code read_image() rgb_data = depth_data.reshape(640, 480, 1) input_arg = rgb_data #input for the algorithm num_actions = 5 last_obs = rgb_data # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 return model(Variable(obs, volatile=True)).data.max(1)[1].cpu() else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function Q = q_func(1, num_actions).type(dtype) target_Q = q_func(1, num_actions).type(dtype) # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(1000, 1) ############### # RUN ENV # ############### num_param_updates = 0 for t in count(): last_idx = replay_buffer.store_frame(last_obs) recent_observations = replay_buffer.encode_recent_observation() # Choose random action if not yet start learning if t > learning_starts: action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0] else: action = random.randrange(num_actions) # Advance one step control_robot(action + 1) rgb_data = depth_data.reshape(640, 480, 1) obs = rgb_data ##evaluate the action dis_data = np.array(depth_data) dis_data[np.isnan(dis_data)] = 999999999999 dis_data[dis_data == 0] = 999999999999 dis = np.min(dis_data) print("MIN DISTANCE:" + str(dis) + "-------------") reward = 0 if dis < 500: reward = 1 else: reward = -1 print("REWARD:" + str(reward) + "--------------") # clip rewards between -1 and 1 reward = max(-1.0, min(reward, 1.0)) # Store other info in replay memory replay_buffer.store_effect(last_idx, action, reward, False) # Resets the environment when reaching an episode boundary. #if done: #obs = env.reset() last_obs = obs if (t > 1 and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): print("Training") obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) obs_batch = Variable( torch.from_numpy(obs_batch).type(dtype) / 255.0) act_batch = Variable(torch.from_numpy(act_batch).long()) rew_batch = Variable(torch.from_numpy(rew_batch)) next_obs_batch = Variable( torch.from_numpy(next_obs_batch).type(dtype) / 255.0) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: act_batch = act_batch.cuda() rew_batch = rew_batch.cuda() # Compute current Q value, q_func takes only state and output value for every state-action pair # We choose Q based on action taken. 
current_Q_values = Q(obs_batch).gather( 1, act_batch.unsqueeze(1)).squeeze() # Compute next Q value based on which action gives max Q values # Detach variable from the current graph since we don't want gradients for the next Q to be propagated next_max_q = target_Q(next_obs_batch).detach().max(1)[0] next_Q_values = not_done_mask * next_max_q # Compute the target of the current Q values target_Q_values = rew_batch + (gamma * next_Q_values) print("next:", next_Q_values.shape) print("current:", current_Q_values.squeeze().shape) # Compute Bellman error bellman_error = target_Q_values - current_Q_values # Clipping of the Bellman error to [-1, 1] is disabled here clipped_bellman_error = bellman_error #.clamp(-1, 1) #print(clipped_bellman_error) # Note: clipped_bellman_error * -1 is the correct gradient d_error = clipped_bellman_error * -1.0 # Clear previous gradients before backward pass #print(d_error.data) optimizer.zero_grad() # run backward pass current_Q_values.backward(d_error.data) # Perform the update optimizer.step() num_param_updates += 1 # Periodically copy the Q network weights into the target Q network if num_param_updates % target_update_freq == 0: target_Q.load_state_dict(Q.state_dict())
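# A minimal sketch of the depth-based reward used in the robot variant above: invalid
# depth readings (NaN or 0) are replaced with a large sentinel so they never win the
# minimum, and the reward is +1 when the closest obstacle is nearer than the 500-unit
# threshold, -1 otherwise. The threshold and units simply mirror the code above.
import numpy as np


def depth_reward(depth_data, threshold=500.0, sentinel=1e12):
    """Return +1 if the minimum valid depth is below `threshold`, else -1."""
    dis = np.asarray(depth_data, dtype=np.float64).copy()
    dis[np.isnan(dis)] = sentinel   # mask sensor dropouts
    dis[dis == 0] = sentinel        # mask zero (invalid) readings
    return 1.0 if dis.min() < threshold else -1.0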
def dqn_learning( env, method, game, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000, double=False, dueling=False, logdir=None, svrl=False, me_type=None, maskp=None, maskstep=None, maskscheduler=True ): assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n def select_epsilon_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 with torch.no_grad(): return model(Variable(obs)).data.max(1)[1].view(1, 1) else: return torch.IntTensor([[random.randrange(num_actions)]]) Q = q_func(input_arg, num_actions).type(dtype) target_Q = q_func(input_arg, num_actions).type(dtype) optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 SAVE_MODEL_EVERY_N_STEPS = 1000000 mask_scheduler_step = (1 - maskp) / maskstep for t in count(): if stopping_criterion is not None and stopping_criterion(env): break ################ # STEP THE ENV # ################ last_idx = replay_buffer.store_frame(last_obs) recent_observations = replay_buffer.encode_recent_observation() if t > learning_starts: action = select_epsilon_greedy_action(Q, recent_observations, t)[0][0] else: action = random.randrange(num_actions) obs, reward, done, _ = env.step(action) reward = max(-1.0, min(reward, 1.0)) replay_buffer.store_effect(last_idx, action, reward, done) if done: obs = env.reset() last_obs = obs ################ # TRAINING # ################ if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # mask scheduler if maskscheduler: maskp = min(maskp + mask_scheduler_step, 1) obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size) obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0) act_batch = Variable(torch.from_numpy(act_batch).long()) rew_batch = Variable(torch.from_numpy(rew_batch)) next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: act_batch = act_batch.cuda() rew_batch = rew_batch.cuda() current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze() target_q_mat = target_Q(next_obs_batch).detach() # SV-RL scheme if svrl: target_q_mat = globals()[me_type](target_q_mat, target_q_mat.size(0), target_q_mat.size(1), maskp) if not double: next_max_q = target_q_mat.max(1)[0] else: q_temp = Q(next_obs_batch).detach() act_temp = np.argmax(q_temp.cpu(), axis=1) next_max_q = torch.sum(torch.from_numpy(np.eye(num_actions)[act_temp]).type(dtype) * target_q_mat.type(dtype), dim=1) next_Q_values = not_done_mask * next_max_q.type(dtype) target_Q_values = rew_batch + (gamma * next_Q_values) loss = F.smooth_l1_loss(current_Q_values, target_Q_values) optimizer.zero_grad() 
loss.backward() for params in Q.parameters(): params.grad.data.clamp_(-1, 1) optimizer.step() num_param_updates += 1 if num_param_updates % target_update_freq == 0: target_Q.load_state_dict(Q.state_dict()) ################ # LOG PROGRESS # ################ # save model if t % SAVE_MODEL_EVERY_N_STEPS == 0: if not os.path.exists("models"): os.makedirs("models") add_str = 'single' if double: add_str = 'double' if dueling: add_str = 'dueling' model_save_path = 'models/%s_%s_%s.ckpt' % (str(game[:-14]), add_str, method) torch.save(Q.state_dict(), model_save_path) # log process episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: logz.log_tabular('Timestep', t) logz.log_tabular('MeanReward100Episodes', mean_episode_reward) logz.log_tabular('BestMeanReward', best_mean_episode_reward) logz.log_tabular('Episodes', len(episode_rewards)) logz.log_tabular('Exploration', exploration.value(t)) logz.dump_tabular() sys.stdout.flush()
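# A compact sketch of the target-computation pattern in the function above: with
# double=True the online network picks the argmax action and the target network
# evaluates it, otherwise the target network's own maximum is used. The Huber
# (smooth L1) loss and per-parameter gradient clamping mirror the training step above.
# Function and argument names here are placeholders; all arguments are plain tensors.
import torch
import torch.nn.functional as F


def dqn_update(Q, target_Q, optimizer, obs, act, rew, next_obs, not_done,
               gamma=0.99, double=False):
    current_q = Q(obs).gather(1, act.long().unsqueeze(1)).squeeze(1)

    with torch.no_grad():
        next_q_all = target_Q(next_obs)
        if double:
            # Double DQN: choose actions with the online net, value them with the target net.
            next_actions = Q(next_obs).argmax(dim=1, keepdim=True)
            next_max_q = next_q_all.gather(1, next_actions).squeeze(1)
        else:
            next_max_q = next_q_all.max(dim=1)[0]

    target = rew + gamma * not_done * next_max_q

    loss = F.smooth_l1_loss(current_q, target)
    optimizer.zero_grad()
    loss.backward()
    for p in Q.parameters():          # clamp gradients to [-1, 1], as above
        p.grad.data.clamp_(-1, 1)
    optimizer.step()
    return loss.item()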
def train(self, exp_schedule, lr_schedule): """ Performs training of Q Args: exp_schedule: Exploration instance s.t. exp_schedule.get_action(best_action) returns an action lr_schedule: Schedule for learning rate """ # initialize replay buffer and variables if self.config.use_memory: replay_buffer = ReplayBuffer( self.config.buffer_size, self.config.state_history, memory_size=self.config.memory_unit_size) else: replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history) rewards = deque(maxlen=self.config.num_episodes_test) max_q_values = deque(maxlen=1000) q_values = deque(maxlen=1000) self.init_averages() t = last_eval = last_record = 0 # time control of nb of steps scores_eval = [] # list of scores computed at iteration time scores_eval += [self.evaluate()[0]] prog = Progbar(target=self.config.nsteps_train) evaluation_result_list = [] oos_evalution_result_list = [] # interact with environment prev_time = time.time() while t < self.config.nsteps_train: total_reward = 0 state = self.env.reset() while True: t += 1 last_eval += 1 last_record += 1 if self.config.render_train: self.env.render() # replay memory stuff idx = replay_buffer.store_frame(state) q_input = replay_buffer.encode_recent_observation() if self.config.use_memory: prev_memory = replay_buffer.encode_recent_memory() best_action, q_values, _, next_memory = self.get_best_action_with_memory( q_input, prev_memory) next_memory = np.squeeze(next_memory) else: best_action, q_values = self.get_best_action(q_input) # chose action according to current Q and exploration action = exp_schedule.get_action(best_action) # store q values max_q_values.append(max(q_values)) q_values += list(q_values) # perform action in env new_state, reward, done, info = self.env.step(action) # store the transition replay_buffer.store_effect(idx, action, reward, done) if self.config.use_memory: replay_buffer.store_memory(idx, next_memory) state = new_state # perform a training step loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon) # logging stuff time_log_freq = 1000 if t % time_log_freq == 0: with open(self.config.output_path + 'time_log.txt', 'a') as of: of.write('{}\n'.format(time.time() - prev_time)) of.write('\n') prev_time = time.time() if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and (t % self.config.learning_freq == 0)): self.update_averages(rewards, max_q_values, q_values, scores_eval) exp_schedule.update(t) lr_schedule.update(t) if len(rewards) > 0: prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg_R", self.avg_reward), ("Max_R", np.max(rewards)), ("eps", exp_schedule.epsilon), ("Grads", grad_eval), ("Max_Q", self.max_q), ("lr", lr_schedule.epsilon)]) elif (t < self.config.learning_start) and ( t % self.config.log_freq == 0): sys.stdout.write("\rPopulating the memory {}/{}...".format( t, self.config.learning_start)) sys.stdout.flush() # count reward total_reward += reward if done or t >= self.config.nsteps_train: break # updates to perform at the end of an episode rewards.append(total_reward) if (t > self.config.learning_start) and (last_eval > self.config.eval_freq): # evaluate our policy last_eval = 0 print("") score, complete, length = self.evaluate() if complete > 0: evaluation_result_list += [(score, complete, length)] if score > self.config.extended_eval_threshold: self.logger.info('Extended in-sample evaluation...') self.evaluate(num_episodes=1000) for _ in range(10): self.logger.info( 'Extended out-of-sample evaluation...') oos_result = self.evaluate( 
EnvMaze(n=self.config.maze_size), num_episodes=100) oos_evalution_result_list += [oos_result] scores_eval += [score] if (t > self.config.learning_start) and self.config.record and ( last_record > self.config.record_freq): self.logger.info("Recording...") last_record = 0 self.record() # last words self.logger.info("- Training done.") self.save() scores_eval += [self.evaluate()[0]] export_plot(scores_eval, "Scores", self.config.plot_output) return evaluation_result_list, oos_evalution_result_list
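# The training loop above additionally stores a per-transition memory vector via
# store_memory and reads the previous one back with encode_recent_memory. The real
# ReplayBuffer implementation is not shown in this file, so the class below is only a
# hypothetical sketch of that interface: a ring buffer of fixed-size memory vectors
# indexed by the same idx returned from store_frame.
import numpy as np


class MemoryStore(object):
    """Minimal ring buffer for per-transition memory vectors (illustration only)."""

    def __init__(self, size, memory_size):
        self.size = size
        self.memories = np.zeros((size, memory_size), dtype=np.float32)
        self.next_idx = 0

    def store_memory(self, idx, memory):
        self.memories[idx % self.size] = memory
        self.next_idx = (idx + 1) % self.size

    def encode_recent_memory(self):
        # Memory written for the most recently stored transition (zeros initially).
        last = (self.next_idx - 1) % self.size
        return self.memories[last][None, :]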
def dqn_learing( env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000, num_actions1=31, num_actions2=27 ): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ ############### # BUILD MODEL # ############### img_h, img_w, img_c = 32, 120, 1 input_arg = frame_history_len * img_c # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) # Use volatile = True if variable is only used in inference mode, i.e. don't save the history out1, out2 = model(Variable(obs)) out1 = out1.max(1)[1].data.cpu().numpy()[0] out2 = out2.max(1)[1].data.cpu().numpy()[0] return out1, out2 else: return random.randrange(num_actions1), random.randrange(num_actions2) # Initialize target q function and q function Q = q_func(num_actions1, num_actions2).cuda(0).type(dtype) target_Q = q_func(num_actions1, num_actions2).cuda(0).type(dtype) # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 epoch_reward = [] for t in count(): ### Step the env and store the transition # Store lastest observation in replay memory and last_idx can be used to store action, reward, done last_idx = replay_buffer.store_frame(last_obs) # encode_recent_observation will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. 
recent_observations = replay_buffer.encode_recent_observation() # Choose a random action until learning starts if t > learning_starts: action1, action2 = select_epilson_greedy_action(Q, recent_observations, t) else: action1, action2 = random.randrange(num_actions1), random.randrange(num_actions2) # Advance one step obs, reward, done = env.step(action1, action2) epoch_reward.append(reward) if done: env.render() # clip rewards between -1 and 1 # reward = max(-1.0, min(reward, 1.0)) # Store other info in replay memory replay_buffer.store_effect(last_idx, action1, action2, reward, done) # Reset the environment when reaching an episode boundary. if done: obs = env.reset() print(np.mean(epoch_reward)) epoch_reward = [] torch.save(Q,'../../weights/Q' + str(num_actions1) + '.pt') torch.save(target_Q,'../../weights/target_Q' + str(num_actions1) + '.pt') last_obs = obs ### Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Use the replay buffer to sample a batch of transitions # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode, # in which case there is no Q-value at the next state; at the end of an # episode, only the current state reward contributes to the target obs_batch, act1_batch, act2_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size) # Convert numpy nd_array to torch variables for calculation obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype)) act1_batch = Variable(torch.from_numpy(act1_batch).long()) act2_batch = Variable(torch.from_numpy(act2_batch).long()) rew_batch = Variable(torch.from_numpy(rew_batch)) next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype)) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: act1_batch = act1_batch.cuda() act2_batch = act2_batch.cuda() rew_batch = rew_batch.cuda() # Compute current Q values; q_func takes only the state and outputs a value for every state-action pair # We choose Q based on the action taken.
q1, q2 = Q(obs_batch) current_Q1_values = q1.gather(1, act1_batch.unsqueeze(1)) current_Q2_values = q2.gather(1, act2_batch.unsqueeze(1)) # Compute next Q value based on which action gives max Q values # Detach variable from the current graph since we don't want gradients for next Q to propagated tq1, tq2 = target_Q(next_obs_batch) next_max_q1 = tq1.detach().max(1)[0] next_max_q2 = tq2.detach().max(1)[0] next_Q1_values = not_done_mask * next_max_q1 next_Q2_values = not_done_mask * next_max_q2 # Compute the target of the current Q values target_Q1_values = rew_batch + (gamma * next_Q1_values) target_Q2_values = rew_batch + (gamma * next_Q2_values) # Compute Bellman error bellman_error1 = target_Q1_values.unsqueeze(1) - current_Q1_values bellman_error2 = target_Q2_values.unsqueeze(1) - current_Q2_values bellman_error = bellman_error1 + bellman_error2 # clip the bellman error between [-1 , 1] clipped_bellman_error = bellman_error.clamp(-1, 1) # Note: clipped_bellman_delta * -1 will be right gradient d_error = clipped_bellman_error * -1.0 # Clear previous gradients before backward pass optimizer.zero_grad() # run backward pass current_Q_values = current_Q1_values + current_Q2_values current_Q_values.backward(d_error.data) # Perfom the update optimizer.step() num_param_updates += 1 # Periodically update the target network by Q network to target Q network if num_param_updates % target_update_freq == 0: target_Q.load_state_dict(Q.state_dict())
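# The variant above factorises the action into two discrete components (num_actions1,
# num_actions2) and trains one Q-head per component, summing the two Bellman errors.
# Below is a hypothetical sketch of such a two-head network: a shared convolutional
# trunk feeding two independent linear heads. The layer sizes are illustrative only and
# assume 32x120 single-channel inputs as in the function above.
import torch
import torch.nn as nn


class TwoHeadDQN(nn.Module):
    def __init__(self, num_actions1=31, num_actions2=27, in_channels=1):
        super(TwoHeadDQN, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
        )
        feat = 64 * 2 * 13  # feature size for 1x32x120 inputs after the two conv layers
        self.fc = nn.Sequential(nn.Linear(feat, 512), nn.ReLU())
        self.head1 = nn.Linear(512, num_actions1)
        self.head2 = nn.Linear(512, num_actions2)

    def forward(self, x):
        h = self.conv(x)
        h = self.fc(h.view(h.size(0), -1))
        return self.head1(h), self.head2(h)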
def evaluate(self, env=None, num_episodes=None): """ Evaluation with same procedure as the training """ # log our activity only if default call save_paths = False if num_episodes is None: self.logger.info("Evaluating...") else: save_paths = True # arguments defaults if num_episodes is None: num_episodes = self.config.num_episodes_test if env is None: env = self.env bfs_len = self.bfs_len else: bfs_len = env.get_bfs_length() # replay memory to play if self.config.use_memory: replay_buffer = ReplayBuffer( self.config.buffer_size, self.config.state_history, memory_size=self.config.memory_unit_size) else: replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history) rewards = [] steps = [] for i in range(num_episodes): total_reward = 0 state = env.reset() count = 0 while True: if self.config.render_test: env.render() # store last state in buffer idx = replay_buffer.store_frame(state) q_input = replay_buffer.encode_recent_observation() if self.config.use_memory: prev_memory = replay_buffer.encode_recent_memory() action, bottom_q, top_q, next_memory = self.get_action_with_memory( q_input, prev_memory) next_memory = np.squeeze(next_memory) else: action = self.get_action(q_input) if i == 0 and self.config.use_memory: with open(self.config.output_path + 'eval_example_log.txt', 'a') as of: of.write('State = {}\n'.format(env.cur_state)) of.write('Taking action = {}\n'.format(action)) of.write('prev_memory = {}\n'.format( prev_memory[0, :6])) of.write('next_memory = {}\n'.format(next_memory[:6])) of.write('bottom_q_values = {}\n'.format(bottom_q)) of.write('top_q_values = {}\n'.format(top_q)) of.write('\n') if save_paths: with open(self.config.output_path + 'path_log.txt', 'a') as of: of.write("(s, a) = ({}, {})\n".format( env.cur_state, action)) of.write('\n') # perform action in env new_state, reward, done, info = env.step(action) # store in replay memory replay_buffer.store_effect(idx, action, reward, done) if self.config.use_memory: replay_buffer.store_memory(idx, next_memory) state = new_state count += 1 # count reward total_reward += reward if done: if save_paths: with open(self.config.output_path + 'path_log.txt', 'a') as of: of.write('\n') break # updates to perform at the end of an episode rewards.append(total_reward) if total_reward <= 0: steps.append(np.nan) else: steps.append(count) steps = np.array(steps) - bfs_len # adjust for shortest possible path avg_reward = np.mean(rewards) avg_length = np.nanmean(steps) sigma_length = np.sqrt(np.nanvar(steps) / len(steps)) percent_completed = np.count_nonzero(~np.isnan(steps)) / float( len(steps)) sigma_reward = np.sqrt(np.var(rewards) / len(rewards)) if num_episodes > 1: msg = "Average reward: {:04.2f} +/- {:04.2f}, Percent completed: {:04.2f}, Average length: {:04.2f} +/- {:04.2f}, n = {}".format( avg_reward, sigma_reward, percent_completed, avg_length, sigma_length, len(rewards)) self.logger.info(msg) return avg_reward, percent_completed, avg_length
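# A small sketch of the evaluation statistics computed above: the reported uncertainty
# is the standard error of the mean, failed episodes contribute NaN to the length
# statistics, and lengths are reported relative to the shortest possible (BFS) path.
import numpy as np


def eval_stats(rewards, steps, bfs_len=0):
    rewards = np.asarray(rewards, dtype=np.float64)
    steps = np.asarray(steps, dtype=np.float64) - bfs_len   # adjust for shortest path

    avg_reward = rewards.mean()
    sigma_reward = np.sqrt(rewards.var() / len(rewards))    # standard error of the mean
    percent_completed = np.count_nonzero(~np.isnan(steps)) / float(len(steps))
    avg_length = np.nanmean(steps)                          # ignores failed episodes
    sigma_length = np.sqrt(np.nanvar(steps) / len(steps))
    return avg_reward, sigma_reward, percent_completed, avg_length, sigma_length


# Example: three episodes, one of which never reached the goal.
print(eval_stats([1.0, -0.2, 1.0], [12, np.nan, 9], bfs_len=8))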
def train(self, exp_schedule, lr_schedule): """ Performs training of Q Args: exp_schedule: Exploration instance s.t. exp_schedule.get_action(best_action) returns an action lr_schedule: Schedule for learning rate """ # initialize replay buffer and variables replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history) rewards = deque(maxlen=self.config.num_episodes_test) last_frames = deque(maxlen=4) max_q_values = deque(maxlen=1000) q_values = deque(maxlen=1000) self.init_averages() t = last_eval = last_record = 0 # time control of nb of steps scores_eval = [] # list of scores computed at iteration time scores_eval += [] extractor = PongExtractor() prog = Progbar(target=self.config.nsteps_train) # interact with environment while t < self.config.nsteps_train: total_reward = 0 state = self.env.reset() last_frame = state last_frames.append(state) while True: t += 1 last_eval += 1 last_record += 1 if self.config.render_train: self.env.render() feats = extractor.extract(np.squeeze(state)) # replay memory stuff idx = replay_buffer.store_frame(state) q_input = replay_buffer.encode_recent_observation() # chose action according to current Q and exploration best_action, q_values = self.get_best_action(q_input) embedding = self.sess.run(self.hidden, feed_dict={self.s: [q_input]})[0] action = exp_schedule.get_action(best_action) # store q values max_q_values.append(max(q_values)) q_values += list(q_values) if t % 100 == 0: # print state.shape # frame = np.zeros(np.squeeze(state).shape) # for f in last_frames: # frame = frame + np.squeeze(f) # frame = frame / len(last_frames) frame = np.squeeze(state) last_frame = np.squeeze(last_frame) pickle.dump( last_frames, open('frames/embedding/atari{}.p'.format(t), 'w')) for i in range(4): f = np.squeeze(last_frames[i]) scipy.misc.imsave( 'frames/embedding/atari{}.png'.format(t - 3 + i), f) # scipy.misc.imsave('frames/atari{}.png'.format(t-1),last_frame) # posfile = open('frames/atari{}.txt'.format(t),'w') # posfile.write('Opp Paddle:\t{}\n'.format(oppY)) # posfile.write('Player Paddle:\t{}\n'.format(playerY)) # posfile.write('ball x:\t{}\n'.format(ballX)) # posfile.write('ball y:\t{}\n'.format(ballY)) # posfile.close() np.savetxt('frames/embedding/pong{}.txt'.format(t), feats, fmt='%.2f') # perform action in env new_state, reward, done, info = self.env.step(action) # print "state shape:",state.shape() # store the transition replay_buffer.store_effect(idx, action, reward, done) last_frame = state state = new_state last_frames.append(state) # perform a training step loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon) # logging stuff if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and (t % self.config.learning_freq == 0)): self.update_averages(rewards, max_q_values, q_values, scores_eval) exp_schedule.update(t) lr_schedule.update(t) if len(rewards) > 0: prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg R", self.avg_reward), ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon), ("Grads", grad_eval), ("Max Q", self.max_q), ("lr", lr_schedule.epsilon)]) elif (t < self.config.learning_start) and ( t % self.config.log_freq == 0): sys.stdout.write("\rPopulating the memory {}/{}...".format( t, self.config.learning_start)) sys.stdout.flush() # count reward total_reward += reward if done or t >= self.config.nsteps_train: break # updates to perform at the end of an episode rewards.append(total_reward) if (t > self.config.learning_start) and (last_eval > self.config.eval_freq): # evaluate our 
policy last_eval = 0 print("") scores_eval += [self.evaluate()] if (t > self.config.learning_start) and self.config.record and ( last_record > self.config.record_freq): self.logger.info("Recording...") last_record = 0 self.record() # last words self.logger.info("- Training done.") self.save() scores_eval += [self.evaluate()] export_plot(scores_eval, "Scores", self.config.plot_output)
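# A minimal sketch of the periodic debugging dump in the loop above: every `every`
# steps the rolling window of recent frames is pickled and the hand-extracted Pong
# feature vector is written as text, so embeddings can be inspected offline. The
# directory layout mirrors the frames/embedding/ paths used above; the helper name is
# a placeholder.
import os
import pickle

import numpy as np


def dump_debug_snapshot(t, last_frames, feats, out_dir="frames/embedding", every=100):
    if t % every != 0:
        return
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, "atari{}.p".format(t)), "wb") as f:
        pickle.dump(list(last_frames), f)
    np.savetxt(os.path.join(out_dir, "pong{}.txt".format(t)), np.asarray(feats), fmt="%.2f")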
class QN(object): """ Abstract Class for implementing a Q Network """ def __init__(self, env, config, logger=None, name=None): """ Initialize Q Network and env Args: config: class with hyperparameters logger: logger instance from logging module """ # directory for training outputs self.name = name self.action_space = 3 if name == None: raise Exception("Must supply network name") name = time.strftime("_%m%d_%H%M") + "/" + name config.output_path = config.output_path.format(name) config.model_output = config.model_output.format(name) config.log_path = config.log_path.format(name) config.plot_output = config.plot_output.format(name) config.record_path = config.record_path.format(name) if not os.path.exists(config.output_path): os.makedirs(config.output_path) # store hyper params # Customise the config self.config = config self.logger = logger if logger is None: self.logger = get_logger(config.log_path) self.env = env # build model self.build() def build(self): """ Build model """ pass @property def policy(self): """ model.policy(state) = action """ return lambda state: self.get_action(state) def save(self): """ Save model parameters Args: model_path: (string) directory """ pass def initialize(self): """ Initialize variables if necessary """ pass def get_best_action(self, state): """ Returns best action according to the network Args: state: observation from gym Returns: tuple: action, q values """ raise NotImplementedError def get_action(self, state): """ Returns action with some epsilon strategy Args: state: observation from gym """ if np.random.random() < self.config.soft_epsilon: return random else: return self.get_best_action(state)[0] def update_target_params(self): """ Update params of Q' with params of Q """ raise NotImplementedError def init_averages(self): """ Defines extra attributes for tensorboard """ self.avg_reward = -21. self.max_reward = -21. self.std_reward = 0 self.avg_q = 0 self.max_q = 0 self.std_q = 0 self.eval_reward = -21. def update_averages(self, rewards, max_q_values, q_values, scores_eval): """ Update the averages Args: rewards: deque max_q_values: deque q_values: deque scores_eval: list """ self.avg_reward = np.mean(rewards) self.max_reward = np.max(rewards) self.std_reward = np.sqrt(np.var(rewards) / len(rewards)) self.max_q = np.mean(max_q_values) self.avg_q = np.mean(q_values) self.std_q = np.sqrt(np.var(q_values) / len(q_values)) if len(scores_eval) > 0: self.eval_reward = scores_eval[-1] def train(self, exp_schedule, lr_schedule, env=None): """ Performs training of Q Args: exp_schedule: Exploration instance s.t. 
exp_schedule.get_action(best_action) returns an action lr_schedule: Schedule for learning rate """ if env is None: env = self.env # initialize replay buffer and variables rewards = deque(maxlen=self.config.num_episodes_test) self.init_averages() self.train_init() t = last_eval = last_record = 0 # time control of nb of steps scores_eval = [] # list of scores computed at iteration time scores_eval += [self.evaluate()] prog = Progbar(target=self.config.nsteps_train) # interact with environment while t < self.config.nsteps_train: total_reward = 0 state = self.env.reset() while True: t += 1 last_eval += 1 last_record += 1 if self.config.render_train: env.render() action = self.train_step_pre(state, exp_schedule) cur_action = actions.trans_single(action) # perform action in env new_state, reward, done, info = env.step(cur_action) self.rewards = reward self.replay_buffer.store_effect(self.idx, self.action, reward, done) loss_eval, grad_eval = self.train_step(t, self.replay_buffer, lr_schedule.epsilon) state = new_state # logging stuff if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and (t % self.config.learning_freq == 0)): self.update_averages(rewards, self.max_q_values, self.q_values, scores_eval) exp_schedule.update(t) lr_schedule.update(t) if len(rewards) > 0: prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg R", np.mean(rewards)), ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon), ("Grads", grad_eval), ("Max Q", np.max(self.max_q_values)), ("lr", lr_schedule.epsilon)]) elif (t < self.config.learning_start) and ( t % self.config.log_freq == 0): sys.stdout.write("\rPopulating the memory {}/{}...".format( t, self.config.learning_start)) sys.stdout.flush() # count reward total_reward += reward if done or t >= self.config.nsteps_train: break # updates to perform at the end of an episode rewards.append(total_reward) if (t > self.config.learning_start) and (last_eval > self.config.eval_freq): # evaluate our policy last_eval = 0 print("") scores_eval += [self.evaluate()] if (t > self.config.learning_start) and self.config.record and ( last_record > self.config.record_freq): self.logger.info("Recording...") last_record = 0 self.record() self.save(t) # last words self.logger.info("- Training done.") self.save() scores_eval += [self.evaluate()] export_plot(scores_eval, "Scores", self.config.plot_output) def train_init(self): """ Performs training of Q Args: exp_schedule: Exploration instance s.t. 
exp_schedule.get_action(best_action) returns an action lr_schedule: Schedule for learning rate """ # initialize replay buffer and variables self.replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history) self.max_q_values = deque(maxlen=1000) def train_step_pre(self, state, exp_schedule=None): self.idx = self.replay_buffer.store_frame(state) q_input = self.replay_buffer.encode_recent_observation() # chose action according to current Q and exploration best_action, q_values = self.get_best_action(q_input) if exp_schedule is None: self.action = best_action else: self.action = exp_schedule.get_action(best_action, self.action_space) # store q values self.max_q_values.append(max(q_values)) self.q_values = list(q_values) return self.action def train_step_post(self, reward, done, t, lr_schedule, train_model): self.replay_buffer.store_effect(self.idx, self.action, reward, done) if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and (t % self.config.learning_freq == 0)): self.update_averages(self.rewards, self.max_q_values, self.q_values, [0]) # perform a training step if not train_model: return 0, 0 return self.train_step(t, self.replay_buffer, lr_schedule.epsilon) def train_step(self, t, replay_buffer, lr): """ Perform training step Args: t: (int) nths step replay_buffer: buffer for sampling lr: (float) learning rate """ loss_eval, grad_eval = 0, 0 # perform training step if (t > self.config.learning_start and t % self.config.learning_freq == 0): loss_eval, grad_eval = self.update_step(t, replay_buffer, lr) # occasionaly update target network with q network if t % self.config.target_update_freq == 0: self.update_target_params() # occasionaly save the weights if (t % self.config.saving_freq == 0): self.save() return loss_eval, grad_eval def evaluate(self, env=None, num_episodes=None): """ Evaluation with same procedure as the training """ # log our activity only if default call if num_episodes is None: self.logger.info("Evaluating...") # arguments defaults if num_episodes is None: num_episodes = self.config.num_episodes_test if env is None: env = self.env # keep the replay buffer alive try: r0 = self.replay_buffer has_replay = True except Exception: has_replay = False # replay memory to play rewards = [] self.train_init() for i in range(num_episodes): total_reward = 0 state = env.reset() while True: if self.config.render_test: env.render() action = self.train_step_pre(state) cur_action = actions.trans_single(action) # perform action in env new_state, reward, done, info = env.step(cur_action) self.train_step_post(reward, done, 0, None, False) # count reward total_reward += reward if done: break state = new_state # updates to perform at the end of an episode rewards.append(total_reward) avg_reward = np.mean(rewards) sigma_reward = np.sqrt(np.var(rewards) / len(rewards)) if num_episodes > 1: msg = "Average reward: {:04.2f} +/- {:04.2f}".format( avg_reward, sigma_reward) self.logger.info(msg) if has_replay: self.replay_buffer = r0 return avg_reward def record(self): """ Re create an env and record a video for one episode """ env = gym.make(self.config.env_name) env = gym.wrappers.Monitor(env, self.config.record_path, video_callable=lambda x: True, resume=True) env = MaxAndSkipEnv(env, skip=self.config.skip_frame) env = PreproWrapper(env, prepro=greyscale, shape=(80, 80, 1), overwrite_render=self.config.overwrite_render) self.evaluate(env, 1) def run(self, exp_schedule, lr_schedule): """ Apply procedures of training for a QN Args: exp_schedule: exploration 
strategy for epsilon lr_schedule: schedule for learning rate """ # initialize self.initialize() # record one game at the beginning if self.config.record: self.record() # model self.train(exp_schedule, lr_schedule) # record one game at the end if self.config.record: self.record()
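# A minimal sketch of the soft-epsilon behaviour QN.get_action is meant to implement:
# with probability config.soft_epsilon a uniformly random action index is returned,
# otherwise the greedy action from get_best_action. A discrete action space of size
# num_actions is assumed; the names below are placeholders, not the class's attributes.
import numpy as np


def soft_epsilon_action(get_best_action, state, soft_epsilon, num_actions):
    if np.random.random() < soft_epsilon:
        return np.random.randint(num_actions)   # sample a uniform random action index
    return get_best_action(state)[0]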
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history return model(Variable(obs, volatile=True)).data.max(1)[1].cpu() else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function Q = q_func(input_arg, num_actions).type(dtype) target_Q = q_func(input_arg, num_actions).type(dtype) # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 for t in count(): ### Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break ### Step the env and store the transition # Store lastest observation in replay memory and last_idx can be used to store action, reward, done last_idx = replay_buffer.store_frame(last_obs) # encode_recent_observation will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. recent_observations = replay_buffer.encode_recent_observation() # Choose random action if not yet start learning if t > learning_starts: action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0] else: action = random.randrange(num_actions) # Advance one step obs, reward, done, _ = env.step(action) # clip rewards between -1 and 1 reward = max(-1.0, min(reward, 1.0)) # Store other info in replay memory replay_buffer.store_effect(last_idx, action, reward, done) # Resets the environment when reaching an episode boundary. if done: obs = env.reset() last_obs = obs ### Perform experience replay and train the network. 
# Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Use the replay buffer to sample a batch of transitions # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode, # in which case there is no Q-value at the next state; at the end of an # episode, only the current state reward contributes to the target obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) # Convert numpy nd_array to torch variables for calculation obs_batch = Variable( torch.from_numpy(obs_batch).type(dtype) / 255.0) act_batch = Variable(torch.from_numpy(act_batch).long()) rew_batch = Variable(torch.from_numpy(rew_batch)) next_obs_batch = Variable( torch.from_numpy(next_obs_batch).type(dtype) / 255.0) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: act_batch = act_batch.cuda() rew_batch = rew_batch.cuda() # Compute current Q value, q_func takes only state and output value for every state-action pair # We choose Q based on action taken. current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)) # Compute next Q value based on which action gives max Q values # Detach variable from the current graph since we don't want gradients for next Q to propagated next_max_q = target_Q(next_obs_batch).detach().max(1)[0] next_Q_values = not_done_mask * next_max_q # Compute the target of the current Q values target_Q_values = rew_batch + (gamma * next_Q_values) # Compute Bellman error bellman_error = target_Q_values - current_Q_values # clip the bellman error between [-1 , 1] clipped_bellman_error = bellman_error.clamp(-1, 1) # Note: clipped_bellman_delta * -1 will be right gradient d_error = clipped_bellman_error * -1.0 # Clear previous gradients before backward pass optimizer.zero_grad() # run backward pass current_Q_values.backward(d_error.data.unsqueeze(1)) # Perfom the update optimizer.step() num_param_updates += 1 # Periodically update the target network by Q network to target Q network if num_param_updates % target_update_freq == 0: target_Q.load_state_dict(Q.state_dict()) ### 4. Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open('statistics.pkl', 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % 'statistics.pkl')
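# The update above never forms an explicit loss: it clips the Bellman error, negates it,
# and feeds it straight into backward() as dL/dQ(s, a). For errors inside [-1, 1] this is
# exactly the gradient of 0.5 * (Q(s, a) - target)^2, and with the clipping it matches
# the Huber-loss gradient. The check below is a self-contained sketch of that
# equivalence on a toy linear Q-network (all names are placeholders).
import torch

torch.manual_seed(0)
q_layer = torch.nn.Linear(4, 3)                 # toy Q-network: 4 features, 3 actions
obs = torch.randn(5, 4)
act = torch.randint(0, 3, (5, 1))
target = torch.randn(5)

# (a) manual-gradient form, as in the training loop above
current = q_layer(obs).gather(1, act).squeeze(1)
d_error = (current.detach() - target).clamp(-1, 1)   # = -(target - current), clipped
q_layer.zero_grad()
current.backward(d_error)
manual_grad = q_layer.weight.grad.clone()

# (b) explicit Huber-loss form, which yields the same gradient
current = q_layer(obs).gather(1, act).squeeze(1)
loss = torch.nn.functional.smooth_l1_loss(current, target, reduction='sum')
q_layer.zero_grad()
loss.backward()
print(torch.allclose(manual_grad, q_layer.weight.grad, atol=1e-6))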
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): print("running new version") """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history return model(Variable(obs, volatile=True)).data.max(1)[1].cpu() else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function, i.e. build the model. 
""" ---------------------------- OUR CODE ---------------------------- """ Q = q_func(input_arg, num_actions) # The parameters are random Qtag = q_func(input_arg, num_actions) if (USE_CUDA): Q.cuda() Qtag.cuda() Qtag.load_state_dict(Q.state_dict()) # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) """ ------------------------------------------------------------------ """ ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() reward = None done = None info = None LOG_EVERY_N_STEPS = 10000 startTime = time.time() for t in count(): """ Tsuf: ---- Stuff for debigging times for various places --- """ T1 = 0 t1Tmp = 0 T2 = 0 t2Tmp = 0 T3 = 0 t3Tmp = 0 T4 = 0 t4Tmp = 0 T5 = 0 t5Tmp = 0 T6 = 0 t6Tmp = 0 T7 = 0 t7Tmp = 0 T8 = 0 t8Tmp = 0 """ ----------------------------------------------------------- """ ### 1. Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break #if (t>1000000): # break ### 2. Step the env and store the transition # At this point, "last_obs" contains the latest observation that was # recorded from the simulator. Here, your code needs to store this # observation and its outcome (reward, next observation, etc.) into # the replay buffer while stepping the simulator forward one step. # At the end of this block of code, the simulator should have been # advanced one step, and the replay buffer should contain one more # transition. # Specifically, last_obs must point to the new latest observation. # Useful functions you'll need to call: # obs, reward, done, info = env.step(action) # this steps the environment forward one step # obs = env.reset() # this resets the environment if you reached an episode boundary. # Don't forget to call env.reset() to get a new observation if done # is true!! # Note that you cannot use "last_obs" directly as input # into your network, since it needs to be processed to include context # from previous frames. You should check out the replay buffer # implementation in dqn_utils.py to see what functionality the replay # buffer exposes. The replay buffer has a function called # encode_recent_observation that will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # Don't forget to include epsilon greedy exploration! # And remember that the first time you enter this loop, the model # may not yet have been initialized (but of course, the first step # might as well be random, since you haven't trained your net...) 
""" -------------------------- OUR CODE -------------------------- """ #store last_obs, and get latest obs's as the input for the n.n t1Tmp = time.time() cur_idx = replay_buffer.store_frame(last_obs) next_input = replay_buffer.encode_recent_observation() T1 += time.time() - t1Tmp #take random action or use the net t2Tmp = time.time() action = select_epilson_greedy_action( Q, next_input, t) #the returned action is on the CPU T2 += time.time() - t2Tmp #see what happens after we take that action t3Tmp = time.time() last_obs, reward, done, info = env.step( action) #the returned parameters are on the CPU T3 += time.time() - t3Tmp # print(t) # env.render() #store the results on the replay buffer replay_buffer.store_effect(cur_idx, action, reward, done) #on the CPU #if the simulation is done, reset the environment if (done): last_obs = env.reset() """ -------------------------------------------------------------- """ # at this point, the environment should have been advanced one step (and # reset if done was true), and last_obs should point to the new latest # observation ### 3. Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Here, you should perform training. Training consists of four steps: # 3.a: use the replay buffer to sample a batch of transitions (see the # replay buffer code for function definition, each batch that you sample # should consist of current observations, current actions, rewards, # next observations, and done indicator). # Note: Move the variables to the GPU if avialable # 3.b: fill in your own code to compute the Bellman error. This requires # evaluating the current and next Q-values and constructing the corresponding error. # Note: don't forget to clip the error between [-1,1], multiply is by -1 (since pytorch minimizes) and # maskout post terminal status Q-values (see ReplayBuffer code). # 3.c: train the model. To do this, use the bellman error you calculated perviously. # Pytorch will differentiate this error for you, to backward the error use the following API: # current.backward(d_error.data.unsqueeze(1)) # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error. # Your code should produce one scalar-valued tensor. # Note: don't forget to call optimizer.zero_grad() before the backward call and # optimizer.step() after the backward call. # 3.d: periodically update the target network by loading the current Q network weights into the # target_Q network. see state_dict() and load_state_dict() methods. 
# you should update every target_update_freq steps, and you may find the # variable num_param_updates useful for this (it was initialized to 0) """ ------------------------ OUR CODE ------------------------ """ #sample a batch of history samples t4Tmp = time.time() obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) #on CPU obs_batch = torch.from_numpy(obs_batch).type( dtype) / 255.0 # When available, move the samples batch to GPU next_obs_batch = torch.from_numpy(next_obs_batch).type( dtype) / 255.0 #GPU T4 += time.time() - t4Tmp #see which Q values the current network gives, for all obs's t5Tmp = time.time() inter_Qs = Q( Variable(obs_batch)) #input is on GPU, output is on GPU inter_Qs_chosen = Variable( torch.zeros(batch_size).type(dtype)) #GPU #take the action that was chosen before for i in range(batch_size): inter_Qs_chosen[i] = inter_Qs[i, act_batch[i]] #take only the intermediate (non-terminal) obs's inter_idx = np.where(done_mask == False)[0] #CPU inter_next_obs_batch = next_obs_batch[inter_idx, :, :, :] T5 += time.time() - t5Tmp #see what the "target" (backuped) network says for the intermediate ones t6Tmp = time.time() inter_next_Qs = Qtag( Variable(inter_next_obs_batch, volatile=True)).data.max(1)[0] #All on GPU T6 += time.time() - t6Tmp #calculate the bellman errors t7Tmp = time.time() #for final obs's, the target is just the reward targets = torch.from_numpy(rew_batch).type( dtype) #Moved rew_batch to GPU (as 'targets') for (i, idx) in enumerate(inter_idx): targets[idx] += gamma * inter_next_Qs[i] #The bellman item # errors = -(inter_Qs_chosen.data - targets)**2 #EQUATION COULD BE WRONG!! [on GPU] # for i in range(len(errors)): # if errors[i]<-1: # errors[i] = -1 # elif errors[i]>1: # errors[i] = 1 errors = inter_Qs_chosen.data - targets errors.clamp(-1, 1) T7 += time.time() - t7Tmp #train the network! (: t8Tmp = time.time() optimizer.zero_grad() inter_Qs_chosen.backward( errors) #COULD BE WRONG WAY!! [Everything is on GPU (: ] optimizer.step() T8 += time.time() - t8Tmp num_param_updates += 1 if (num_param_updates % target_update_freq == 0): Qtag.load_state_dict(Q.state_dict()) """ ---------------------------------------------------------- """ ### 4. Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) Statistic["running_times"].append(int(time.time() - startTime)) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: if (PRINT_TIMES): print("-----------------------") print(T1) print(T2) print(T3) print(T4) print(T5) print(T6) print(T7) print(T8) print("-----------------------") print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open('statistics.pkl', 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % 'statistics.pkl')
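# The block above builds the targets with a Python loop over the non-terminal indices
# and calls clamp() on the errors; since clamp() is out-of-place, the clipped result has
# to be reassigned (or clamp_() used) for the clipping to take effect. The sketch below
# is a compact, vectorised version of the same target construction: terminal transitions
# keep reward-only targets, non-terminal ones add the discounted bootstrap value, and
# the clamped error tensor is kept. All tensor names are placeholders.
import torch


def clipped_bellman_errors(q_chosen, next_max_q, rewards, done_mask, gamma=0.99):
    """q_chosen, next_max_q, rewards: (batch,) tensors; done_mask: (batch,) of 0/1."""
    not_done = 1.0 - done_mask.float()
    targets = rewards + gamma * not_done * next_max_q    # reward-only target when done
    errors = (q_chosen - targets).clamp(-1, 1)           # keep the clamped result
    return errors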