def test_extend_prioritized():
    """Check that ``extend`` on a prioritized buffer matches repeated ``add`` calls.

    Two ``PrioritizedReplayBuffer`` instances built with the same size and
    alpha receive the same 16 random transitions: one through ``add`` (one
    transition per call), the other through a single ``extend`` call. Their
    lengths, stored items and ``_it_min`` / ``_it_sum`` values must then agree.
    """
    nvals = 16
    size, alpha = 32, 0.99

    # Random transition fields, drawn in the order: obs, action, reward,
    # next obs, done.
    states = [np.random.rand(2, 2) for _ in range(nvals)]
    actions = [np.random.rand(2) for _ in range(nvals)]
    rewards = [np.random.rand() for _ in range(nvals)]
    newstate = [np.random.rand(2, 2) for _ in range(nvals)]
    done = [np.random.randint(0, 2) for _ in range(nvals)]

    baseline = PrioritizedReplayBuffer(size, alpha)
    ext = PrioritizedReplayBuffer(size, alpha)

    # Reference buffer: one add() per transition.
    for transition in zip(states, actions, rewards, newstate, done):
        baseline.add(*transition)

    # Buffer under test: a single extend() with each field stacked into an array.
    batch = [np.array(column)
             for column in (states, actions, rewards, newstate, done)]
    ext.extend(*batch)

    assert len(baseline) == len(ext)

    # Check buffers have same values, field by field.
    for idx in range(nvals):
        expected, actual = baseline.storage[idx], ext.storage[idx]
        for field in range(5):
            same = expected[field] == actual[field]
            if not isinstance(same, np.ndarray):
                # Scalar fields (the original comment names done, reward, action).
                assert same
                continue
            # Array fields (the original comment names obs, obs_t1).
            assert np.all(same)

    # The priority structures must match as well.
    assert (baseline._it_min._value == ext._it_min._value).all()
    assert (baseline._it_sum._value == ext._it_sum._value).all()
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN"):
    """
    Run the DQN training loop for ``total_timesteps`` environment steps.

    :param total_timesteps: number of environment steps to run; it also scales
        the exploration schedule and is the default length of the beta
        schedule used with prioritized replay
    :param callback: optional callable invoked at the start of every step as
        ``callback(locals(), globals())``; its return value is ignored here
    :param seed: forwarded to ``self._setup_learn``
    :param log_interval: tabular stats are dumped when an episode ends and the
        episode count is a multiple of this value; ``None`` disables them
    :param tb_log_name: forwarded to ``TensorboardWriter``
    :return: this model (``self``)
    """
    # NOTE: local variable names are kept stable on purpose -- the callback
    # receives ``locals()`` and may look them up by name.
    with SetVerbosity(self.verbose), TensorboardWriter(
            self.graph, self.tensorboard_log, tb_log_name) as writer:
        self._setup_learn(seed)

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            # Fix: honour a user-supplied ``prioritized_replay_beta_iters``.
            # Previously the local was only bound in the ``None`` case, so a
            # configured value never produced a usable beta schedule.
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(self.exploration_fraction * total_timesteps),
            initial_p=1.0,
            final_p=self.exploration_final_eps)

        episode_rewards = [0.0]
        obs = self.env.reset()
        reset = True
        self.episode_reward = np.zeros((1, ))

        for step in range(total_timesteps):
            if callback is not None:
                callback(locals(), globals())

            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(step)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(step) +
                            self.exploration.value(step) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            with self.sess.as_default():
                action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = self.env.step(env_action)

            # Store transition in the replay buffer.
            self.replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            if writer is not None:
                ep_rew = np.array([rew]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(
                    self.episode_reward, ep_rew, ep_done, writer, step)

            episode_rewards[-1] += rew
            if done:
                # Only non-vectorized envs are reset explicitly here.
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)
                reset = True

            if step > self.learning_starts and step % self.train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if self.prioritized_replay:
                    experience = self.replay_buffer.sample(
                        self.batch_size, beta=self.beta_schedule.value(step))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                        self.batch_size)
                    # Uniform replay: every sample gets weight 1, no indices to update.
                    weights, batch_idxes = np.ones_like(rewards), None

                if writer is not None:
                    # run loss backprop with summary, but once every 100 steps save the metadata
                    # (memory, compute time, ...)
                    if (1 + step) % 100 == 0:
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        summary, td_errors = self._train_step(
                            obses_t, actions, rewards, obses_tp1, obses_tp1,
                            dones, weights, sess=self.sess,
                            options=run_options, run_metadata=run_metadata)
                        writer.add_run_metadata(run_metadata, 'step%d' % step)
                    else:
                        summary, td_errors = self._train_step(
                            obses_t, actions, rewards, obses_tp1, obses_tp1,
                            dones, weights, sess=self.sess)
                    writer.add_summary(summary, step)
                else:
                    _, td_errors = self._train_step(
                        obses_t, actions, rewards, obses_tp1, obses_tp1,
                        dones, weights, sess=self.sess)

                if self.prioritized_replay:
                    # The epsilon offset keeps every new priority strictly positive.
                    new_priorities = np.abs(
                        td_errors) + self.prioritized_replay_eps
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)

            if step > self.learning_starts and step % self.target_network_update_freq == 0:
                # Update target network periodically.
                self.update_target(sess=self.sess)

            # Mean over up to 100 finished episodes (the running one is excluded).
            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(
                    float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            if self.verbose >= 1 and done and log_interval is not None and len(
                    episode_rewards) % log_interval == 0:
                logger.record_tabular("steps", step)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular(
                    "% time spent exploring",
                    int(100 * self.exploration.value(step)))
                logger.dump_tabular()

    return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, initial_p=1.0):
    """
    Run the DQN training loop, delegating each environment step to
    ``self.hierarchical_step``.

    :param total_timesteps: maximum number of loop iterations; it also scales
        the exploration schedule and is the default beta-schedule length
    :param callback: optional callable invoked at the start of every iteration
        as ``callback(locals(), globals())``; training stops when it returns
        exactly ``False``
    :param seed: accepted but not used anywhere in this method
    :param log_interval: tabular stats are dumped when an episode ends and the
        episode count is a multiple of this value; ``None`` disables them
    :param tb_log_name: forwarded to ``TensorboardWriter``
    :param reset_num_timesteps: forwarded to ``self._init_num_timesteps``
    :param initial_p: starting value of the exploration schedule
    :return: tuple ``(self, ds_rewards)``
    """
    # Both attributes are reset at the start of every call; nothing in this
    # method reads them.
    self.actions_weights = []
    self.actions_container = []
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)
    # ``cnt`` and ``ds_rewards`` are only handed to ``hierarchical_step``;
    # ``cnt`` is never rebound here.
    # NOTE(review): ``ds_rewards`` is presumably mutated in place by that
    # helper -- confirm.
    cnt = 0
    ds_rewards = [[0, 0]]
    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn()

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            # The beta schedule spans the whole run unless a length was configured.
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        # Create the exploration schedule, starting from ``initial_p``.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(self.exploration_fraction * total_timesteps),
            initial_p=initial_p,
            final_p=self.exploration_final_eps)

        episode_rewards = [0.0]
        obs = self.env.reset()
        reset = True
        self.episode_reward = np.zeros((1, ))

        for _ in range(total_timesteps):
            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(self.num_timesteps)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(self.num_timesteps) +
                            self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            # NOTE(review): acting, stepping the env and storing the transition
            # presumably all happen inside ``hierarchical_step`` -- none of it
            # is visible in this method; confirm against the helper.
            ''' Hierarchical Step (Start) '''
            obs, new_obs, rew, action, done, reset = self.hierarchical_step(
                obs, ds_rewards, cnt, kwargs, update_eps)
            ''' Hierarchical Step (End) '''

            if writer is not None:
                ep_rew = np.array([rew]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(
                    self.episode_reward, ep_rew, ep_done, writer,
                    self.num_timesteps)

            episode_rewards[-1] += rew
            if done:
                # Only non-vectorized envs are reset explicitly here.
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)
                reset = True

            if self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if self.prioritized_replay:
                    experience = self.replay_buffer.sample(
                        self.batch_size,
                        beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                        self.batch_size)
                    # Uniform replay: every sample gets weight 1, no indices to update.
                    weights, batch_idxes = np.ones_like(rewards), None

                if writer is not None:
                    # run loss backprop with summary, but once every 100 steps save the metadata
                    # (memory, compute time, ...)
                    if (1 + self.num_timesteps) % 100 == 0:
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        summary, td_errors = self._train_step(
                            obses_t, actions, rewards, obses_tp1, obses_tp1,
                            dones, weights, sess=self.sess,
                            options=run_options, run_metadata=run_metadata)
                        writer.add_run_metadata(
                            run_metadata, 'step%d' % self.num_timesteps)
                    else:
                        summary, td_errors = self._train_step(
                            obses_t, actions, rewards, obses_tp1, obses_tp1,
                            dones, weights, sess=self.sess)
                    writer.add_summary(summary, self.num_timesteps)
                else:
                    _, td_errors = self._train_step(obses_t, actions, rewards,
                                                    obses_tp1, obses_tp1,
                                                    dones, weights,
                                                    sess=self.sess)

                if self.prioritized_replay:
                    # An AssertionError raised while computing or applying the
                    # new priorities is not propagated: the offending TD errors
                    # are printed and training continues.
                    try:
                        new_priorities = np.array([
                            abs(x) for x in td_errors.tolist()
                        ]) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)
                    except AssertionError:
                        print(td_errors)

            if self.num_timesteps > self.learning_starts and \
                    self.num_timesteps % self.target_network_update_freq == 0:
                # Update target network periodically.
                self.update_target(sess=self.sess)

            # Mean over up to 100 finished episodes (the running one is excluded).
            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(
                    float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            if self.verbose >= 1 and done and log_interval is not None and len(
                    episode_rewards) % log_interval == 0:
                logger.record_tabular("steps", self.num_timesteps)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular(
                    "% time spent exploring",
                    int(100 * self.exploration.value(self.num_timesteps)))
                logger.dump_tabular()

            # The timestep counter advances only after all per-step work is done.
            self.num_timesteps += 1

    return self, ds_rewards
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None):
    """
    Run the DQN training loop, choosing at each step the first candidate
    action that has not already been taken in the current episode.

    :param total_timesteps: maximum number of loop iterations; it also scales
        the exploration schedule and is the default beta-schedule length
    :param callback: optional callable invoked at the start of every iteration
        as ``callback(locals(), globals())``; training stops when it returns
        exactly ``False``
    :param log_interval: tabular stats are dumped when an episode ends and the
        episode count is a multiple of this value; ``None`` disables them
    :param tb_log_name: forwarded to ``TensorboardWriter``
    :param reset_num_timesteps: forwarded to ``self._init_num_timesteps``
    :param replay_wrapper: optional callable applied to the freshly created
        replay buffer; it must not be combined with prioritized replay
    :return: this model (``self``)
    """
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn()

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            # The beta schedule spans the whole run unless a length was configured.
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        if replay_wrapper is not None:
            assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        # Create the exploration schedule, starting from
        # ``self.exploration_initial_eps``.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(self.exploration_fraction * total_timesteps),
            initial_p=self.exploration_initial_eps,
            final_p=self.exploration_final_eps)

        episode_rewards = [0.0]
        episode_successes = []

        obs = self.env.reset()
        reset = True

        # MODIFICATION:
        # Track list of actions taken each episode. This is
        # intentionally not a set so that we can use np.isin.
        action_list = list()

        for _ in range(total_timesteps):
            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(self.num_timesteps)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(self.num_timesteps) +
                            self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            with self.sess.as_default():
                # MODIFICATION:
                # Rename variable from original, since it's now
                # going to come back as an array due to the
                # modified build_act function being used to
                # construct everything.
                # NOTE(review): the array is presumably ordered best action
                # first -- confirm against the modified build_act.
                action_arr = self.act(np.array(obs)[None],
                                      update_eps=update_eps, **kwargs)[0]
                # ORIGINAL:
                # action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]

            # MODIFICATION:
            # Get the best action that has not yet been taken this
            # episode. ``np.isin`` marks the candidates already in
            # ``action_list`` and ``np.argmin`` returns the index of the first
            # unmarked one; if every candidate was already taken it returns 0,
            # so the first candidate is repeated.
            action = \
                action_arr[np.argmin(np.isin(action_arr, action_list))]
            # Add this action to the list.
            action_list.append(action)

            env_action = action
            reset = False
            new_obs, rew, done, info = self.env.step(env_action)

            # Store transition in the replay buffer.
            self.replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            if writer is not None:
                ep_rew = np.array([rew]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                # The return value is not captured here.
                # NOTE(review): presumably ``self.episode_reward`` is updated
                # in place by the logger -- confirm.
                total_episode_reward_logger(self.episode_reward, ep_rew,
                                            ep_done, writer,
                                            self.num_timesteps)

            episode_rewards[-1] += rew
            if done:
                # MODIFICATION:
                # Clear the list so the next episode starts with no taken actions.
                action_list.clear()

                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))
                # Only non-vectorized envs are reset explicitly here.
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)
                reset = True

            # Do not train if the warmup phase is not over
            # or if there are not enough samples in the replay buffer
            can_sample = self.replay_buffer.can_sample(self.batch_size)
            if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                # pytype:disable=bad-unpacking
                if self.prioritized_replay:
                    assert self.beta_schedule is not None, \
                        "BUG: should be LinearSchedule when self.prioritized_replay True"
                    experience = self.replay_buffer.sample(
                        self.batch_size,
                        beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                        self.batch_size)
                    # Uniform replay: every sample gets weight 1, no indices to update.
                    weights, batch_idxes = np.ones_like(rewards), None
                # pytype:enable=bad-unpacking

                if writer is not None:
                    # run loss backprop with summary, but once every 100 steps save the metadata
                    # (memory, compute time, ...)
                    if (1 + self.num_timesteps) % 100 == 0:
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        summary, td_errors = self._train_step(
                            obses_t, actions, rewards, obses_tp1, obses_tp1,
                            dones, weights, sess=self.sess,
                            options=run_options, run_metadata=run_metadata)
                        writer.add_run_metadata(
                            run_metadata, 'step%d' % self.num_timesteps)
                    else:
                        summary, td_errors = self._train_step(
                            obses_t, actions, rewards, obses_tp1, obses_tp1,
                            dones, weights, sess=self.sess)
                    writer.add_summary(summary, self.num_timesteps)
                else:
                    _, td_errors = self._train_step(obses_t, actions, rewards,
                                                    obses_tp1, obses_tp1,
                                                    dones, weights,
                                                    sess=self.sess)

                if self.prioritized_replay:
                    new_priorities = np.abs(
                        td_errors) + self.prioritized_replay_eps
                    assert isinstance(self.replay_buffer,
                                      PrioritizedReplayBuffer)
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)

            if can_sample and self.num_timesteps > self.learning_starts and \
                    self.num_timesteps % self.target_network_update_freq == 0:
                # Update target network periodically.
                self.update_target(sess=self.sess)

            # Mean over up to 100 finished episodes (the running one is excluded).
            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(
                    float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            if self.verbose >= 1 and done and log_interval is not None and len(
                    episode_rewards) % log_interval == 0:
                logger.record_tabular("steps", self.num_timesteps)
                logger.record_tabular("episodes", num_episodes)
                if len(episode_successes) > 0:
                    logger.logkv("success rate",
                                 np.mean(episode_successes[-100:]))
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular(
                    "% time spent exploring",
                    int(100 * self.exploration.value(self.num_timesteps)))
                logger.dump_tabular()

            # The timestep counter advances only after all per-step work is done.
            self.num_timesteps += 1

    return self
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) # callback = self._init_callback(callback) # with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ # as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [[0.0] * self.num_agents] #MA-MOD episode_successes = [] #callback.on_training_start(locals(), globals()) #callback.on_rollout_start() reset = True obs = self.env.reset() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). 
# See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): env_action = [] # MA-MOD for i in range(self.num_agents ): # MA-MOD. This is fine for one policy. action = self.act[i]( np.array(obs[i])[None], update_eps=update_eps, **kwargs )[0] # TODO: Is this the correct way to get the correct agent obs? env_action.append(action) reset = False new_obs, rew, done, info = self.env.step( env_action ) # NOUPDATE - env.step should take a vector of actions ''' Obs: x_me, x_opp --- agent 1. In env: x_1, x_2 Obs: x_me, x_opp -- agent 2. In env: x_2, x_1 Env: (n_agents, state_dim) ''' self.num_timesteps += 1 # Stop training if return value is False # if callback.on_step() is False: # break # Store transition in the replay buffer. # Loop for replay buffer -- either separate or joined. obs[agent_index], action[agent_index], reward[agent_index] # Joey: Does this look right to you? 
# print(obs, action, rew, new_obs, done) #print("obs",obs[0]) #print(action) #print("ac", action[0]) #print("rew", rew[0]) #print("done", done[0]) for num_agent in range(self.num_agents): self.replay_buffer.add(obs[num_agent], env_action[num_agent], rew[num_agent], new_obs[num_agent], float(done[num_agent])) obs = new_obs # if writer is not None: # ep_rew = np.array([rew]).reshape((1, -1)) # ep_done = np.array([done]).reshape((1, -1)) # tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, # self.num_timesteps) # TODO: current episode_rewards is a list, make it a list of lists where each list is the reward for each agent in all timesteps # append the newest reward to the end of each list for each agent for num_agent in range(self.num_agents): #MA-MOD episode_rewards[-1][num_agent] += rew[num_agent] if done.any(): maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append([0.0] * self.num_agents) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # callback.on_rollout_end() for i in range(self.num_agents): # MA-MOD # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
# pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking # if writer is not None: # # run loss backprop with summary, but once every 100 steps save the metadata # # (memory, compute time, ...) # if (1 + self.num_timesteps) % 100 == 0: # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) # run_metadata = tf.RunMetadata() # summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, # dones, weights, sess=self.sess, options=run_options, # run_metadata=run_metadata) # writer.add_run_metadata(run_metadata, 'step%d_agent%d' % (self.num_timesteps, i)) # else: # summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, # dones, weights, sess=self.sess) # writer.add_summary(summary, self.num_timesteps) # else: td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: # NOUPDATE - not inside main agent for loop new_priorities = np.abs( td_errors) + self.prioritized_replay_eps # NOUPDATE assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) # callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
for i in range(self.num_agents): self.update_target[i](sess=self.sess) # MA-MOD if len(episode_rewards[-101:-1]) == 0: # MA-MOD mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) #MA-MOD # below is what's logged in terminal. num_episodes = len(episode_rewards) #MA-MOD if self.verbose >= 1 and done.any( ) and log_interval is not None and len( episode_rewards) % log_interval == 0: #MA-MOD logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None):
    """
    Run the DQN training loop under external control through ``Globals``.

    The loop publishes the environment as ``Globals.env``, clears
    ``Globals.loading`` when it is set, waits while ``Globals.pause_game`` is
    truthy, stops when ``Globals.exit_learning`` is truthy, and decrements
    ``Globals.steps`` after every completed step. Every step is also
    throttled with ``time.sleep``.

    :param total_timesteps: maximum number of loop iterations; it also scales
        the exploration schedule and is the default beta-schedule length
    :param callback: optional callable invoked every iteration as
        ``callback(locals(), globals())``; training stops when it returns
        exactly ``False``
    :param seed: forwarded to ``self._setup_learn``
    :param log_interval: tabular stats are dumped when an episode ends and the
        episode count is a multiple of this value; ``None`` disables them
    :param tb_log_name: forwarded to ``TensorboardWriter``
    :param reset_num_timesteps: forwarded to ``self._init_num_timesteps``
    :param replay_wrapper: optional callable applied to the freshly created
        replay buffer; it must not be combined with prioritized replay
    :return: this model (``self``)
    """
    # NOTE: local variable names are kept stable on purpose -- the callback
    # receives ``locals()`` and may look them up by name.
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn(seed)

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            # The beta schedule spans the whole run unless a length was configured.
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        if replay_wrapper is not None:
            assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(self.exploration_fraction * total_timesteps),
            initial_p=1.0,
            final_p=self.exploration_final_eps)

        episode_rewards = [0.0]
        episode_successes = []
        Globals.env = self.env
        obs = self.env.reset()
        reset = True
        self.episode_reward = np.zeros((1, ))

        # Bookkeeping for the average episode length between two log lines.
        timesteps_last_log = 0
        avr_ep_len_per_log = None
        # Per-step delay in seconds; shortened once the buffer can be sampled.
        sleep = 0.045

        for _ in range(total_timesteps):
            if Globals.loading:
                Globals.loading = False
            # Fix: yield the CPU while paused instead of spinning on ``pass``.
            while Globals.pause_game:
                time.sleep(0.01)
            if Globals.exit_learning:
                break

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(self.num_timesteps)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(self.num_timesteps) +
                            self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            with self.sess.as_default():
                action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, info = self.env.step(env_action)

            # Store transition in the replay buffer.
            self.replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            if writer is not None:
                ep_rew = np.array([rew]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(
                    self.episode_reward, ep_rew, ep_done, writer,
                    self.num_timesteps)

            episode_rewards[-1] += rew
            if done:
                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))
                # Only non-vectorized envs are reset explicitly here.
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)
                reset = True

            # Do not train if the warmup phase is not over
            # or if there are not enough samples in the replay buffer
            can_sample = self.replay_buffer.can_sample(self.batch_size)

            if can_sample:
                sleep = 0.035
            time.sleep(sleep)

            if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if self.prioritized_replay:
                    experience = self.replay_buffer.sample(
                        self.batch_size,
                        beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                        self.batch_size)
                    # Uniform replay: every sample gets weight 1, no indices to update.
                    weights, batch_idxes = np.ones_like(rewards), None

                if writer is not None:
                    # run loss backprop with summary, but once every 100 steps save the metadata
                    # (memory, compute time, ...)
                    if (1 + self.num_timesteps) % 100 == 0:
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        summary, td_errors = self._train_step(
                            obses_t, actions, rewards, obses_tp1, obses_tp1,
                            dones, weights, sess=self.sess,
                            options=run_options, run_metadata=run_metadata)
                        writer.add_run_metadata(
                            run_metadata, 'step%d' % self.num_timesteps)
                    else:
                        summary, td_errors = self._train_step(
                            obses_t, actions, rewards, obses_tp1, obses_tp1,
                            dones, weights, sess=self.sess)
                    writer.add_summary(summary, self.num_timesteps)
                else:
                    _, td_errors = self._train_step(obses_t, actions, rewards,
                                                    obses_tp1, obses_tp1,
                                                    dones, weights,
                                                    sess=self.sess)

                if self.prioritized_replay:
                    new_priorities = np.abs(
                        td_errors) + self.prioritized_replay_eps
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)

            if can_sample and self.num_timesteps > self.learning_starts and \
                    self.num_timesteps % self.target_network_update_freq == 0:
                # Update target network periodically.
                self.update_target(sess=self.sess)

            # Mean over up to 100 finished episodes (the running one is excluded).
            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(
                    float(np.mean(episode_rewards[-101:-1])), 1)

            # Fix: refresh the average only on the step that closes a logging
            # window. Doing it on every step of the following episode moved
            # ``timesteps_last_log`` forward and dropped one episode from each
            # later window. The ``None`` guard avoids a TypeError on ``%``
            # when logging is disabled.
            if done and log_interval is not None and len(
                    episode_rewards) % log_interval == 0:
                avr_ep_len_per_log = (self.num_timesteps -
                                      timesteps_last_log) / log_interval
                timesteps_last_log = self.num_timesteps

            num_episodes = len(episode_rewards)
            if self.verbose >= 1 and done and log_interval is not None and len(
                    episode_rewards) % log_interval == 0:
                logger.record_tabular("steps", self.num_timesteps)
                logger.record_tabular("episodes", num_episodes)
                if len(episode_successes) > 0:
                    logger.logkv("success rate",
                                 np.mean(episode_successes[-100:]))
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular(
                    "% time spent exploring",
                    int(100 * self.exploration.value(self.num_timesteps)))
                logger.record_tabular("avr length of last logged ep",
                                      avr_ep_len_per_log)
                logger.dump_tabular()

            # Both counters change only after all per-step work is done.
            self.num_timesteps += 1
            Globals.steps -= 1

    return self
def learn(self,
          total_timesteps,
          callback=None,
          log_interval=100,
          tb_log_name="DQN",
          reset_num_timesteps=True,
          replay_wrapper=None,
          distinct_replay_buffer=False):
    """
    Train the macro-action controller together with its sub-models.

    Every ``self.macro_len`` environment steps (or right after a reset) ``self.act`` selects a
    macro action, i.e. the index of one entry of ``self.sub_models``. That sub-model then
    produces the primitive action sent to the environment. One aggregated transition per macro
    step is stored in ``self.replay_buffer``; primitive transitions are stored in the
    sub-models' own replay buffers. The controller is trained through ``self._train_step`` and
    the sub-models through their own ``_train_step``.

    NOTE(review): the block nesting below was reconstructed from a whitespace-collapsed view
    of the file - confirm it against the original layout.

    :param total_timesteps: (int) number of iterations of the main loop (one env step each)
    :param callback: (callable) called at the start of every iteration with ``locals()`` and
        ``globals()``; training stops only if it returns exactly ``False``
    :param log_interval: (int) tabular logs are dumped when an episode ends and the number of
        entries in ``episode_rewards`` is a multiple of this value
    :param tb_log_name: (str) name forwarded to ``TensorboardWriter``
    :param reset_num_timesteps: (bool) forwarded to ``self._init_num_timesteps``
    :param replay_wrapper: (callable) if given, applied to ``self.replay_buffer``; asserted to be
        incompatible with prioritized replay
    :param distinct_replay_buffer: (bool) if True a primitive transition is only added to the
        buffer of the sub-model that produced it, otherwise it is added to every sub-model
    :return: self
    """
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    # Prepare every sub-model: turn its learning rate into a schedule function, optionally wrap
    # its replay buffer with the wrapper of the same index, then run its own setup.
    for i, m in enumerate(self.sub_models):
        m.learning_rate = get_schedule_fn(m.learning_rate)
        if len(self.replay_wrappers) != 0:
            m.replay_buffer = self.replay_wrappers[i](m.replay_buffer)
        m._setup_learn()

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn()

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            # Anneal beta over the whole run unless an explicit horizon was configured
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        if replay_wrapper is not None:
            assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(self.exploration_fraction *
                                   total_timesteps),
            initial_p=self.exploration_initial_eps,
            final_p=self.exploration_final_eps)

        episode_rewards = [0.0]
        episode_successes = []
        obs = self.env.reset()
        reset = True
        # Number of primitive steps taken in the current episode; drives macro-action renewal
        macro_count = 0
        macro_len = self.macro_len
        # Macro action used at each primitive step of the current episode (for logging)
        macro_choices = []
        # Total number of sub-model gradient updates performed so far
        n_updates = 0

        for step in range(total_timesteps):
            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                # The callback sees every local variable of this function by name.
                if callback(locals(), globals()) is False:
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(self.num_timesteps)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(self.num_timesteps) +
                            self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            with self.sess.as_default():
                # Pick a new macro action at the start of an episode and every macro_len steps;
                # remember the observation it was chosen from and restart the reward accumulator.
                if reset or macro_count % macro_len == 0:
                    macro_action = self.act(np.array(obs)[None],
                                            update_eps=update_eps,
                                            **kwargs)[0]
                    # macro_action = 1
                    macro_obs = obs
                    reward_in_one_macro = 0

            macro_count += 1
            macro_choices.append(macro_action)

            # use sub_model to decide action
            # env_action = self.sub_models[macro_action]
            current_sub = self.sub_models[macro_action]
            if self.num_timesteps < self.learning_starts or np.random.rand(
            ) < current_sub.random_exploration:
                # actions sampled from action space are from range specific to the environment
                # but algorithm operates on tanh-squashed actions therefore simple scaling is used
                unscaled_action = self.env.action_space.sample()
                action = scale_action(self.env.action_space,
                                      unscaled_action)
            else:
                action = current_sub.policy_tf.step(
                    obs[None], deterministic=False).flatten()
                # Add noise to the action (improve exploration,
                # not needed in general)
                if current_sub.action_noise is not None:
                    action = np.clip(action + current_sub.action_noise(),
                                     -1, 1)
                # inferred actions need to be transformed to environment action_space before stepping
                unscaled_action = unscale_action(self.env.action_space,
                                                 action)

            assert action.shape == self.env.action_space.shape

            reset = False
            new_obs, rew, done, info = self.env.step(unscaled_action)
            # The episode return uses the raw env reward; the macro-level reward below is
            # additionally charged the per-step cost of the selected sub-policy.
            episode_rewards[-1] += rew
            # rew -= self.args.policy_cost_coef * self.args.sub_policy_costs[macro_action]
            reward_in_one_macro += rew - self.args.policy_cost_coef * self.args.sub_policy_costs[
                macro_action]
            # Store transition in the replay buffer.
            # The controller only gets one transition per completed (or terminated) macro step.
            if macro_count % macro_len == 0 or done:
                self.replay_buffer.add(macro_obs, macro_action,
                                       reward_in_one_macro, new_obs,
                                       float(done))
            # Primitive transitions go to the sub-models: only to the active one when
            # distinct_replay_buffer is set, otherwise to all of them.
            for i, m in enumerate(self.sub_models):
                if distinct_replay_buffer:
                    if i == macro_action:
                        m.replay_buffer.add(obs, action, rew, new_obs,
                                            float(done))
                else:
                    m.replay_buffer.add(obs, action, rew, new_obs,
                                        float(done))
            obs = new_obs

            if writer is not None:
                ep_rew = np.array([rew]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                total_episode_reward_logger(self.episode_reward, ep_rew,
                                            ep_done, writer,
                                            self.num_timesteps)

            # print("step: %d, done: %d" % (self.num_timesteps, done))
            if done:
                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))
                # Only reset explicitly when the env is not a VecEnv
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)
                reset = True
                # Start the next episode with a fresh macro action and keep the finished
                # episode's choices around for the log output further down.
                macro_action = None
                macro_count = 0
                prev_macro_choices = macro_choices
                macro_choices = []

            # Do not train if the warmup phase is not over
            # or if there are not enough samples in the replay buffer
            can_sample = self.replay_buffer.can_sample(self.batch_size)

            if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                # pytype:disable=bad-unpacking
                if self.prioritized_replay:
                    assert self.beta_schedule is not None, \
                        "BUG: should be LinearSchedule when self.prioritized_replay True"
                    experience = self.replay_buffer.sample(
                        self.batch_size,
                        beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                        self.batch_size)
                    # Uniform replay: every sample gets weight 1 and there are no priorities
                    weights, batch_idxes = np.ones_like(rewards), None
                # pytype:enable=bad-unpacking

                if writer is not None:
                    # run loss backprop with summary, but once every 100 steps save the metadata
                    # (memory, compute time, ...)
                    if (1 + self.num_timesteps) % 100 == 0:
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        summary, td_errors = self._train_step(
                            obses_t,
                            actions,
                            rewards,
                            obses_tp1,
                            obses_tp1,
                            dones,
                            weights,
                            sess=self.sess,
                            options=run_options,
                            run_metadata=run_metadata)
                        writer.add_run_metadata(
                            run_metadata, 'step%d' % self.num_timesteps)
                    else:
                        summary, td_errors = self._train_step(
                            obses_t,
                            actions,
                            rewards,
                            obses_tp1,
                            obses_tp1,
                            dones,
                            weights,
                            sess=self.sess)
                    writer.add_summary(summary, self.num_timesteps)
                else:
                    _, td_errors = self._train_step(obses_t,
                                                    actions,
                                                    rewards,
                                                    obses_tp1,
                                                    obses_tp1,
                                                    dones,
                                                    weights,
                                                    sess=self.sess)

                if self.prioritized_replay:
                    # New priority = |TD error| + eps
                    new_priorities = np.abs(
                        td_errors) + self.prioritized_replay_eps
                    assert isinstance(self.replay_buffer,
                                      PrioritizedReplayBuffer)
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)

            if can_sample and self.num_timesteps > self.learning_starts and \
                    self.num_timesteps % self.target_network_update_freq == 0:
                # Update target network periodically.
                self.update_target(sess=self.sess)

            # Sub-model training; the train frequency of the first sub-model gates all of them
            if step % self.sub_models[0].train_freq == 0:
                mb_infos_vals = []
                for m in self.sub_models:
                    # Update policy, critics and target networks
                    for grad_step in range(m.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not m.replay_buffer.can_sample(m.batch_size) \
                                or self.num_timesteps < m.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = m.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            m._train_step(step, writer, current_lr))
                        # Update target network
                        if (step +
                                grad_step) % m.target_update_interval == 0:
                            # Update target network
                            m.sess.run(m.target_update_op)

            # Mean over (at most) the last 100 finished episodes; the running one is excluded
            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(
                    float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            # print(done, log_interval, len(episode_rewards), self.num_timesteps)
            if self.verbose >= 1 and done and log_interval is not None and len(
                    episode_rewards) % log_interval == 0:
                logger.record_tabular("steps", self.num_timesteps)

                # Fraction of the finished episode's steps spent under each macro action
                prev_macro_choices = np.array(prev_macro_choices)
                macro_choices_ratio = [
                    '%.2f' %
                    ((prev_macro_choices[prev_macro_choices == i]).size /
                     prev_macro_choices.size) for i in range(self.n_actions)
                ]
                logger.record_tabular("macro choices", macro_choices_ratio)
                logger.record_tabular("episodes", num_episodes)
                if len(episode_successes) > 0:
                    logger.logkv("success rate",
                                 np.mean(episode_successes[-100:]))
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular(
                    "% time spent exploring",
                    int(100 * self.exploration.value(self.num_timesteps)))
                logger.logkv("n_updates_of_sub", n_updates)
                logger.dump_tabular()
                print("macro choices", prev_macro_choices)

            self.num_timesteps += 1

    return self
def learn(self,
          total_timesteps,
          callback=None,
          seed=None,
          log_interval=100,
          tb_log_name="DQN",
          reset_num_timesteps=True,
          replay_wrapper=None,
          learning_curve=False,
          test_t=None):
    """
    Train the DQN agent on the allocation environment, optionally recording a learning curve.

    At every step the feasible actions of the current board are turned into a mask for
    ``self.act``, the chosen action is validated with ``AllocationEnv.check_action``, the
    transition is stored in the replay buffer and the network is trained on sampled batches.

    When ``learning_curve`` is True, every ``3 * test_t`` training steps the environment is
    reset and ``self.predict`` is rolled out for ``test_t`` steps. The summed test reward is
    plotted to ``figs/rl-learning-curve-<prj_name>.pdf`` and dumped to
    ``output/rl-learning-curve-<prj_name>.json``. The environment is reset again before
    training resumes.

    :param total_timesteps: (int) number of training steps
    :param callback: (callable) called at every step with ``locals()`` and ``globals()``;
        training stops only if it returns exactly ``False``
    :param seed: (int) forwarded to ``self._setup_learn``
    :param log_interval: (int) tabular logs are dumped when an episode ends and the number of
        entries in ``episode_rewards`` is a multiple of this value
    :param tb_log_name: (str) name forwarded to ``TensorboardWriter``
    :param reset_num_timesteps: (bool) forwarded to ``self._init_num_timesteps``
    :param replay_wrapper: (callable) if given, applied to ``self.replay_buffer``; asserted to be
        incompatible with prioritized replay
    :param learning_curve: (bool) enable the periodic test evaluation described above
    :param test_t: (int) length of one test rollout; required (and positive) only when
        ``learning_curve`` is True
    :return: self
    :raises ValueError: if ``learning_curve`` is True and ``test_t`` is missing or not positive
    """
    # test_t is only needed for the learning curve. Previously ``test_t * 3`` was evaluated
    # unconditionally and crashed with the default test_t=None.
    if learning_curve and (test_t is None or test_t <= 0):
        raise ValueError(
            "test_t must be a positive integer when learning_curve is True")

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn(seed)

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            # Anneal beta over the whole run unless an explicit horizon was configured
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        if replay_wrapper is not None:
            assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(self.exploration_fraction *
                                   total_timesteps),
            initial_p=1.0,
            final_p=self.exploration_final_eps)

        episode_rewards = [0.0]
        # Running sum of every training reward, one entry per step
        self.cumul_reward = [0.0]
        episode_successes = []
        obs = self.env.reset()
        reset = True
        self.episode_reward = np.zeros((1, ))

        # variables for test eval ##
        test_step = test_t * 3 if learning_curve else None
        test_results = {'sum': []}
        test_ts = []

        # NOTE: local names (including the loop index ``_``) are left untouched on purpose,
        # the callback receives them through locals().
        for _ in range(total_timesteps):

            ## Test eval period ##
            if learning_curve and _ % test_step == 0 and _ > 0:
                print("--> Simulating test period")
                # Keep the observation returned by reset(): the feasibility mask of the first
                # test step must describe the freshly reset board, not the pre-reset one.
                obs = self.env.reset()
                test_r = 0.0
                for i in range(test_t):
                    feasible_actions = AllocationEnv.get_feasible_actions(
                        obs["board_config"])
                    action_mask = AllocationEnv.get_action_mask(
                        feasible_actions, self.env.action_space.n)
                    action, _states = self.predict(obs, mask=action_mask)
                    action = AllocationEnv.check_action(
                        obs['board_config'], action)
                    obs, rewards, dones, info = self.env.step(action)
                    test_r += rewards

                test_results["sum"].append(test_r)
                test_ts.append(_)
                # Resume training from the freshly reset environment instead of the last test
                # observation, and flag the reset for the action-selection kwargs.
                # NOTE(review): an interrupted training episode keeps accumulating into
                # episode_rewards[-1] - confirm whether that is acceptable for the logs.
                obs = self.env.reset()
                reset = True

                # plot test eval progress
                plt.plot(test_ts, test_results["sum"])
                # plt.errorbar(iteration_cuts, results["mean"], yerr=results["std"], fmt='.k')
                plt.xlabel("Iteration count")
                plt.ylabel("Total (sum) test reward")
                plt.savefig("figs/rl-learning-curve-{}.pdf".format(
                    cfg.vals['prj_name']))
                plt.clf()
                plt.close()

                # write test eval progress
                write_results = {}
                for k, v in test_results.items():
                    write_results[k] = serialize_floats(v)

                with open(
                        "output/rl-learning-curve-{}.json".format(
                            cfg.vals['prj_name']), 'w') as f:
                    json.dump(write_results, f)

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(self.num_timesteps)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(self.num_timesteps) +
                            self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            # Restrict the policy to actions that are feasible for the current board
            feasible_actions = AllocationEnv.get_feasible_actions(
                obs["board_config"])
            action_mask = AllocationEnv.get_action_mask(
                feasible_actions, self.action_space.n)
            with self.sess.as_default():
                action = self.act(State.get_vec_observation(obs)[None],
                                  update_eps=update_eps,
                                  **kwargs,
                                  mask=action_mask)[0]
            reset = False
            # CHECK IF ACTIONS IS FEASIBLE
            action = AllocationEnv.check_action(obs['board_config'], action)
            env_action = action
            new_obs, rew, done, info = self.env.step(env_action)
            print("action: {} - reward: {} - eps: {:.4}".format(
                action, rew, update_eps))
            print(new_obs['day_vec'])
            print(new_obs['board_config'])
            # Store transition in the replay buffer.
            self.replay_buffer.add(State.get_vec_observation(obs), action,
                                   rew, State.get_vec_observation(new_obs),
                                   float(done))
            obs = new_obs

            if writer is not None:
                ep_rew = np.array([rew]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(
                    self.episode_reward, ep_rew, ep_done, writer,
                    self.num_timesteps)

            episode_rewards[-1] += rew
            self.cumul_reward.append(self.cumul_reward[-1] + rew)
            if done:
                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))
                # Only reset explicitly when the env is not a VecEnv
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)
                reset = True

            # Do not train if the warmup phase is not over
            # or if there are not enough samples in the replay buffer
            can_sample = self.replay_buffer.can_sample(self.batch_size)

            if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if self.prioritized_replay:
                    experience = self.replay_buffer.sample(
                        self.batch_size,
                        beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                        self.batch_size)
                    # Uniform replay: every sample gets weight 1 and there are no priorities
                    weights, batch_idxes = np.ones_like(rewards), None

                if writer is not None:
                    # run loss backprop with summary, but once every 100 steps save the metadata
                    # (memory, compute time, ...)
                    if (1 + self.num_timesteps) % 100 == 0:
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        summary, td_errors = self._train_step(
                            obses_t,
                            actions,
                            rewards,
                            obses_tp1,
                            obses_tp1,
                            dones,
                            weights,
                            sess=self.sess,
                            options=run_options,
                            run_metadata=run_metadata)
                        writer.add_run_metadata(
                            run_metadata, 'step%d' % self.num_timesteps)
                    else:
                        summary, td_errors = self._train_step(
                            obses_t,
                            actions,
                            rewards,
                            obses_tp1,
                            obses_tp1,
                            dones,
                            weights,
                            sess=self.sess)
                    writer.add_summary(summary, self.num_timesteps)
                else:
                    _, td_errors = self._train_step(obses_t,
                                                    actions,
                                                    rewards,
                                                    obses_tp1,
                                                    obses_tp1,
                                                    dones,
                                                    weights,
                                                    sess=self.sess)

                if self.prioritized_replay:
                    # New priority = |TD error| + eps
                    new_priorities = np.abs(
                        td_errors) + self.prioritized_replay_eps
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)

            if can_sample and self.num_timesteps > self.learning_starts and \
                    self.num_timesteps % self.target_network_update_freq == 0:
                # Update target network periodically.
                self.update_target(sess=self.sess)

            # Mean over (at most) the last 100 finished episodes; the running one is excluded
            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(
                    float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            if self.verbose >= 1 and done and log_interval is not None and len(
                    episode_rewards) % log_interval == 0:
                logger.record_tabular("steps", self.num_timesteps)
                logger.record_tabular("episodes", num_episodes)
                if len(episode_successes) > 0:
                    logger.logkv("success rate",
                                 np.mean(episode_successes[-100:]))
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular(
                    "% time spent exploring",
                    int(100 * self.exploration.value(self.num_timesteps)))
                logger.dump_tabular()

            # ``end`` belongs to print(); it used to be passed to str.format(), which silently
            # ignores unused keyword arguments.
            print('timestamp: {}'.format(self.num_timesteps), end='\r\n')
            self.num_timesteps += 1

    return self
def __init__(self, args):
    """
    Build the agent: load the parameters, create the replay buffer and the two networks.

    The parameter dictionary is read with ``json_to_dict(args["path"])`` and then completed
    with values taken from ``args``. Depending on the parameters the replay buffer is a
    ``RankBasedReplay``, a ``PrioritizedReplayBuffer`` or a plain ``ReplayBuffer``. For the
    two prioritized variants a ``LinearSchedule`` for beta (ending at 1.0) is created as well.
    Finally the TensorFlow graph is reset, the Q-network and the target network are built and
    attached to a new session, and the bookkeeping counters are initialised.

    :param args: (dict) must provide the keys "path", "width", "height", "numTraining",
        "numGames" and "seed"
    """
    # Load parameters from user-given arguments
    self.params = json_to_dict(args["path"])
    os.environ["CUDA_VISIBLE_DEVICES"] = str(self.params["GPU"])
    self.params['width'] = args['width']
    self.params['height'] = args['height']
    self.params['num_training'] = args['numTraining']
    self.params['num_games'] = args['numGames']
    self.path_extra = ""
    self.params["seed"] = args['seed']
    self.random = np.random.RandomState(self.params["seed"])

    # time started
    self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S",
                                             time.localtime())
    self.start_time = time.time()

    # Defaults for the non-prioritized / non-ranked configurations
    self.beta_schedule = None
    self.rank_sort = None

    if self.params["prioritized"]:
        # For using PrioritizedReplayBuffer
        if self.params["ranked"]:
            N_list = [self.params["batch_size"]] + [
                int(x) for x in np.linspace(100, self.params["mem_size"], 5)
            ]
            save_quantiles(N_list=N_list,
                           k=self.params["batch_size"],
                           alpha=self.params["prioritized_replay_alpha"],
                           name=self.params["save_file"])
            self.replay_buffer = RankBasedReplay(
                self.params["mem_size"],
                self.params["prioritized_replay_alpha"],
                name=self.params["save_file"])
            # For sorting rankbased buffer: fall back to 1% of the memory size
            if self.params["sort_rank"] is None:
                self.rank_sort = int(self.params["mem_size"] * 0.01)
            else:
                self.rank_sort = self.params["sort_rank"]
        else:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.params["mem_size"],
                self.params["prioritized_replay_alpha"])

        # Anneal beta over the whole training run unless an explicit horizon is configured
        if self.params["prioritized_replay_beta_iters"] is None:
            prioritized_replay_beta_iters = self.params['num_training']
        else:
            prioritized_replay_beta_iters = self.params[
                'prioritized_replay_beta_iters']

        self.beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=self.params['prioritized_replay_beta0'],
            final_p=1.0)
    else:
        # beta_schedule keeps its None default set above
        self.replay_buffer = ReplayBuffer(self.params["mem_size"])

    # Report the selected configuration
    if self.params["only_dqn"]:
        print("Initialise DQN Agent")
    elif self.params["only_lin"]:
        print("Initialise Linear Approximative Agent")
    else:
        print("Initialise WDQN Agent")

    print(self.params["save_file"])
    if self.params["prioritized"]:
        if self.params["ranked"]:
            print("Using Rank-Based Experience Replay Buffer")
        else:
            print("Using Prioritized Experience Replay Buffer")

    if self.params["model_shift"]:
        print("Using Model Shift")

    print("seed", self.params["seed"])
    print("Starting time:", self.general_record_time)

    # Start Tensorflow session
    tf.reset_default_graph()
    tf.set_random_seed(self.params["seed"])
    self.qnet = WDQN(self.params, "model")  # Q-network
    self.tnet = WDQN(self.params, "target_model")  # Q-target-network
    self.saver = tf.train.Saver()
    # Both networks share a single session
    self.sess = tf.Session()
    self.qnet.set_session(self.sess)
    self.tnet.set_session(self.sess)
    self.sess.run(tf.global_variables_initializer())

    # Q and cost
    self.Q_global = []

    # Stats
    # Global step counter is read back from the Q-network's graph
    self.cnt = self.qnet.sess.run(self.qnet.global_step_dqn)
    self.local_cnt = 0
    self.wins = 0
    self.best_int = self.params["shift_best"]
    self.numeps = 0
    self.model_eps = 0
    self.episodeStartTime = time.time()
    self.last_steps = 0

    # Conversions between action indices and direction names
    self.get_direction = lambda k: ['North', 'South', 'East', 'West', 'Stop'][k]
    self.get_value = {
        'North': 0,
        'South': 1,
        'East': 2,
        'West': 3,
        'Stop': 4
    }

    self.lastWindowAccumRewards = 0.0
    self.Q_accumulative = 0.0
    self.accumTrainRewards = 0.0
    self.sub_dir = str(self.params["save_interval"])