class ActorLearner(Process):

    def __init__(self, args):
        super(ActorLearner, self).__init__()

        self.summ_base_dir = args.summ_base_dir

        self.local_step = 0
        self.global_step = args.global_step
        self.local_episode = 0
        self.last_saving_step = 0

        self.saver = None
        self.actor_id = args.actor_id
        self.alg_type = args.alg_type
        self.use_monitor = args.use_monitor
        self.max_local_steps = args.max_local_steps
        self.optimizer_type = args.opt_type
        self.optimizer_mode = args.opt_mode
        self.num_actions = args.num_actions
        self.initial_lr = args.initial_lr
        self.lr_annealing_steps = args.lr_annealing_steps
        self.num_actor_learners = args.num_actor_learners
        self.is_train = args.is_train
        self.input_shape = args.input_shape
        self.reward_clip_val = args.reward_clip_val
        self.q_update_interval = args.q_update_interval
        self.restore_checkpoint = args.restore_checkpoint
        self.random_seed = args.random_seed

        # Shared mem vars
        self.learning_vars = args.learning_vars
        if self.optimizer_mode == 'local':
            if self.optimizer_type == 'rmsprop':
                self.opt_st = np.ones(self.learning_vars.size, dtype=ctypes.c_float)
            else:
                self.opt_st = np.zeros(self.learning_vars.size, dtype=ctypes.c_float)
        elif self.optimizer_mode == 'shared':
            self.opt_st = args.opt_state

        # rmsprop/momentum
        self.alpha = args.momentum
        # adam
        self.b1 = args.b1
        self.b2 = args.b2
        self.e = args.e

        if args.env == 'GYM':
            from environments.atari_environment import AtariEnvironment
            self.emulator = AtariEnvironment(
                args.game,
                self.random_seed,
                args.visualize,
                use_rgb=args.use_rgb,
                frame_skip=args.frame_skip,
                agent_history_length=args.history_length,
                max_episode_steps=args.max_episode_steps,
                single_life_episodes=args.single_life_episodes,
            )
        elif args.env == 'ALE':
            from environments.emulator import Emulator
            self.emulator = Emulator(
                args.rom_path,
                args.game,
                args.visualize,
                self.actor_id,
                self.random_seed,
                args.single_life_episodes)
        else:
            raise Exception('Invalid environment `{}`'.format(args.env))

        self.grads_update_steps = args.grads_update_steps
        self.max_global_steps = args.max_global_steps
        self.gamma = args.gamma
        self.rescale_rewards = args.rescale_rewards
        self.max_achieved_reward = -float('inf')
        if self.rescale_rewards:
            self.thread_max_reward = 1.0

        # Barrier to synchronize all actors after initialization is done
        self.barrier = args.barrier

        self.game = args.game

        # Initialize TensorBoard summaries
        self.summary_ph, self.update_ops, self.summary_ops = self.setup_summaries()
        self.summary_op = tf.summary.merge_all()

    def compute_targets(self, rewards, R):
        size = len(rewards)
        y_batch = list()
        for i in reversed(range(size)):
            R = rewards[i] + self.gamma * R
            y_batch.append(R)

        y_batch.reverse()
        return y_batch

    def reset_hidden_state(self):
        """
        Override in subclass if needed
        """
        pass

    def is_master(self):
        return self.actor_id == 0

    def test(self, num_episodes=100):
        """
        Run test monitor for `num_episodes`
        """
        rewards = list()
        for episode in range(num_episodes):
            s = self.emulator.get_initial_state()
            self.reset_hidden_state()
            total_episode_reward = 0
            episode_over = False

            while not episode_over:
                a = self.choose_next_action(s)[0]
                s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward
            else:
                rewards.append(total_episode_reward)
                logger.info("EPISODE {0} -- REWARD: {1}, RUNNING AVG: {2:.1f}±{3:.1f}, BEST: {4}".format(
                    episode,
                    total_episode_reward,
                    np.array(rewards).mean(),
                    2 * np.array(rewards).std(),
                    max(rewards),
                ))

    def synchronize_workers(self):
        if self.is_master():
            # Initialize network parameters
            g_step = checkpoint_utils.restore_vars(self.saver, self.session, self.game,
                                                   self.alg_type, self.max_local_steps,
                                                   self.restore_checkpoint)
            self.global_step.val.value = g_step
            self.last_saving_step = g_step
            logger.debug("T{}: Initializing shared memory...".format(self.actor_id))
            self.update_shared_memory()

        # Wait until actor 0 finishes initializing shared memory
        self.barrier.wait()

        if not self.is_master():
            logger.debug("T{}: Syncing with shared memory...".format(self.actor_id))
            self.sync_net_with_shared_memory(self.local_network, self.learning_vars)
            if hasattr(self, 'target_network'):
                self.sync_net_with_shared_memory(self.target_network, self.learning_vars)
            elif hasattr(self, 'batch_network'):
                self.sync_net_with_shared_memory(self.batch_network, self.learning_vars)

        # Ensure we don't add any more nodes to the graph
        self.session.graph.finalize()
        self.start_time = time.time()

    def get_gpu_options(self):
        return tf.GPUOptions(allow_growth=True)

    @contextmanager
    def monitored_environment(self):
        if self.use_monitor:
            self.log_dir = tempfile.mkdtemp()
            self.emulator.env = gym.wrappers.Monitor(self.emulator.env, self.log_dir)

        yield
        self.emulator.env.close()

    def run(self):
        # Set random seeds so we can reproduce runs
        np.random.seed(self.random_seed)
        tf.set_random_seed(self.random_seed)

        num_cpus = multiprocessing.cpu_count()
        self.supervisor = tf.train.Supervisor(
            init_op=tf.global_variables_initializer(),
            local_init_op=tf.global_variables_initializer(),
            logdir=self.summ_base_dir,
            saver=self.saver,
            summary_op=None)

        session_context = self.supervisor.managed_session(config=tf.ConfigProto(
            intra_op_parallelism_threads=num_cpus,
            inter_op_parallelism_threads=num_cpus,
            gpu_options=self.get_gpu_options(),
            allow_soft_placement=True))

        with self.monitored_environment(), session_context as self.session:
            self.synchronize_workers()

            if self.is_train:
                self.train()
            else:
                self.test()

    def save_vars(self, force=False):
        if (self.is_master() and self.global_step.value() - self.last_saving_step >= CHECKPOINT_INTERVAL) or force:
            self.last_saving_step = self.global_step.value()
            checkpoint_utils.save_vars(self.saver, self.session, self.game, self.alg_type,
                                       self.max_local_steps, self.last_saving_step)

    def update_shared_memory(self):
        # Initialize shared memory with tensorflow var values
        params = self.session.run(self.local_network.params)

        # Merge all param matrices into a single 1-D array
        params = np.hstack([p.reshape(-1) for p in params])
        np.frombuffer(self.learning_vars.vars, ctypes.c_float)[:] = params
        # if hasattr(self, 'target_vars'):
        #     target_params = self.session.run(self.target_network.params)
        #     np.frombuffer(self.target_vars.vars, ctypes.c_float)[:] = params

    @only_on_train(return_val=0.0)
    def decay_lr(self):
        if self.global_step.value() <= self.lr_annealing_steps:
            return self.initial_lr - (self.global_step.value() * self.initial_lr / self.lr_annealing_steps)
        else:
            return 0.0

    def apply_gradients_to_shared_memory_vars(self, grads):
        self._apply_gradients_to_shared_memory_vars(grads, self.learning_vars)

    @only_on_train()
    def _apply_gradients_to_shared_memory_vars(self, grads, shared_vars):
        opt_st = self.opt_st
        self.flat_grads = np.empty(shared_vars.size, dtype=ctypes.c_float)

        # Flatten grads
        offset = 0
        for g in grads:
            self.flat_grads[offset:offset + g.size] = g.reshape(-1)
            offset += g.size
        g = self.flat_grads

        shared_vars.step.value += 1
        T = shared_vars.step.value

        if self.optimizer_type == "adam" and self.optimizer_mode == "shared":
            p = np.frombuffer(shared_vars.vars, ctypes.c_float)
            p_size = shared_vars.size
            m = np.frombuffer(opt_st.ms, ctypes.c_float)
            v = np.frombuffer(opt_st.vs, ctypes.c_float)

            opt_st.lr.value = 1.0 * opt_st.lr.value * (1 - self.b2**T)**0.5 / (1 - self.b1**T)

            apply_grads_adam(m, v, g, p, p_size, opt_st.lr.value, self.b1, self.b2, self.e)

        elif self.optimizer_type == "adamax" and self.optimizer_mode == "shared":
            beta_1 = .9
            beta_2 = .999
            lr = opt_st.lr.value
            p = np.frombuffer(shared_vars.vars, ctypes.c_float)
            p_size = shared_vars.size
            m = np.frombuffer(opt_st.ms, ctypes.c_float)
            u = np.frombuffer(opt_st.vs, ctypes.c_float)

            apply_grads_adamax(m, u, g, p, p_size, lr, beta_1, beta_2, T)

        else:  # local or shared rmsprop/momentum
            lr = self.decay_lr()
            if self.optimizer_mode == "local":
                m = opt_st
            else:  # shared
                m = np.frombuffer(opt_st.vars, ctypes.c_float)

            p = np.frombuffer(shared_vars.vars, ctypes.c_float)
            p_size = shared_vars.size
            _type = 0 if self.optimizer_type == "momentum" else 1

            apply_grads_mom_rmsprop(m, g, p, p_size, _type, lr, self.alpha, self.e)

    def rescale_reward(self, reward):
        if self.rescale_rewards:
            # Rescale immediate reward by max reward encountered thus far
            if np.abs(reward) > self.thread_max_reward:
                self.thread_max_reward = np.abs(reward)
            return reward / self.thread_max_reward
        else:
            # Clip immediate reward
            return np.sign(reward) * np.minimum(self.reward_clip_val, np.abs(reward))

    def assign_vars(self, dest_net, params):
        feed_dict = {}
        offset = 0
        for i, var in enumerate(dest_net.params):
            shape = var.get_shape().as_list()
            size = np.prod(shape)
            if type(params) == list:
                feed_dict[dest_net.params_ph[i]] = params[i]
            else:
                feed_dict[dest_net.params_ph[i]] = \
                    params[offset:offset+size].reshape(shape)
                offset += size

        self.session.run(dest_net.sync_with_shared_memory, feed_dict=feed_dict)

    def sync_net_with_shared_memory(self, dest_net, shared_mem_vars):
        feed_dict = {}
        offset = 0
        params = np.frombuffer(shared_mem_vars.vars, ctypes.c_float)
        for i in range(len(dest_net.params)):
            shape = shared_mem_vars.var_shapes[i]
            size = np.prod(shape)
            feed_dict[dest_net.params_ph[i]] = \
                params[offset:offset+size].reshape(shape)
            offset += size

        self.session.run(dest_net.sync_with_shared_memory, feed_dict=feed_dict)

    def _get_summary_vars(self):
        episode_reward = tf.Variable(0., name='episode_reward')
        s1 = tf.summary.scalar('Episode_Reward_{}'.format(self.actor_id), episode_reward)

        mean_value = tf.Variable(0., name='mean_value')
        s2 = tf.summary.scalar('Mean_Value_{}'.format(self.actor_id), mean_value)

        mean_entropy = tf.Variable(0., name='mean_entropy')
        s3 = tf.summary.scalar('Mean_Entropy_{}'.format(self.actor_id), mean_entropy)

        return [episode_reward, mean_value, mean_entropy]

    def setup_summaries(self):
        summary_vars = self._get_summary_vars()

        summary_placeholders = [tf.placeholder(tf.float32) for _ in range(len(summary_vars))]
        update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))]
        with tf.control_dependencies(update_ops):
            summary_ops = tf.summary.merge_all()

        return summary_placeholders, update_ops, summary_ops

    @only_on_train()
    def log_summary(self, *args):
        if self.is_master():
            feed_dict = {ph: val for ph, val in zip(self.summary_ph, args)}
            summaries = self.session.run(self.update_ops + [self.summary_op], feed_dict=feed_dict)[-1]
            self.supervisor.summary_computed(self.session, summaries, global_step=self.global_step.value())

    def cleanup(self):
        self.save_vars(True)
        self.session.close()
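# Illustrative sketch (not part of the original class): a standalone version of the
# discounted n-step return computation performed by ActorLearner.compute_targets above.
# The helper name `n_step_returns` and the numbers in the quick check are hypothetical;
# they only make the recursion R_t = r_t + gamma * R_{t+1} easy to verify by hand.
def n_step_returns(rewards, R, gamma=0.99):
    """Return discounted n-step targets (oldest step first), bootstrapping from R."""
    targets = []
    for r in reversed(rewards):
        R = r + gamma * R
        targets.append(R)
    targets.reverse()
    return targets


# Quick check: with gamma=0.5, rewards [1, 0, 1] and bootstrap value R=2,
# the targets are [1 + 0.5*1.0, 0 + 0.5*2.0, 1 + 0.5*2] = [1.5, 1.0, 2.0].
assert n_step_returns([1, 0, 1], 2, gamma=0.5) == [1.5, 1.0, 2.0]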
# Alternative __init__ variant: adds Visdom visualization, a per-agent CSV log file,
# and separate lower/upper shared learning vars for the "AE" algorithm type.
def __init__(self, args):
    super(ActorLearner, self).__init__()

    self.summ_base_dir = args.summ_base_dir

    self.local_step = 0
    self.global_step = args.global_step
    self.local_episode = 0
    self.last_saving_step = 0
    self.filename = str(args.game) + "_" + str(args.alg_type) + "_" + str(args.actor_id) + "_minimize_local"

    self.saver = None
    self.actor_id = args.actor_id
    self.visdom = args.visdom  # launch with: python -m visdom.server
    self.vis = Visualizer(self.actor_id, self.visdom, args.num_actions)  # default Visdom port is 8097: http://localhost:8097
    self.alg_type = args.alg_type
    #print("self.alg.type is: {}".format(self.alg_type))
    self.use_monitor = args.use_monitor
    self.max_local_steps = args.max_local_steps
    self.optimizer_type = args.opt_type
    self.optimizer_mode = args.opt_mode
    self.num_actions = args.num_actions
    self.initial_lr = args.initial_lr
    self.lr_annealing_steps = args.lr_annealing_steps
    self.num_actor_learners = args.num_actor_learners
    self.is_train = args.is_train
    self.input_shape = args.input_shape
    self.reward_clip_val = args.reward_clip_val
    self.q_update_interval = args.q_update_interval
    self.restore_checkpoint = args.restore_checkpoint
    self.random_seed = args.random_seed

    # Shared mem vars
    if self.alg_type != "AE":
        self.learning_vars = args.learning_vars
    else:
        self.learning_vars_lower = args.learning_vars_lower
        self.learning_vars_upper = args.learning_vars_upper

    if self.optimizer_mode == 'local':
        if self.alg_type != "AE":
            if self.optimizer_type == 'rmsprop':
                self.opt_st = np.ones(self.learning_vars.size, dtype=ctypes.c_float)
            else:
                self.opt_st = np.zeros(self.learning_vars.size, dtype=ctypes.c_float)
        else:
            if self.optimizer_type == 'rmsprop':
                self.opt_st = np.ones(self.learning_vars_lower.size, dtype=ctypes.c_float)
            else:
                self.opt_st = np.zeros(self.learning_vars_lower.size, dtype=ctypes.c_float)
    elif self.optimizer_mode == 'shared':
        self.opt_st = args.opt_state_lower

    # rmsprop/momentum
    self.alpha = args.momentum
    # adam
    self.b1 = args.b1
    self.b2 = args.b2
    self.e = args.e

    if args.env == 'GYM':
        from environments.atari_environment import AtariEnvironment
        self.emulator = AtariEnvironment(
            args.game,
            self.random_seed,
            args.visualize,
            use_rgb=args.use_rgb,
            frame_skip=args.frame_skip,
            agent_history_length=args.history_length,
            max_episode_steps=args.max_episode_steps,
            single_life_episodes=args.single_life_episodes,
        )
    elif args.env == 'ALE':
        from environments.emulator import Emulator
        self.emulator = Emulator(
            args.rom_path,
            args.game,
            args.visualize,
            self.actor_id,
            self.random_seed,
            args.single_life_episodes)
    else:
        raise Exception('Invalid environment `{}`'.format(args.env))

    self.grads_update_steps = args.grads_update_steps
    self.max_global_steps = args.max_global_steps
    self.gamma = args.gamma
    self.rescale_rewards = args.rescale_rewards
    self.max_achieved_reward = -float('inf')
    if self.rescale_rewards:
        self.thread_max_reward = 1.0

    # Barrier to synchronize all actors after initialization is done
    self.barrier = args.barrier

    self.game = args.game

    # Initialize TensorBoard summaries
    self.summary_ph, self.update_ops, self.summary_ops = self.setup_summaries()
    self.summary_op = tf.summary.merge_all()

    # Open a per-agent log file and truncate any existing contents
    #with open(str(self.filename), 'w') as file_name:
    file_name = open(str(self.filename), 'w')
    file_name.seek(0)
    file_name.truncate()
    self.opened_log_file = file_name
    self.wr = csv.writer(file_name, quoting=csv.QUOTE_ALL)
# Earlier variant of ActorLearner: plain tf.Session instead of tf.train.Supervisor,
# the old gym monitor API, and Python 2 constructs (xrange).
class ActorLearner(Process):

    def __init__(self, args):
        super(ActorLearner, self).__init__()

        self.summ_base_dir = args.summ_base_dir

        self.local_step = 0
        self.global_step = args.global_step
        self.local_episode = 0
        self.last_saving_step = 0

        self.actor_id = args.actor_id
        self.alg_type = args.alg_type
        self.max_local_steps = args.max_local_steps
        self.optimizer_type = args.opt_type
        self.optimizer_mode = args.opt_mode
        self.num_actions = args.num_actions
        self.initial_lr = args.initial_lr
        self.lr_annealing_steps = args.lr_annealing_steps
        self.num_actor_learners = args.num_actor_learners
        self.is_train = args.is_train
        self.input_shape = args.input_shape
        self.reward_clip_val = args.reward_clip_val
        self.restore_checkpoint = args.restore_checkpoint

        # Shared mem vars
        self.learning_vars = args.learning_vars
        size = self.learning_vars.size
        self.flat_grads = np.empty(size, dtype=ctypes.c_float)

        if self.optimizer_mode == 'local':
            if self.optimizer_type == 'rmsprop':
                self.opt_st = np.ones(size, dtype=ctypes.c_float)
            else:
                self.opt_st = np.zeros(size, dtype=ctypes.c_float)
        elif self.optimizer_mode == 'shared':
            self.opt_st = args.opt_state

        # rmsprop/momentum
        self.alpha = args.momentum
        # adam
        self.b1 = args.b1
        self.b2 = args.b2
        self.e = args.e

        if args.env == 'GYM':
            from environments.atari_environment import AtariEnvironment
            self.emulator = AtariEnvironment(
                args.game,
                args.visualize,
                frame_skip=args.frame_skip,
                single_life_episodes=args.single_life_episodes,
            )
        elif args.env == 'ALE':
            from environments.emulator import Emulator
            self.emulator = Emulator(args.rom_path, args.game, args.visualize,
                                     self.actor_id, args.random_seed,
                                     args.single_life_episodes)
        else:
            raise Exception('Invalid environment `{}`'.format(args.env))

        self.grads_update_steps = args.grads_update_steps
        self.max_global_steps = args.max_global_steps
        self.gamma = args.gamma
        self.rescale_rewards = args.rescale_rewards
        self.max_achieved_reward = -1000000
        if self.rescale_rewards:
            self.thread_max_reward = 1.0

        # Barrier to synchronize all actors after initialization is done
        self.barrier = args.barrier

        self.summary_ph, self.update_ops, self.summary_ops = self.setup_summaries()
        self.game = args.game

    def reset_hidden_state(self):
        '''
        Override in subclass if needed
        '''
        pass

    def is_master(self):
        return self.actor_id == 0

    def test(self, num_episodes=100):
        '''
        Run test monitor for `num_episodes`
        '''
        log_dir = tempfile.mkdtemp()
        self.emulator.env.monitor.start(log_dir)
        self.sync_net_with_shared_memory(self.local_network, self.learning_vars)

        rewards = list()
        logger.info('writing monitor log to {}'.format(log_dir))
        for episode in range(num_episodes):
            s = self.emulator.get_initial_state()
            self.reset_hidden_state()
            total_episode_reward = 0
            episode_over = False

            while not episode_over:
                a = self.choose_next_action(s)[0]
                s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward
            else:
                rewards.append(total_episode_reward)
                logger.info("EPISODE {0} -- REWARD: {1}, RUNNING AVG: {2:.0f}±{3:.0f}, BEST: {4}".format(
                    episode,
                    total_episode_reward,
                    np.array(rewards).mean(),
                    2 * np.array(rewards).std(),
                    max(rewards),
                ))

        self.emulator.env.monitor.close()

    def run(self):
        gpu_options = tf.GPUOptions(allow_growth=True)
        self.session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

        if self.is_master():
            # Initialize TensorBoard summaries
            self.summary_op = tf.summary.merge_all()
            self.summary_writer = tf.summary.FileWriter(
                "{}/{}".format(self.summ_base_dir, self.actor_id), self.session.graph)

            # Initialize network parameters
            g_step = checkpoint_utils.restore_vars(self.saver, self.session, self.game,
                                                   self.alg_type, self.max_local_steps,
                                                   self.restore_checkpoint)
            self.global_step.val.value = g_step
            self.last_saving_step = g_step
            logger.debug("T{}: Initializing shared memory...".format(self.actor_id))
            self.update_shared_memory()

        # Wait until actor 0 finishes initializing shared memory
        self.barrier.wait()

        # Ensure we don't add any more nodes to the graph
        self.session.graph.finalize()

        if not self.is_master():
            logger.debug("T{}: Syncing with shared memory...".format(self.actor_id))
            self.sync_net_with_shared_memory(self.local_network, self.learning_vars)
            if hasattr(self, 'target_vars'):
                self.sync_net_with_shared_memory(self.target_network, self.target_vars)

        # Wait until all actors are ready to start
        self.barrier.wait()

        # Introduce a different start delay for each actor so that they do not run
        # in lockstep; this helps avoid concurrent updates of the shared parameters.
        time.sleep(0.1877 * self.actor_id)
        self.start_time = time.time()

    def save_vars(self):
        if self.is_master() and self.global_step.value() - self.last_saving_step >= CHECKPOINT_INTERVAL:
            self.last_saving_step = self.global_step.value()
            checkpoint_utils.save_vars(self.saver, self.session, self.game, self.alg_type,
                                       self.max_local_steps, self.last_saving_step)

    def update_shared_memory(self):
        # Initialize shared memory with tensorflow var values
        params = self.session.run(self.local_network.params)

        # Merge all param matrices into a single 1-D array
        params = np.hstack([p.reshape(-1) for p in params])
        np.frombuffer(self.learning_vars.vars, ctypes.c_float)[:] = params
        if hasattr(self, 'target_vars'):
            np.frombuffer(self.target_vars.vars, ctypes.c_float)[:] = params
        #memoryview(self.learning_vars.vars)[:] = params
        #memoryview(self.target_vars.vars)[:] = memoryview(self.learning_vars.vars)

    @checkpoint_utils.only_on_train(return_val=0.0)
    def decay_lr(self):
        if self.global_step.value() <= self.lr_annealing_steps:
            return self.initial_lr - (self.global_step.value() * self.initial_lr / self.lr_annealing_steps)
        else:
            return 0.0

    def apply_gradients_to_shared_memory_vars(self, grads):
        self._apply_gradients_to_shared_memory_vars(grads, self.opt_st)

    @checkpoint_utils.only_on_train()
    def _apply_gradients_to_shared_memory_vars(self, grads, opt_st):
        # Flatten grads
        offset = 0
        for g in grads:
            self.flat_grads[offset:offset + g.size] = g.reshape(-1)
            offset += g.size
        g = self.flat_grads

        self.learning_vars.step.value += 1
        T = self.learning_vars.step.value

        if self.optimizer_type == "adam" and self.optimizer_mode == "shared":
            p = np.frombuffer(self.learning_vars.vars, ctypes.c_float)
            p_size = self.learning_vars.size
            m = np.frombuffer(opt_st.ms, ctypes.c_float)
            v = np.frombuffer(opt_st.vs, ctypes.c_float)

            opt_st.lr.value = 1.0 * opt_st.lr.value * (1 - self.b2**T)**0.5 / (1 - self.b1**T)

            apply_grads_adam(m, v, g, p, p_size, opt_st.lr.value, self.b1, self.b2, self.e)

        elif self.optimizer_type == "adamax" and self.optimizer_mode == "shared":
            beta_1 = .9
            beta_2 = .999
            lr = opt_st.lr.value
            p = np.frombuffer(self.learning_vars.vars, ctypes.c_float)
            p_size = self.learning_vars.size
            m = np.frombuffer(opt_st.ms, ctypes.c_float)
            u = np.frombuffer(opt_st.vs, ctypes.c_float)

            apply_grads_adamax(m, u, g, p, p_size, lr, beta_1, beta_2, T)

        else:  # local or shared rmsprop/momentum
            lr = self.decay_lr()
            if self.optimizer_mode == "local":
                m = opt_st
            else:  # shared
                m = np.frombuffer(opt_st.vars, ctypes.c_float)

            p = np.frombuffer(self.learning_vars.vars, ctypes.c_float)
            p_size = self.learning_vars.size
            _type = 0 if self.optimizer_type == "momentum" else 1

            #print "BEFORE", "RMSPROP m", m[0], "GRAD", g[0], self.flat_grads[0], self.flat_grads2[0]
            apply_grads_mom_rmsprop(m, g, p, p_size, _type, lr, self.alpha, self.e)
            #print "AFTER", "RMSPROP m", m[0], "GRAD", g[0], self.flat_grads[0], self.flat_grads2[0]

    def rescale_reward(self, reward):
        if self.rescale_rewards:
            # Rescale immediate reward by max reward encountered thus far
            if np.abs(reward) > self.thread_max_reward:
                self.thread_max_reward = np.abs(reward)
            return reward / self.thread_max_reward
        else:
            # Clip immediate reward
            return np.sign(reward) * np.minimum(self.reward_clip_val, np.abs(reward))

    def assign_vars(self, dest_net, params):
        feed_dict = {}
        offset = 0
        for i, var in enumerate(dest_net.params):
            shape = var.get_shape().as_list()
            size = np.prod(shape)
            feed_dict[dest_net.params_ph[i]] = \
                params[offset:offset+size].reshape(shape)
            offset += size

        self.session.run(dest_net.sync_with_shared_memory, feed_dict=feed_dict)

    def sync_net_with_shared_memory(self, dest_net, shared_mem_vars):
        feed_dict = {}
        offset = 0
        params = np.frombuffer(shared_mem_vars.vars, ctypes.c_float)
        for i in xrange(len(dest_net.params)):
            shape = shared_mem_vars.var_shapes[i]
            size = np.prod(shape)
            feed_dict[dest_net.params_ph[i]] = \
                params[offset:offset+size].reshape(shape)
            offset += size

        self.session.run(dest_net.sync_with_shared_memory, feed_dict=feed_dict)

    def _get_summary_vars(self):
        episode_reward = tf.Variable(0., name='episode_reward')
        s1 = tf.summary.scalar('Episode_Reward_{}'.format(self.actor_id), episode_reward)

        mean_entropy = tf.Variable(0., name='mean_entropy')
        s2 = tf.summary.scalar('Mean_Entropy_{}'.format(self.actor_id), mean_entropy)

        return [episode_reward, mean_entropy]

    def setup_summaries(self):
        summary_vars = self._get_summary_vars()

        summary_placeholders = [tf.placeholder('float') for _ in range(len(summary_vars))]
        update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))]
        with tf.control_dependencies(update_ops):
            summary_ops = tf.summary.merge_all()

        return summary_placeholders, update_ops, summary_ops
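# Illustrative sketch (not part of the original class): how the shared-memory parameter
# layout used above works. update_shared_memory flattens every parameter tensor into one
# contiguous float32 vector, and sync_net_with_shared_memory slices that vector back into
# the original shapes. The shapes below are hypothetical.
import numpy as np

var_shapes = [(8, 4), (4,)]  # e.g. one weight matrix and one bias vector
params = [np.random.randn(*shape).astype(np.float32) for shape in var_shapes]

# Write side: flatten all tensors into a single 1-D buffer (as in update_shared_memory).
flat = np.hstack([p.reshape(-1) for p in params])

# Read side: slice and reshape back (as in sync_net_with_shared_memory / assign_vars).
restored, offset = [], 0
for shape in var_shapes:
    size = int(np.prod(shape))
    restored.append(flat[offset:offset + size].reshape(shape))
    offset += size

assert all(np.array_equal(a, b) for a, b in zip(params, restored))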
# Another __init__ variant, matching the earlier class above but reading the
# rmsprop/momentum coefficient from args.alpha.
def __init__(self, args):
    super(ActorLearner, self).__init__()

    self.summ_base_dir = args.summ_base_dir

    self.local_step = 0
    self.global_step = args.global_step
    self.local_episode = 0
    self.last_saving_step = 0

    self.actor_id = args.actor_id
    self.alg_type = args.alg_type
    self.max_local_steps = args.max_local_steps
    self.optimizer_type = args.opt_type
    self.optimizer_mode = args.opt_mode
    self.num_actions = args.num_actions
    self.initial_lr = args.initial_lr
    self.lr_annealing_steps = args.lr_annealing_steps
    self.num_actor_learners = args.num_actor_learners
    self.is_train = args.is_train
    self.input_shape = args.input_shape
    self.reward_clip_val = args.reward_clip_val
    self.restore_checkpoint = args.restore_checkpoint

    # Shared mem vars
    self.learning_vars = args.learning_vars
    size = self.learning_vars.size
    self.flat_grads = np.empty(size, dtype=ctypes.c_float)

    if self.optimizer_mode == "local":
        if self.optimizer_type == "rmsprop":
            self.opt_st = np.ones(size, dtype=ctypes.c_float)
        else:
            self.opt_st = np.zeros(size, dtype=ctypes.c_float)
    elif self.optimizer_mode == "shared":
        self.opt_st = args.opt_state

    # rmsprop/momentum
    self.alpha = args.alpha
    # adam
    self.b1 = args.b1
    self.b2 = args.b2
    self.e = args.e

    if args.env == 'GYM':
        from environments.atari_environment import AtariEnvironment
        self.emulator = AtariEnvironment(
            args.game,
            args.visualize,
            frame_skip=args.frame_skip,
            single_life_episodes=args.single_life_episodes,
        )
    elif args.env == 'ALE':
        from environments.emulator import Emulator
        self.emulator = Emulator(
            args.rom_path,
            args.game,
            args.visualize,
            self.actor_id,
            args.random_seed,
            args.single_life_episodes)
    else:
        raise Exception('Invalid environment `{}`'.format(args.env))

    self.grads_update_steps = args.grads_update_steps
    self.max_global_steps = args.max_global_steps
    self.gamma = args.gamma
    self.rescale_rewards = args.rescale_rewards
    self.max_achieved_reward = -1000000
    if self.rescale_rewards:
        self.thread_max_reward = 1.0

    # Barrier to synchronize all actors after initialization is done
    self.barrier = args.barrier

    self.summary_ph, self.update_ops, self.summary_ops = self.setup_summaries()
    self.game = args.game