# A3C worker thread.  Assumes tensorflow (tf), numpy (np), random, the
# project-local GameACNetwork, GameState, AccumTrainer, RMSPropApplier classes
# and the constants ACTION_SIZE, ENTROPY_BETA, GAMMA, LOCAL_T_MAX, RMSP_EPSILON
# are imported/defined earlier in the file.
class A3CTrainingThread(object):

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 max_global_time_step):
        self.thread_index = thread_index
        self.learning_rate_input = tf.placeholder("float")
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE)
        self.local_network.prepare_loss(ENTROPY_BETA)

        # policy
        self.policy_trainer = AccumTrainer()
        self.policy_trainer.prepare_minimize(
            self.local_network.policy_loss,
            self.local_network.get_policy_vars())
        self.policy_accum_gradients = self.policy_trainer.accumulate_gradients()
        self.policy_reset_gradients = self.policy_trainer.reset_gradients()

        self.policy_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.policy_apply_gradients = self.policy_applier.apply_gradients(
            global_network.get_policy_vars(),
            self.policy_trainer.get_accum_grad_list())

        # value
        self.value_trainer = AccumTrainer()
        self.value_trainer.prepare_minimize(
            self.local_network.value_loss,
            self.local_network.get_value_vars())
        self.value_accum_gradients = self.value_trainer.accumulate_gradients()
        self.value_reset_gradients = self.value_trainer.reset_gradients()

        self.value_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.value_apply_gradients = self.value_applier.apply_gradients(
            global_network.get_value_vars(),
            self.value_trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

        # thread 0 will record the score for TensorBoard
        if self.thread_index == 0:
            self.score_input = tf.placeholder(tf.int32)
            tf.scalar_summary("score", self.score_input)

    def _anneal_learning_rate(self, global_time_step):
        # linearly anneal the learning rate to zero over max_global_time_step
        learning_rate = self.initial_learning_rate * \
            (self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        # sample an action index from the policy distribution via its cumulative sum
        cumulative = []
        total = 0.0
        for rate in pi_values:
            total += rate
            cumulative.append(total)

        r = random.random() * total
        for i in range(len(cumulative)):
            if cumulative[i] >= r:
                return i
        # fail safe
        return len(cumulative) - 1

    def _record_score(self, sess, summary_writer, summary_op, score, global_t):
        summary_str = sess.run(summary_op, feed_dict={self.score_input: score})
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset the accumulated gradients
        sess.run(self.policy_reset_gradients)
        sess.run(self.value_reset_gradients)

        # copy the shared (global) weights into the local network
        sess.run(self.sync)

        start_local_t = self.local_t

        # roll out up to LOCAL_T_MAX (5) steps
        for i in range(LOCAL_T_MAX):
            pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            value_ = self.local_network.run_value(sess, self.game_state.s_t)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print "pi=", pi_
                print " V=", value_

            # advance the game by one step
            self.game_state.process(action)

            # result of that step
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward
            rewards.append(reward)

            self.local_t += 1
            self.game_state.update()

            if terminal:
                terminal_end = True
                print "score=", self.episode_reward

                if self.thread_index == 0:
                    self._record_score(sess, summary_writer, summary_op,
                                       self.episode_reward, global_t)

                self.episode_reward = 0
                break

        # bootstrap the return from the value estimate unless the episode ended
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # compute the per-step gradients and accumulate them
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            sess.run(self.policy_accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.a: [a],
                         self.local_network.td: [td]
                     })

            sess.run(self.value_accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.r: [R]
                     })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # apply the accumulated gradients to the shared (global) network
        sess.run(self.policy_apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})
        sess.run(self.value_apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print "TIMESTEP", self.local_t

        # return the number of local steps advanced
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
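# Usage sketch (not part of the original code): one possible driver that wires a
# shared GameACNetwork to several A3CTrainingThread workers in the lock-free A3C
# style this class expects.  PARALLEL_SIZE, INITIAL_LEARNING_RATE, MAX_TIME_STEP
# and the "./log" path are illustrative assumptions, not names from the code above.
import threading

def run_a3c_sketch():
    global_network = GameACNetwork(ACTION_SIZE)
    training_threads = [
        A3CTrainingThread(i, global_network, INITIAL_LEARNING_RATE, MAX_TIME_STEP)
        for i in range(PARALLEL_SIZE)
    ]

    sess = tf.Session()
    sess.run(tf.initialize_all_variables())  # pre-1.0 TF initializer, matching the API used above

    summary_op = tf.merge_all_summaries()          # picks up the "score" summary from thread 0
    summary_writer = tf.train.SummaryWriter("./log", sess.graph_def)

    global_t = [0]  # shared step counter (a list so the worker closures can mutate it)

    def train_function(thread_idx):
        thread = training_threads[thread_idx]
        while global_t[0] < MAX_TIME_STEP:
            diff = thread.process(sess, global_t[0], summary_writer, summary_op)
            global_t[0] += diff  # lock-free update, as in the original A3C setup

    workers = [threading.Thread(target=train_function, args=(i,))
               for i in range(PARALLEL_SIZE)]
    for t in workers:
        t.start()
    for t in workers:
        t.join()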
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    """Train a shared policy and value function with N_WORKERS parallel worker threads.

    Assumes init_gym, run_policy, add_value, add_disc_sum_rew, add_gae,
    build_train_set, log_batch_stats, Logger, Scaler, Policy, NNValueFunction,
    MPManager, RMSPropApplier and the constants N_WORKERS, RMSP_ALPHA,
    RMSP_EPSILON, GRAD_NORM_CLIP, device are defined or imported elsewhere.
    """
    ##################
    # shared policy  #
    ##################
    tic = time.clock()

    manager = MPManager()
    manager.start()

    shared_env, shared_obs_dim, shared_act_dim = init_gym(env_name)
    shared_obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    shared_logger = Logger(logname=env_name, now=now + "-Master")
    shared_aigym_path = os.path.join('./vedio', env_name, now + "-Master")
    # env = wrappers.Monitor(env, aigym_path, force=True)
    shared_scaler = Scaler(shared_obs_dim)

    shared_val_func = NNValueFunction(shared_obs_dim, hid1_mult, -1, None)
    shared_policy = Policy(shared_obs_dim, shared_act_dim, kl_targ, hid1_mult,
                           policy_logvar, -1, None)

    learning_rate_input = tf.placeholder("float")
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=RMSP_ALPHA,
                                  momentum=0.0,
                                  epsilon=RMSP_EPSILON,
                                  clip_norm=GRAD_NORM_CLIP,
                                  device=device)

    # local (per-worker) policy declarations
    env_a = [None] * N_WORKERS
    obs_dim_a = [None] * N_WORKERS
    act_dim_a = [None] * N_WORKERS
    logger_a = [None] * N_WORKERS
    aigym_path_a = [None] * N_WORKERS
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    val_func_a = [None] * N_WORKERS
    policy_a = [None] * N_WORKERS
    scaler_a = [None] * N_WORKERS

    for i in range(N_WORKERS):
        env_a[i], obs_dim_a[i], act_dim_a[i] = init_gym(env_name)
        obs_dim_a[i] += 1  # add 1 to obs dimension for time step feature (see run_episode())
        logger_a[i] = Logger(logname=env_name, now=now + "-" + str(i))
        aigym_path_a[i] = os.path.join('./vedio', env_name, now + "-" + str(i))
        # env_a[i] = wrappers.Monitor(env, aigym_path, force=True)
        scaler_a[i] = Scaler(obs_dim_a[i])

        val_func_a[i] = NNValueFunction(obs_dim_a[i], hid1_mult, i, shared_val_func)
        val_func_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_val_func.get_vars(), val_func_a[i].gradients)

        policy_a[i] = Policy(obs_dim_a[i], act_dim_a[i], kl_targ, hid1_mult,
                             policy_logvar, i, shared_policy)
        policy_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_policy.get_vars(), policy_a[i].gradients)

    # init tensorflow
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            allow_soft_placement=True))
    init = tf.global_variables_initializer()

    ## start the session
    sess.run(init)

    ## initialize the shared scaler by running a few episodes of the untrained shared policy
    run_policy(sess, shared_env, shared_policy, shared_scaler, shared_logger,
               episodes=5)

    def single_work(thread_idx):
        """Per-worker training loop.

        Uses the arguments of the enclosing main():
            env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
            num_episodes: maximum number of episodes to run
            gamma: reward discount factor (float)
            lam: lambda from Generalized Advantage Estimation
            kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
            batch_size: number of episodes per policy training batch
            hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
            policy_logvar: natural log of initial policy variance
        """
        env = env_a[thread_idx]
        policy = policy_a[thread_idx]
        # obs_dim = obs_dim_a[thread_idx]
        # act_dim = act_dim_a[thread_idx]
        logger = logger_a[thread_idx]
        aigym_path = aigym_path_a[thread_idx]
        scaler = scaler_a[thread_idx]
        val_func = val_func_a[thread_idx]

        print("=== start thread " + str(policy.get_thread_idx()) + " " +
              policy.get_scope() + " ===")
        print(shared_policy.get_vars())
        print(policy.get_vars())

        # run a few episodes of untrained policy to initialize scaler:
        # run_policy(sess, env, policy, scaler, logger, episodes=5)
        # policy.sync(shared_policy)
        # val_func.sync(shared_val_func)

        episode = 0
        while episode < num_episodes:
            ## copy the global variables into the local networks
            sess.run(policy.sync)
            sess.run(val_func.sync)

            ## collect a batch of trajectories with the local policy
            trajectories = run_policy(sess, env, policy, scaler, logger,
                                      episodes=batch_size)
            episode += len(trajectories)
            add_value(sess, trajectories, val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)    # calculate discounted sum of rewards
            add_gae(trajectories, gamma, lam)        # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
            # add various stats to training log:
            log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                            episode, time.clock() - tic)
            policy.update(sess, observes, actions, advantages, logger)  # update local policy
            val_func.fit(sess, observes, disc_sum_rew, logger)  # update local value function

            # apply the locally computed policy gradients to the shared policy
            # cur_learning_rate = self._anneal_learning_rate(global_t)
            feed_dict = {
                policy.old_log_vars_ph: policy.old_log_vars_np,
                policy.old_means_ph: policy.old_means_np,
                policy.obs_ph: observes,
                policy.act_ph: actions,
                policy.advantages_ph: advantages,
                policy.beta_ph: policy.beta,
                policy.lr_ph: policy.lr,
                policy.eta_ph: policy.eta,
                learning_rate_input: policy.lr
            }
            sess.run(policy.apply_gradients, feed_dict)
            shared_policy.update(sess, observes, actions, advantages, shared_logger)

            # apply the locally computed value gradients to the shared value function
            feed_dict = {
                val_func.obs_ph: observes,
                val_func.val_ph: disc_sum_rew,
                learning_rate_input: val_func.lr
            }
            sess.run(val_func.apply_gradients, feed_dict)
            shared_val_func.fit(sess, observes, disc_sum_rew, shared_logger)

            shared_logger.log({'_Time': time.clock() - tic})

            logger.write(display=True)  # write logger results to file and stdout

        logger.close()
    ## end def single_work

    train_threads = []
    for i in range(N_WORKERS):
        train_threads.append(threading.Thread(target=single_work, args=(i,)))

    [t.start() for t in train_threads]
    [t.join() for t in train_threads]

    saver = tf.train.Saver()
    for i in range(N_WORKERS):
        logger_a[i].close()
    # path = os.path.join('log-files', env_name, now + '-Master', 'checkpoint')
    # saver.save(sess, path)
    sess.close()
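# Possible entry point for main() above (not part of the original source).  The
# hyper-parameter values are illustrative PPO/GAE-style defaults, not values
# taken from this code base.
if __name__ == "__main__":
    main(env_name='Hopper-v1',
         num_episodes=1000,
         gamma=0.995,          # reward discount factor
         lam=0.98,             # GAE lambda
         kl_targ=0.003,        # per-update KL divergence target
         batch_size=20,        # episodes per training batch
         hid1_mult=10,         # first hidden layer size = hid1_mult * obs_dim
         policy_logvar=-1.0)   # natural log of initial policy variance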