# Requires the surrounding project's GameACNetwork, AccumTrainer, RMSPropApplier
# and GameState classes, plus the constants ACTION_SIZE, ENTROPY_BETA,
# RMSP_EPSILON, LOCAL_T_MAX and GAMMA.
import random

import numpy as np
import tensorflow as tf


class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 max_global_time_step):

        self.thread_index = thread_index
        self.learning_rate_input = tf.placeholder("float")
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE)
        self.local_network.prepare_loss(ENTROPY_BETA)

        # policy
        self.policy_trainer = AccumTrainer()
        self.policy_trainer.prepare_minimize(
            self.local_network.policy_loss,
            self.local_network.get_policy_vars())
        self.policy_accum_gradients = self.policy_trainer.accumulate_gradients(
        )
        self.policy_reset_gradients = self.policy_trainer.reset_gradients()

        self.policy_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.policy_apply_gradients = self.policy_applier.apply_gradients(
            global_network.get_policy_vars(),
            self.policy_trainer.get_accum_grad_list())

        # value
        self.value_trainer = AccumTrainer()
        self.value_trainer.prepare_minimize(
            self.local_network.value_loss, self.local_network.get_value_vars())
        self.value_accum_gradients = self.value_trainer.accumulate_gradients()
        self.value_reset_gradients = self.value_trainer.reset_gradients()

        self.value_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.value_apply_gradients = self.value_applier.apply_gradients(
            global_network.get_value_vars(),
            self.value_trainer.get_accum_grad_list())

        # op that copies the shared (global) network weights into the local network
        self.sync = self.local_network.sync_from(global_network)

        # per-thread game emulator, seeded differently for each thread
        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # thread0 will record score for TensorBoard
        if self.thread_index == 0:
            self.score_input = tf.placeholder(tf.int32)
            tf.scalar_summary("score", self.score_input)

    def _anneal_learning_rate(self, global_time_step):
        # linearly anneal the learning rate from initial_learning_rate to zero
        # over max_global_time_step global steps
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        """Sample an action index from the policy output `pi_values` via
        cumulative-sum (roulette-wheel) sampling."""
        cumulative = []
        total = 0.0
        for rate in pi_values:
            total += rate
            cumulative.append(total)

        r = random.random() * total
        for i in range(len(cumulative)):
            if cumulative[i] >= r:
                return i
        # fail safe
        return len(cumulative) - 1
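        # Note: assuming pi_values sums to 1, this is equivalent to
        # np.random.choice(len(pi_values), p=pi_values); the explicit running
        # sum also tolerates small numerical drift in the softmax output.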

    def _record_score(self, sess, summary_writer, summary_op, score, global_t):
        summary_str = sess.run(summary_op, feed_dict={self.score_input: score})
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset the accumulated gradients
        sess.run(self.policy_reset_gradients)
        sess.run(self.value_reset_gradients)

        # copy weights from the shared network into the local network
        sess.run(self.sync)

        start_local_t = self.local_t

        # roll out up to LOCAL_T_MAX (5) steps
        for i in range(LOCAL_T_MAX):
            pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            value_ = self.local_network.run_value(sess, self.game_state.s_t)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print "pi=", pi_
                print " V=", value_

            # advance the game by one step
            self.game_state.process(action)

            # results of that step
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            rewards.append(reward)

            self.local_t += 1

            self.game_state.update()

            if terminal:
                terminal_end = True
                print "score=", self.episode_reward

                if self.thread_index == 0:
                    self._record_score(sess, summary_writer, summary_op,
                                       self.episode_reward, global_t)

                self.episode_reward = 0
                break

        R = 0.0
        if not terminal_end:
            # bootstrap the return from the value of the last observed state
            R = self.local_network.run_value(sess, self.game_state.s_t)

        # walk the rollout backwards to compute n-step returns
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # compute the gradients for each step and accumulate them
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R  # discounted return, bootstrapped from later steps
            td = R - Vi         # advantage estimate fed to the policy loss
            a = np.zeros([ACTION_SIZE])  # one-hot encoding of the chosen action
            a[ai] = 1

            sess.run(self.policy_accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.a: [a],
                         self.local_network.td: [td]
                     })

            sess.run(self.value_accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.r: [R]
                     })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.policy_apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})
        sess.run(self.value_apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print "TIMESTEP", self.local_t

        # return the number of local steps taken in this call
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
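For orientation, below is a minimal driver sketch of how a pool of A3CTrainingThread workers might be wired together: each worker runs in its own Python thread and repeatedly calls process() until a shared global step counter reaches its budget. PARALLEL_SIZE, MAX_TIME_STEP, INITIAL_LEARNING_RATE and LOG_FILE are illustrative assumptions, and the GameACNetwork and summary calls simply reuse the old-TensorFlow API already used by the class above; this is a sketch, not the original repository's launcher.

import threading
import tensorflow as tf

PARALLEL_SIZE = 8             # assumed number of worker threads
MAX_TIME_STEP = 10 ** 7       # assumed global step budget
INITIAL_LEARNING_RATE = 7e-4  # assumed initial learning rate
LOG_FILE = "/tmp/a3c_log"     # assumed TensorBoard log directory

global_network = GameACNetwork(ACTION_SIZE)  # shared parameters
workers = [A3CTrainingThread(i, global_network, INITIAL_LEARNING_RATE, MAX_TIME_STEP)
           for i in range(PARALLEL_SIZE)]

sess = tf.Session()
sess.run(tf.initialize_all_variables())  # initializer from the same TF era as the snippet
summary_op = tf.merge_all_summaries()    # collects the "score" summary defined by thread 0
summary_writer = tf.train.SummaryWriter(LOG_FILE, sess.graph_def)

global_t = 0
lock = threading.Lock()

def work(worker):
    global global_t
    while True:
        with lock:
            if global_t >= MAX_TIME_STEP:
                return
            t = global_t
        # run one rollout of up to LOCAL_T_MAX steps and advance the global clock
        diff = worker.process(sess, t, summary_writer, summary_op)
        with lock:
            global_t += diff

threads = [threading.Thread(target=work, args=(w,)) for w in workers]
for t in threads:
    t.start()
for t in threads:
    t.join()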
Example #3
# Requires the surrounding project's MPManager, init_gym, Logger, Scaler,
# NNValueFunction, Policy, RMSPropApplier, run_policy, add_value,
# add_disc_sum_rew, add_gae, build_train_set and log_batch_stats, plus the
# constants N_WORKERS, RMSP_ALPHA, RMSP_EPSILON, GRAD_NORM_CLIP and device.
import os
import threading
import time
from datetime import datetime

import tensorflow as tf


def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    '''Train one shared policy and value function with N_WORKERS asynchronous
    workers; each worker collects trajectories, updates its local copies, and
    applies its gradients to the shared networks.'''
    ##################
    #  shared policy #
    ##################

    tic = time.clock()  # time.clock() was removed in Python 3.8; time.time() is the modern replacement

    manager = MPManager()
    manager.start()

    shared_env, shared_obs_dim, shared_act_dim = init_gym(env_name)
    shared_obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    shared_logger = Logger(logname=env_name, now=now + "-Master")
    shared_aigym_path = os.path.join('./video', env_name, now + "-Master")
    #env = wrappers.Monitor(env, aigym_path, force=True)
    shared_scaler = Scaler(shared_obs_dim)

    shared_val_func = NNValueFunction(shared_obs_dim, hid1_mult, -1, None)
    shared_policy = Policy(shared_obs_dim, shared_act_dim, kl_targ, hid1_mult,
                           policy_logvar, -1, None)

    learning_rate_input = tf.placeholder("float")
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=RMSP_ALPHA,
                                  momentum=0.0,
                                  epsilon=RMSP_EPSILON,
                                  clip_norm=GRAD_NORM_CLIP,
                                  device=device)

    # local (per-worker) policy declarations
    env_a = [None] * N_WORKERS
    obs_dim_a = [None] * N_WORKERS
    act_dim_a = [None] * N_WORKERS
    logger_a = [None] * N_WORKERS
    aigym_path_a = [None] * N_WORKERS
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    val_func_a = [None] * N_WORKERS
    policy_a = [None] * N_WORKERS
    scaler_a = [None] * N_WORKERS
    for i in range(N_WORKERS):
        env_a[i], obs_dim_a[i], act_dim_a[i] = init_gym(env_name)
        obs_dim_a[i] += 1  # add 1 to obs dimension for time step feature (see run_episode())
        logger_a[i] = Logger(logname=env_name, now=now + "-" + str(i))
        aigym_path_a[i] = os.path.join('./video', env_name, now + "-" + str(i))
        #env_a[i] = wrappers.Monitor(env, aigym_path, force=True)
        scaler_a[i] = Scaler(obs_dim_a[i])

        val_func_a[i] = NNValueFunction(obs_dim_a[i], hid1_mult, i,
                                        shared_val_func)
        val_func_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_val_func.get_vars(), val_func_a[i].gradients)

        policy_a[i] = Policy(obs_dim_a[i], act_dim_a[i], kl_targ, hid1_mult,
                             policy_logvar, i, shared_policy)
        policy_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_policy.get_vars(), policy_a[i].gradients)

    # init tensorflow
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            allow_soft_placement=True))
    init = tf.global_variables_initializer()

    ## start sess
    sess.run(init)

    ## init shared scalar policy
    run_policy(sess,
               shared_env,
               shared_policy,
               shared_scaler,
               shared_logger,
               episodes=5)

    def single_work(thread_idx):
        """ Per-worker training loop.

        Args (from the enclosing main()):
            env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
            num_episodes: maximum number of episodes to run
            gamma: reward discount factor (float)
            lam: lambda from Generalized Advantage Estimation
            kl_targ: D_KL target for policy update (D_KL(pi_old || pi_new))
            batch_size: number of episodes per policy training batch
            hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
            policy_logvar: natural log of initial policy variance
        """
        env = env_a[thread_idx]
        policy = policy_a[thread_idx]
        #obs_dim = obs_dim_a[thread_idx]
        #act_dim = act_dim_a[thread_idx]
        logger = logger_a[thread_idx]
        aigym_path = aigym_path_a[thread_idx]
        scaler = scaler_a[thread_idx]
        val_func = val_func_a[thread_idx]

        print("=== start thread " + str(policy.get_thread_idx()) + " " +
              policy.get_scope() + " ===")
        print(shared_policy.get_vars())
        print(policy.get_vars())

        # run a few episodes of untrained policy to initialize scaler:
        #run_policy(sess, env, policy, scaler, logger, episodes=5)

        #policy.sync(shared_policy)
        #val_func.sync(shared_val_func)
        episode = 0

        while episode < num_episodes:

            ## copy global var into local
            sess.run(policy.sync)
            sess.run(val_func.sync)

            ## compute new model on local policy
            trajectories = run_policy(sess,
                                      env,
                                      policy,
                                      scaler,
                                      logger,
                                      episodes=batch_size)
            episode += len(trajectories)
            add_value(sess, trajectories,
                      val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories,
                             gamma)  # calculate discounted sums of rewards
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)
            # add various stats to training log:
            log_batch_stats(observes, actions, advantages, disc_sum_rew,
                            logger, episode,
                            time.clock() - tic)

            policy.update(sess, observes, actions, advantages,
                          logger)  # update policy
            val_func.fit(sess, observes, disc_sum_rew,
                         logger)  # update value function

            #cur_learning_rate = self._anneal_learning_rate(global_t)
            feed_dict = {
                policy.old_log_vars_ph: policy.old_log_vars_np,
                policy.old_means_ph: policy.old_means_np,
                policy.obs_ph: observes,
                policy.act_ph: actions,
                policy.advantages_ph: advantages,
                policy.beta_ph: policy.beta,
                policy.lr_ph: policy.lr,
                policy.eta_ph: policy.eta,
                learning_rate_input: policy.lr
            }

            sess.run(policy.apply_gradients, feed_dict)

            shared_policy.update(sess, observes, actions, advantages,
                                 shared_logger)

            feed_dict = {
                val_func.obs_ph: observes,
                val_func.val_ph: disc_sum_rew,
                learning_rate_input: val_func.lr
            }

            sess.run(val_func.apply_gradients, feed_dict)

            shared_val_func.fit(sess, observes, disc_sum_rew, shared_logger)

            shared_logger.log({'_Time': time.clock() - tic})

            logger.write(
                display=True)  # write logger results to file and stdout

        logger.close()

    ## end of single_work

    train_threads = []
    for i in range(N_WORKERS):
        train_threads.append(threading.Thread(target=single_work, args=(i, )))

    [t.start() for t in train_threads]
    [t.join() for t in train_threads]

    saver = tf.train.Saver()  # only used if the commented-out checkpoint save below is re-enabled
    for i in range(N_WORKERS):
        logger_a[i].close()

    #path = os.path.join('log-files', env_name, now+'-Master', 'checkpoint')
    #saver.save(sess, path )

    sess.close()
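For completeness, here is a hedged sketch of a command-line entry point that could drive main() above; the flag names mirror main()'s parameters, while every default value is an illustrative assumption rather than something taken from this example.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Train N_WORKERS asynchronous workers against one shared policy')
    parser.add_argument('env_name', type=str,
                        help="OpenAI Gym environment name, e.g. 'Hopper-v1'")
    parser.add_argument('-n', '--num_episodes', type=int, default=1000)     # assumed default
    parser.add_argument('-g', '--gamma', type=float, default=0.995)         # assumed default
    parser.add_argument('-l', '--lam', type=float, default=0.98)            # assumed default
    parser.add_argument('-k', '--kl_targ', type=float, default=0.003)       # assumed default
    parser.add_argument('-b', '--batch_size', type=int, default=20)         # assumed default
    parser.add_argument('-m', '--hid1_mult', type=int, default=10)          # assumed default
    parser.add_argument('-v', '--policy_logvar', type=float, default=-1.0)  # assumed default
    args = parser.parse_args()
    main(**vars(args))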