class A3CTrainingThread(object):
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0


  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate
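  # _anneal_learning_rate decays the rate linearly to zero over
  # max_global_time_step steps: e.g. with initial_learning_rate = 7e-4 and
  # max_global_time_step = 1e8 (illustrative values only), it returns 3.5e-4
  # at the halfway point and clamps to 0.0 once global_time_step passes the end.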

  def choose_action(self, pi_values):
    cumulative = []
    total = 0.0
    for rate in pi_values:
      total += rate
      cumulative.append(total)

    r = random.random() * total
    for i in range(len(cumulative)):
      if cumulative[i] >= r:
        return i
    # fail safe
    return len(cumulative) - 1
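  # Note: the cumulative-sum ("roulette wheel") sampling above is, once
  # pi_values is normalized, equivalent to
  # np.random.choice(len(pi_values), p=pi_values), which the later variants
  # below use directly.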

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    
  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # reset accumulated gradients
    sess.run( self.reset_gradients )

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t

    if USE_LSTM:
      start_lstm_state = self.local_network.lstm_state_out
    
    # t_max times loop
    for i in range(LOCAL_T_MAX):
      pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
      action = self.choose_action(pi_)

      states.append(self.game_state.s_t)
      actions.append(action)
      values.append(value_)

      if (self.thread_index == 0) and (self.local_t % 100) == 0:
        print "pi=", pi_
        print " V=", value_

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward

      # clip reward
      rewards.append( np.clip(reward, -1, 1) )

      self.local_t += 1

      # s_t1 -> s_t
      self.game_state.update()
      
      if terminal:
        terminal_end = True
        print "score=", self.episode_reward

        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
          
        self.episode_reward = 0
        self.game_state.reset()
        if USE_LSTM:
          self.local_network.reset_state()
        break

    R = 0.0
    if not terminal_end:
      R = self.local_network.run_value(sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_td = []
    batch_R = []
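    # The lists were reversed above, so the loop below walks the rollout
    # backwards: R = r_i + GAMMA * R, bootstrapped from the value of the last
    # state when the episode did not terminate; batch_td collects the
    # advantages R - V(s_i) and batch_R the discounted n-step returns.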

    # compute and accumulate gradients
    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      batch_si.append(si)
      batch_a.append(a)
      batch_td.append(td)
      batch_R.append(R)

    if USE_LSTM:
      batch_si.reverse()
      batch_a.reverse()
      batch_td.reverse()
      batch_R.reverse()

      sess.run( self.accum_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R,
                  self.local_network.initial_lstm_state: start_lstm_state,
                  self.local_network.step_size : [len(batch_a)] } )
    else:
      sess.run( self.accum_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R} )
      
    cur_learning_rate = self._anneal_learning_rate(global_t)

    sess.run( self.apply_gradients,
              feed_dict = { self.learning_rate_input: cur_learning_rate } )

    if (self.thread_index == 0) and (self.local_t % 100) == 0:
      print "TIMESTEP", self.local_t

    # return advanced local step size
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
class A3CTrainingThread(object):
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,
               options):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.options = options

    if options.use_lstm:
      self.local_network = GameACLSTMNetwork(options.action_size, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(options.action_size, device)

    self.local_network.prepare_loss(options.entropy_beta)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(random.randint(0, 2**16), options, thread_index = thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    self.indent = "         |" * self.thread_index
    self.steps = 0
    self.no_reward_steps = 0
    self.terminate_on_lives_lost = options.terminate_on_lives_lost and (self.thread_index != 0)

    if self.options.train_episode_steps > 0:
      self.max_reward = 0.0
      self.max_episode_reward = 0.0
      self.episode_states = []
      self.episode_actions = []
      self.episode_rewards = []
      self.episode_values = []
      self.episode_liveses = []
      self.episode_scores = Episode_scores(options)
      self.tes = self.options.train_episode_steps
      if self.options.tes_list is not None:
        self.tes = self.options.tes_list[thread_index]
        print("[DIVERSITY]th={}:tes={}".format(thread_index, self.tes))
    self.initial_lives = self.game_state.initial_lives
    self.max_history = int(self.tes * self.options.tes_extend_ratio * 2.1)

    if self.options.record_new_record_dir is not None:
      if self.thread_index == 0:
        if not os.path.exists(self.options.record_new_record_dir):
          os.makedirs(self.options.record_new_record_dir)
      self.episode_screens = []

    if self.options.record_new_room_dir is not None:
      if self.thread_index == 0:
        if not os.path.exists(self.options.record_new_room_dir):
          os.makedirs(self.options.record_new_room_dir)
      self.episode_screens = []

    self.greediness = options.greediness
    self.repeat_action_ratio = options.repeat_action_ratio
    self.prev_action = 0

    
    

  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate

  def choose_action(self, pi_values, global_t):
    # Add greediness for broader exploration
    r = random.random()
    if r < self.greediness:
      action =  int(r * len(pi_values))
    elif r < self.repeat_action_ratio:
      action = self.prev_action
    else:
      # Increase randomness of the choice if the agent has gone too long without reward
      if self.no_reward_steps > self.options.no_reward_steps:
        randomness = (self.no_reward_steps - self.options.no_reward_steps) * self.options.randomness
        pi_values += randomness
        pi_values /= sum(pi_values)
        if self.local_t % self.options.randomness_log_interval == 0:
          elapsed_time = time.time() - self.start_time
          print("t={:6.0f},s={:9d},th={}:{}randomness={:.8f}".format(
                elapsed_time, global_t, self.thread_index, self.indent, randomness))

      pi_values -= np.finfo(np.float32).epsneg
      action_samples = np.random.multinomial(self.options.num_experiments, pi_values)
      action = action_samples.argmax(0)

    self.prev_action = action
    return action
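  # The scheme above mixes three behaviours: if the random draw r falls below
  # `greediness`, an action index is derived directly from r; if it falls
  # below `repeat_action_ratio`, the previous action is repeated; otherwise
  # the action is sampled from the policy via np.random.multinomial, with the
  # distribution flattened towards uniform once the agent has gone more than
  # `no_reward_steps` steps without reward.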

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    
  def set_start_time(self, start_time):
    self.start_time = start_time

  #@profile
  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    states = []
    actions = []
    rewards = []
    values = []
    liveses = [self.game_state.lives]
    if self.tes > 0:
      if self.episode_liveses == []:
        self.episode_liveses.append(self.game_state.lives)

    terminal_end = False

    # reset accumulated gradients
    sess.run( self.reset_gradients )

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t

    if self.options.use_lstm:
      start_lstm_state = self.local_network.lstm_state_out
    
    # t_max times loop
    for i in range(self.options.local_t_max):
      pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
      action = self.choose_action(pi_, global_t)

      states.append(self.game_state.s_t)
      actions.append(action)
      values.append(value_)
      liveses.append(self.game_state.lives)

      if (self.thread_index == 0) and (self.local_t % self.options.log_interval == 0):
        print("pi={} (thread{})".format(pi_, self.thread_index))
        print(" V={} (thread{})".format(value_, self.thread_index))

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward
      if reward > 0 and \
         (self.options.rom == "montezuma_revenge.bin" or self.options.gym_env == "MontezumaRevenge-v0"):
        elapsed_time = time.time() - self.start_time
        print("t={:6.0f},s={:4.0f},th={}:{}r={:3.0f}RM{:02d}| NEW-SCORE".format(
              elapsed_time, global_t, self.thread_index, self.indent, self.episode_reward,
              self.game_state.room_no))

      # pseudo-count reward
      if self.options.psc_use:
        reward += self.game_state.psc_reward

      # add a basic income after a run of steps without reward
      if self.no_reward_steps > self.options.no_reward_steps:
        reward += self.options.basic_income

      # clip reward
      if self.options.reward_clip > 0.0:
        reward = np.clip(reward, -self.options.reward_clip, self.options.reward_clip)
      rewards.append( reward )

      # collect episode log
      if self.tes > 0:
        self.episode_states.append(self.game_state.s_t)
        self.episode_actions.append(action)
        self.episode_rewards.append(reward)
        self.episode_values.append(value_)
        self.episode_liveses.append(self.game_state.lives)
        if len(self.episode_states) > self.max_history * 2:
          self.episode_states = self.episode_states[-self.max_history:]
          self.episode_actions = self.episode_actions[-self.max_history:]
          self.episode_rewards = self.episode_rewards[-self.max_history:]
          self.episode_values = self.episode_values[-self.max_history:]
          self.episode_liveses = self.episode_liveses[-self.max_history-1:]
        # requirement for OpenAI Gym: --clear-history-on-death=False
        if self.options.clear_history_on_death and (liveses[-2] > liveses[-1]):
          self.episode_states = []
          self.episode_actions = []
          self.episode_rewards = []
          self.episode_values = []
          self.episode_liveses = self.episode_liveses[-2:]
 
      self.local_t += 1

      if self.options.record_new_record_dir is not None \
         or self.options.record_new_room_dir is not None:
        screen = self.game_state.uncropped_screen
        if self.options.compress_frame:
          screen = lzma.compress(screen.tobytes(), preset=0)
        self.episode_screens.append(screen)

      # terminate if the play time is too long
      self.steps += 1
      if self.steps > self.options.max_play_steps:
        terminal = True

      # requirement for OpenAI Gym: --terminate-on-lives-lost=False
      # terminate if lives lost
      if self.terminate_on_lives_lost and (liveses[-2] > liveses[-1]):
        terminal = True

      # count no reward steps
      if self.game_state.reward == 0.0:
        self.no_reward_steps += 1
      else:
        self.no_reward_steps = 0

      # s_t1 -> s_t
      self.game_state.update()
      
      if self.local_t % self.options.score_log_interval == 0:
        elapsed_time = time.time() - self.start_time
        print("t={:6.0f},s={:9d},th={}:{}r={:3.0f}RM{:02d}| l={:.0f},v={:.5f},pr={:.5f}".format(
              elapsed_time, global_t, self.thread_index, self.indent,
              self.episode_reward, self.game_state.room_no,
              self.game_state.lives, value_, self.game_state.psc_reward))

      # if self.game_state.room_no != self.game_state.prev_room_no:
      #   elapsed_time = time.time() - self.start_time
      #   print("t={:6.0f},s={:9d},th={}:{}RM{:02d}>RM{:02d}| l={:.0f},v={:.5f},pr={:.5f}".format(
      #         elapsed_time, global_t, self.thread_index, self.indent, 
      #         self.game_state.prev_room_no, self.game_state.room_no,
      #         self.game_state.lives, value_, self.game_state.psc_reward))

      if self.tes > 0:
        if self.game_state.lives < self.episode_liveses[-2]:
          elapsed_time = time.time() - self.start_time
          print("t={:6.0f},s={:9d},th={}:{}l={:.0f}>{:.0f}RM{:02d}|".format(
                elapsed_time, global_t, self.thread_index, self.indent, 
                self.episode_liveses[-2], self.game_state.lives, self.game_state.room_no))

      # separate steps after getting a reward
      if self.game_state.reward > 0:
        if not terminal:
          break

      if terminal:
        terminal_end = True
        elapsed_time = time.time() - self.start_time
        end_mark = "end" if self.terminate_on_lives_lost else "END"
        print("t={:6.0f},s={:9d},th={}:{}r={:3.0f}@{}|".format(
              elapsed_time, global_t, self.thread_index, self.indent, self.episode_reward, end_mark))

        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
          
        if self.tes > 0:
          if self.options.record_new_room_dir is not None \
             and self.game_state.new_room >= 0:
            dirname = "s{:09d}-th{}-r{:03.0f}-RM{:02d}".format(global_t,  self.thread_index,\
                       self.episode_reward, self.game_state.new_room)
            dirname = os.path.join(self.options.record_new_room_dir, dirname)
            os.makedirs(dirname)
            for index, screen in enumerate(self.episode_screens):
              filename = "{:06d}.png".format(index)
              filename = os.path.join(dirname, filename)
              screen_image = screen
              if self.options.compress_frame:
                screen_image = np.frombuffer(lzma.decompress(screen), dtype=np.uint8).reshape((210, 160))
              cv2.imwrite(filename, screen_image)
            print("@@@ New Room record screens saved to {}".format(dirname))

          if self.episode_reward > self.max_episode_reward:
            if self.options.record_new_record_dir is not None:
              dirname = "s{:09d}-th{}-r{:03.0f}-RM{:02d}".format(global_t,  self.thread_index,\
                         self.episode_reward, self.game_state.room_no)
              dirname = os.path.join(self.options.record_new_record_dir, dirname)
              os.makedirs(dirname)
              for index, screen in enumerate(self.episode_screens):
                filename = "{:06d}.png".format(index)
                filename = os.path.join(dirname, filename)
                screen_image = screen
                if self.options.compress_frame:
                  screen_image = np.frombuffer(lzma.decompress(screen), dtype=np.uint8).reshape((210, 160))
                cv2.imwrite(filename, screen_image)
              print("@@@ New Record screens saved to {}".format(dirname))
            self.max_episode_reward = self.episode_reward
            if self.options.record_all_non0_record:
              self.max_episode_reward = 0

          self.max_reward = 0.0
          self.episode_states = []
          self.episode_actions = []
          self.episode_rewards = []
          self.episode_values = []
          self.episode_liveses = []
          self.episode_scores.add(self.episode_reward, global_t, self.thread_index)
          if self.options.record_new_record_dir is not None \
             or self.options.record_new_room_dir is not None:
            self.episode_screens = []

        self.episode_reward = 0
        self.steps = 0
        self.no_reward_steps = 0
        self.game_state.reset()
        if self.options.use_lstm:
          self.local_network.reset_state()
        break

    if self.thread_index == 0 and self.local_t % self.options.performance_log_interval < self.options.local_t_max:
      elapsed_time = time.time() - self.start_time
      steps_per_sec = global_t / elapsed_time
      print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
            global_t,  elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

    if self.options.gym_eval:
      diff_local_t = self.local_t - start_local_t
      return diff_local_t, terminal_end

    # don't train under the following condition
    # requirement for OpenAI Gym: --terminate-on-lives-lost=False
    if self.options.terminate_on_lives_lost and (self.thread_index == 0) and (not self.options.train_in_eval):
      return 0, terminal_end
    else:
      if self.tes > 0:
        _ = self.episode_scores.is_highscore(self.episode_reward)
        if self.episode_reward > self.max_reward:
          self.max_reward = self.episode_reward
          if True:
            tes = self.tes
            # requirement for OpenAI Gym: --test-extend=False
            if self.options.tes_extend and self.initial_lives != 0:
              tes *= self.options.tes_extend_ratio * (self.game_state.lives / self.initial_lives)
              if self.game_state.lives == self.initial_lives:
                tes *= 2
              tes = int(tes)
            tes = min(tes, len(self.episode_states))
            print("[OHL]SCORE={:3.0f},s={:9d},th={},lives={},steps={},tes={},RM{:02d}".format(self.episode_reward,  global_t, self.thread_index, self.game_state.lives, self.steps, tes, self.game_state.room_no))
            if tes == 0:
              states = []
              actions = []
              rewards = []
              values = []
              liveses = self.episode_liveses[-1:]
            else:
              states = self.episode_states[-tes:]
              actions = self.episode_actions[-tes:]
              rewards = self.episode_rewards[-tes:]
              values = self.episode_values[-tes:]
              liveses = self.episode_liveses[-tes-1:]
            if self.options.clear_history_after_ohl:
              self.episode_states = []
              self.episode_actions = []
              self.episode_rewards = []
              self.episode_values = []
              self.episode_liveses = self.episode_liveses[-2:]

      if len(states) > 0:
        R = 0.0
        if not terminal_end:
          R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        lives = liveses.pop()
        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
          # Consider the number of lives
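          # If a life was lost between the previous step and this one, the
          # bootstrapped return R is scaled down using lives_lost_rratio and
          # lives_lost_weight, and the step reward is replaced with
          # lives_lost_reward, so losing a life acts as a soft episode end.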
          if (not self.options.use_gym) and self.initial_lives != 0.0 and not self.terminate_on_lives_lost:
            prev_lives = liveses.pop()
            if prev_lives > lives:
              weight = self.options.lives_lost_weight
              rratio = self.options.lives_lost_rratio
              R *= rratio * ( (1.0 - weight) + weight * (lives / prev_lives) )
              ri = self.options.lives_lost_reward
              lives = prev_lives

          R = ri + self.options.gamma * R
          td = R - Vi
          a = np.zeros([self.options.action_size])
          a[ai] = 1

          batch_si.append(si)
          batch_a.append(a)
          batch_td.append(td)
          batch_R.append(R)

        if self.options.use_lstm:
          batch_si.reverse()
          batch_a.reverse()
          batch_td.reverse()
          batch_R.reverse()

          sess.run( self.accum_gradients,
                    feed_dict = {
                      self.local_network.s: batch_si,
                      self.local_network.a: batch_a,
                      self.local_network.td: batch_td,
                      self.local_network.r: batch_R,
                      self.local_network.initial_lstm_state: start_lstm_state,
                      self.local_network.step_size : [len(batch_a)] } )
        else:
          sess.run( self.accum_gradients,
                    feed_dict = {
                      self.local_network.s: batch_si,
                      self.local_network.a: batch_a,
                      self.local_network.td: batch_td,
                      self.local_network.r: batch_R} )
          
        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run( self.apply_gradients,
                  feed_dict = { self.learning_rate_input: cur_learning_rate } )

      # return advanced local step size
      diff_local_t = self.local_t - start_local_t
      return diff_local_t, terminal_end
class A3CTrainingThread(object):

    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if USE_LSTM:
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        else:
            self.local_network = GameACFFNetwork(ACTION_SIZE, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
          self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * \
            (self.max_global_time_step - global_time_step) / \
             self.max_global_time_step
        assert learning_rate > 0, 'Learning rate {} is not >0'.format(
            learning_rate)
        return learning_rate

    def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
        summary_str = sess.run(summary_op, feed_dict={
                               score_input: score
                               })
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        states = []
        actions = []
        rewards = []
        values = []

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        if USE_LSTM:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        start_local_t = self.local_t
        terminal_end = False
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
            action = choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            # Debug output for progress
            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print(('local_t = {:10}  pi = ' + '{:7.5f} ' * len(pi_)
                       + ' V = {:8.4f} (thread {})').format(
                           self.local_t, *pi_, value_, self.thread_index))

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            # TODO: Does this make sense?
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                print ("score=", self.episode_reward)

                self._record_score(
                    sess, summary_writer, summary_op, score_input,
                                   self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()
                if USE_LSTM:
                    self.local_network.reset_state()
                break

        # Compute and accumulate gradients

        R = 0.0 if terminal_end else self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # batch_si: states, batch_a: one-hot actions,
        # batch_td: advantages (R - V), batch_R: discounted n-step returns
        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        if USE_LSTM:
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.accum_gradients,
                     feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R,
                     self.local_network.initial_lstm_state: start_lstm_state,
                     self.local_network.step_size: [len(batch_a)]})
        else:
            sess.run(self.accum_gradients,
                     feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R})

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print ("TIMESTEP", self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
class A3CTrainingThread(object):
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
      var_refs = [v.ref() for v in self.local_network.get_vars()]
      self.gradients = tf.gradients(
        self.local_network.total_loss, var_refs,
        gate_gradients=False,
        aggregation_method=None,
        colocate_gradients_with_ops=False)
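    # Unlike the AccumTrainer-based variants above, this version computes the
    # gradients of the local loss directly with tf.gradients and later feeds
    # them to grad_applier together with the learning rate in a single
    # sess.run call, so no separate accumulate/reset ops are needed.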

    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.gradients )
      
    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0

  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate

  def choose_action(self, pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    summary_writer.flush()
    
  def set_start_time(self, start_time):
    self.start_time = start_time

  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t

    if USE_LSTM:
      start_lstm_state = self.local_network.lstm_state_out
    
    # t_max times loop
    for i in range(LOCAL_T_MAX):
      pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
      action = self.choose_action(pi_)

      states.append(self.game_state.s_t)
      actions.append(action)
      values.append(value_)

      if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
        print("pi={}".format(pi_))
        print(" V={}".format(value_))

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward

      # clip reward
      rewards.append( np.clip(reward, -1, 1) )

      self.local_t += 1

      # s_t1 -> s_t
      self.game_state.update()
      
      if terminal:
        terminal_end = True
        print("score={}".format(self.episode_reward))

        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
          
        self.episode_reward = 0
        self.game_state.reset()
        if USE_LSTM:
          self.local_network.reset_state()
        break

    R = 0.0
    if not terminal_end:
      R = self.local_network.run_value(sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_td = []
    batch_R = []

    # compute and accumulate gradients
    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      batch_si.append(si)
      batch_a.append(a)
      batch_td.append(td)
      batch_R.append(R)

    cur_learning_rate = self._anneal_learning_rate(global_t)

    if USE_LSTM:
      batch_si.reverse()
      batch_a.reverse()
      batch_td.reverse()
      batch_R.reverse()

      sess.run( self.apply_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R,
                  self.local_network.initial_lstm_state: start_lstm_state,
                  self.local_network.step_size : [len(batch_a)],
                  self.learning_rate_input: cur_learning_rate } )
    else:
      sess.run( self.apply_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R,
                  self.learning_rate_input: cur_learning_rate} )
      
    if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
      self.prev_local_t += PERFORMANCE_LOG_INTERVAL
      elapsed_time = time.time() - self.start_time
      steps_per_sec = global_t / elapsed_time
      print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
        global_t,  elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

    # return advanced local step size
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
class A3CTrainingThread(object):
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
      var_refs = [v._ref() for v in self.local_network.get_vars()]
      self.gradients = tf.gradients(
        self.local_network.total_loss, var_refs,
        gate_gradients=False,
        aggregation_method=None,
        colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.gradients )
      
    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0

  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate

  def choose_action(self, pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    summary_writer.flush()
    
  def set_start_time(self, start_time):
    self.start_time = start_time

  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t

    if USE_LSTM:
      start_lstm_state = self.local_network.lstm_state_out
    
    # t_max times loop
    for i in range(LOCAL_T_MAX):
      pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
      action = self.choose_action(pi_)

      states.append(self.game_state.s_t)
      actions.append(action)
      values.append(value_)

      if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
        print("pi={}".format(pi_))
        print(" V={}".format(value_))

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward

      # clip reward
      rewards.append( np.clip(reward, -1, 1) )

      self.local_t += 1

      # s_t1 -> s_t
      self.game_state.update()
      
      if terminal:
        terminal_end = True
        print("score={}".format(self.episode_reward))

        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
          
        self.episode_reward = 0
        self.game_state.reset()
        if USE_LSTM:
          self.local_network.reset_state()
        break

    R = 0.0
    if not terminal_end:
      R = self.local_network.run_value(sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_td = []
    batch_R = []

    # compute and accumulate gradients
    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      batch_si.append(si)
      batch_a.append(a)
      batch_td.append(td)
      batch_R.append(R)

    cur_learning_rate = self._anneal_learning_rate(global_t)

    if USE_LSTM:
      batch_si.reverse()
      batch_a.reverse()
      batch_td.reverse()
      batch_R.reverse()

      sess.run( self.apply_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R,
                  self.local_network.initial_lstm_state: start_lstm_state,
                  self.local_network.step_size : [len(batch_a)],
                  self.learning_rate_input: cur_learning_rate } )
    else:
      sess.run( self.apply_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R,
                  self.learning_rate_input: cur_learning_rate} )
      
    if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
      self.prev_local_t += PERFORMANCE_LOG_INTERVAL
      elapsed_time = time.time() - self.start_time
      steps_per_sec = global_t / elapsed_time
      print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
        global_t,  elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

    # return advanced local step size
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
class A3CTrainingThread(object):
    def __init__(self, thread_index, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_episode,
                 device, arrived_jobs, condition):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_episode = max_global_time_episode

        # Use thread_index, i.e. the machine number, to fetch all operations
        # processed on this machine
        self.operations = get_data_by_machine(thread_index)
        self.condition = condition
        self.is_terminal_counted = False
        self.last_episode_reward = 0

        if USE_LSTM:
            # The first argument is the action size; here the number of
            # operations to be processed on this machine is passed in
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index,
                                                   device)
        else:
            # The first argument is the action size; here the number of
            # operations to be processed on this machine is passed in
            self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index,
                                                 device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            self.local_network.get_vars(), self.gradients)
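        # In this job-shop variant there is no shared global network: the
        # gradients are applied to the local network's own variables, and the
        # sync step below stays commented out.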

        # self.sync = self.local_network.sync_from(global_network)

        # self.game_state = GameState(113 * thread_index)
        # Create the environment for this machine's operations
        self.env = JspEnv(self.operations, thread_index, arrived_jobs)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0

    def _anneal_learning_rate(self, global_time_step):
        # return self.initial_learning_rate
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_episode -
            global_time_step) / self.max_global_time_episode
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values, use_max_choice):
        # if len(self.env.action_space) != 1:
        #   print('\n------------------------------------------------'
        #         'machine = {}'.format(self.thread_index))
        #   print('action space = {}'.format(self.env.action_space))
        #   print('pi = {}'.format(pi_values))
        #
        # for i in range(len(pi_values)):
        #   if i not in self.env.action_space:
        #     pi_values[i] = 0
        # sum = np.sum(pi_values)
        # if sum == 0:
        #   return np.random.choice(self.env.action_space)
        # else:
        #   for i in range(len(pi_values)):
        #     pi_values[i] = pi_values[i] / sum
        #   if use_max_choice:
        #     if len(self.env.action_space) != 1:
        #       pi_values[self.env.machine_size] = 0
        #     return np.argmax(pi_values)
        #   else:
        #     return np.random.choice(range(len(pi_values)), p=pi_values)
        return np.random.choice(range(len(pi_values)), p=pi_values)
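    # Note: the action-space masking sketched in the commented block above is
    # disabled; actions are sampled directly from the full policy output.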

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def process(self, sess, global_t, summary_writer, summary_op, score_input,
                use_max_choice):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # copy weights from shared to local
        # sess.run( self.sync )

        start_local_t = self.local_t

        if USE_LSTM:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        # for i in range(LOCAL_T_MAX):
        while True:
            # pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.local_state)
            action = self.choose_action(pi_, use_max_choice)

            # states.append(self.game_state.s_t)
            states.append(self.env.local_state)
            actions.append(action)
            values.append(value_)

            # if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
            # if (self.thread_index == 0):
            #   print('machine index: ' + str(self.thread_index))
            #   print('arrived jobs:{}'.format(self.env.arrived_jobs[self.thread_index]))
            #   print('actions:{}'.format(action))
            #   print('clock:{}'.format(self.env.clock))
            #   print("action space = {}".format(self.env.action_space))
            #
            #   print("pi={}".format(pi_))
            #   print(" V={}".format(value_))
            '''
      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal
      '''

            new_state, reward, terminal, info = self.env.step(action)

            self.episode_reward += reward

            # clip reward
            # rewards.append( np.clip(reward, -1, 1) )
            rewards.append(reward)

            self.local_t += 1

            # s_t1 -> s_t
            # self.game_state.update()

            if terminal:
                terminal_end = True
                # print("score={}".format(self.episode_reward))
                # print("complete time={}".format(self.env.clock))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                # print('\n----------------------------------------------------')
                # print('machine index: ' + str(self.thread_index))
                # print('arrived jobs:{}'.format(self.env.arrived_jobs[self.thread_index]))
                # print('actions:{}'.format(action))
                # print('clock:{}'.format(self.env.clock))
                # print("jobs size = {}".format(len(self.env.init_operations)))
                # print("action space = {}".format(self.env.action_space))
                # print("pi={}".format(pi_))
                # print(" V={}".format(value_))
                # print('----------------------------------------------------\n')

                self.complete_time = self.env.clock
                self.last_episode_reward = self.episode_reward
                self.episode_reward = 0
                # self.game_state.reset()
                self.env.reset()
                if USE_LSTM:
                    self.local_network.reset_state()
                break

        R = 0.0
        if not terminal_end:
            # R = self.local_network.run_value(sess, self.game_state.s_t)
            R = self.local_network.run_value(sess, self.env.local_state)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        if USE_LSTM:
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.initial_lstm_state:
                         start_lstm_state,
                         self.local_network.step_size: [len(batch_a)],
                         self.learning_rate_input: cur_learning_rate
                     })
        else:
            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.learning_rate_input: cur_learning_rate
                     })

        # if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
        #   self.prev_local_t += PERFORMANCE_LOG_INTERVAL
        #   elapsed_time = time.time() - self.start_time
        #   steps_per_sec = global_t / elapsed_time
        #   print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
        #     global_t,  elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t, self.complete_time, self.last_episode_reward
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        # STATE_SIZE = 6 - 3 Landmarks + 5 (comm-size)
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.gradients)

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0
        self.epSteps = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * \
            (self.max_global_time_step - global_time_step) / \
            self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        states = []
        states2 = []
        actions = []
        actions2 = []
        rewards = []
        values = []
        values2 = []

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, np.concatenate([self.game_state.s1_t, [self.epSteps]]))
            pi2_, value2_ = self.local_network.run_policy_and_value(
                sess, np.concatenate([self.game_state.s2_t, [self.epSteps]]))
            action = self.choose_action(pi_)
            action2 = self.choose_action(pi2_)

            states.append(
                np.concatenate([self.game_state.s1_t, [self.epSteps]]))
            states2.append(
                np.concatenate([self.game_state.s2_t, [self.epSteps]]))

            actions.append(action)
            actions2.append(action2)

            values.append(value_)
            values2.append(value2_)

            # process game
            self.game_state.process([action, action2])

            # receive game result
            reward = self.game_state.reward
            self.episode_reward += reward

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))
                print(" R={}".format(reward))

            # clip reward
            #  rewards.append(np.clip(reward, -1, 1))
            rewards.append(reward)

            self.local_t += 1
            self.epSteps += 1

            # s_t1 -> s_t
            self.game_state.update()

            if self.epSteps >= 100:
                self.epSteps = 0
                if (self.thread_index == 0
                        and self.local_t % LOG_INTERVAL == 0):
                    print("score={}".format(self.episode_reward))

                    self._record_score(sess, summary_writer, summary_op,
                                       score_input, self.episode_reward,
                                       global_t)

                self.episode_reward = 0
                self.game_state.reset()
                self.local_network.reset_state()
                break

        R = 0.0
        R = self.local_network.run_value(
            sess, np.concatenate([self.game_state.s1_t, [self.epSteps]]))
        R2 = self.local_network.run_value(
            sess, np.concatenate([self.game_state.s2_t, [self.epSteps]]))
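        # Both returns are always bootstrapped from the current value
        # estimates (the R = 0.0 above is immediately overwritten), and the
        # same shared reward ri is used for both agents' n-step returns below.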

        actions.reverse()
        actions2.reverse()
        states.reverse()
        states2.reverse()
        rewards.reverse()
        values.reverse()
        values2.reverse()

        batch_si = []
        batch_s2i = []
        batch_a = []
        batch_a2 = []
        batch_td = []
        batch_td2 = []
        batch_R = []
        batch_R2 = []

        # compute and accumulate gradients
        for (ai, a2i, ri, si, s2i, Vi, V2i) in zip(actions, actions2, rewards,
                                                   states, states2, values,
                                                   values2):

            R = ri + GAMMA * R
            R2 = ri + GAMMA * R2
            td = R - Vi
            td2 = R2 - V2i

            a = np.zeros([5])
            a[ai] = 1

            a2 = np.zeros([5])
            a2[a2i] = 1

            batch_si.append(si)
            batch_s2i.append(s2i)
            batch_a.append(a)
            batch_a2.append(a2)
            batch_td.append(td)
            batch_td2.append(td2)
            batch_R.append(R)
            batch_R2.append(R2)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        batch_si.reverse()
        batch_s2i.reverse()
        batch_a.reverse()
        batch_a2.reverse()
        batch_td.reverse()
        batch_td2.reverse()
        batch_R.reverse()
        batch_R2.reverse()

        sess.run(self.apply_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R,
                     self.learning_rate_input: cur_learning_rate
                 })

        sess.run(self.apply_gradients,
                 feed_dict={
                     self.local_network.s: batch_s2i,
                     self.local_network.a: batch_a2,
                     self.local_network.td: batch_td2,
                     self.local_network.r: batch_R2,
                     self.learning_rate_input: cur_learning_rate
                 })

        if (self.thread_index == 0) and \
           (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print("### Performance : {} STEPS in {:.0f} sec. \
                    {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
                global_t, elapsed_time, steps_per_sec,
                steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t