Example #1
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, training, cooperative, delay_delta):

        self.delay_delta = delay_delta
        logging.info(" ".join(
            map(str,
                ("delay_delta", delay_delta, "cooperative", cooperative))))
        self.training = training
        self.cooperative = cooperative
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACLSTMNetwork(thread_index, device)
        self.local_network.prepare_loss()

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

            self.apply_gradients = grad_applier.apply_gradients(
                zip(self.gradients, global_network.get_vars()))

        self.sync = self.local_network.sync_from(global_network)
        self.episode_count = 0

        self.backup_vars = self.local_network.backup_vars()
        self.restore_backup = self.local_network.restore_backup()

        self.initial_learning_rate = initial_learning_rate
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device):
        """
        A3C 算法的 local AC 网络的训练
        :param thread_index:  线程编号,-1 是 全局的 AC 网络
        :param global_network:
        :param initial_learning_rate:
        :param learning_rate_input:
        :param grad_applier: 梯度更新器对象,论文中使用了 RMSProp
        :param max_global_time_step:
        :param device:
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        # Initialize the network parameters
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        self.local_network.prepare_loss(ENTROPY_BETA)
        # Collect the gradients of the loss w.r.t. each trainable parameter
        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(
                self.local_network.total_loss, var_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)
        # TF op that applies the gradients
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.gradients)
        # Op for each local AC network to sync its parameters from the global AC network
        self.sync = self.local_network.sync_from(global_network)
        # Wrap the game environment
        self.game_state = GameState()
        # Local time-step counter
        self.local_t = 0
        # Other training parameters
        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # Controls log output
        self.prev_local_t = 0
Example #3
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, action_size, gamma, local_t_max, entropy_beta,
                 agent_type, performance_log_interval, log_level, random_seed):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.action_size = action_size
        self.gamma = gamma
        self.local_t_max = local_t_max
        self.agent_type = agent_type
        self.performance_log_interval = performance_log_interval
        self.log_level = log_level

        if self.agent_type == 'LSTM':
            self.local_network = GameACLSTMNetwork(self.action_size,
                                                   thread_index, device)
        else:
            self.local_network = GameACFFNetwork(self.action_size,
                                                 thread_index, device)

        self.local_network.prepare_loss(entropy_beta)

        with tf.device(device):
            var_refs = []
            variables = self.local_network.get_vars()
            for v in variables:
                var_refs.append(v)

            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.gradients)

        self.sync = self.local_network.sync_from(global_network)

        np.random.seed(random_seed)
        self.game_state = GameState(random_seed * thread_index,
                                    self.action_size)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate
        self.learn_rate = self.initial_learning_rate

        self.reset_counters()

        self.episode = 0

        # variable controlling log output
        self.prev_local_t = 0
Example #4
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,task_index=""):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
      var_refs = [v._ref() for v in self.local_network.get_vars()]
      self.gradients = tf.gradients(
        self.local_network.total_loss, var_refs,
        gate_gradients=False,
        aggregation_method=None,
        colocate_gradients_with_ops=False)

    if(global_network):
      self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.gradients )
      self.sync = self.local_network.sync_from(global_network)
      self.mode="threading";
    else:
      self.apply_gradients = grad_applier.apply_gradients(
        self.local_network.get_vars(),
        self.gradients )
      self.mode="dist_tensor";
    if not task_index:
      self.game_state = GameState(113 * thread_index)
    else:
      self.game_state = GameState(113 * task_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
    def __init__(self, thread_index, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_episode,
                 device, arrived_jobs, condition):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_episode = max_global_time_episode

        # Use thread_index (the machine id) to fetch all operations processed on this machine
        self.operations = get_data_by_machine(thread_index)
        self.condition = condition
        self.is_terminal_counted = False
        self.last_episode_reward = 0

        if USE_LSTM:
            # The first argument is the action size: the number of operations waiting to be processed on this machine
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index,
                                                   device)
        else:
            # The first argument is the action size: the number of operations waiting to be processed on this machine
            self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index,
                                                 device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            self.local_network.get_vars(), self.gradients)

        # self.sync = self.local_network.sync_from(global_network)

        # self.game_state = GameState(113 * thread_index)
        # Create the environment for this machine's operations
        self.env = JspEnv(self.operations, thread_index, arrived_jobs)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
      var_refs = [v.ref() for v in self.local_network.get_vars()]
      self.gradients = tf.gradients(
        self.local_network.total_loss, var_refs,
        gate_gradients=False,
        aggregation_method=None,
        colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.gradients )

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0

    tempdir = os.path.join(os.getcwd(), "results")
    self.res_file = os.path.join(tempdir, RESULTS_FILE)
    with open(self.res_file, 'w') as res_file:
      res_file.write('itr,mean_score,max,min,std,runs,test_steps\n')
Example #7
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if NETWORK_TYPE == 'LSTM':
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        elif NETWORK_TYPE == 'DILATED':
            self.local_network = GameACDilatedNetwork(ACTION_SIZE, device)
        elif NETWORK_TYPE == 'CONV':
            self.local_network = GameACFFNetwork(ACTION_SIZE, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize( self.local_network.total_loss,
                                       self.local_network.get_vars() )

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
          global_network.get_vars(),
          self.trainer.get_accum_grad_list() )

        self.sync = self.local_network.sync_from(global_network)




        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
Example #8
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients( # watch out: update global_network
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
class A3CTrainingThread(object):

    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device):
        """
        A3C 算法的 local AC 网络的训练
        :param thread_index:  线程编号,-1 是 全局的 AC 网络
        :param global_network:
        :param initial_learning_rate:
        :param learning_rate_input:
        :param grad_applier: 梯度更新器对象,论文中使用了 RMSProp
        :param max_global_time_step:
        :param device:
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        # Initialize the network parameters
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        self.local_network.prepare_loss(ENTROPY_BETA)
        # Collect the gradients of the loss w.r.t. each trainable parameter
        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(
                self.local_network.total_loss, var_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)
        # TF op that applies the gradients
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.gradients)
        # Op for each local AC network to sync its parameters from the global AC network
        self.sync = self.local_network.sync_from(global_network)
        # Wrap the game environment
        self.game_state = GameState()
        # Local time-step counter
        self.local_t = 0
        # Other training parameters
        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # Controls log output
        self.prev_local_t = 0

    def _anneal_learning_rate(self, global_time_step):
        """
        递减学习率,主要是防止在 loss 的最小值的地方来回的震荡
        :param global_time_step: 已经玩的时间
        :return:
        """
        learning_rate = self.initial_learning_rate * (
                    self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate
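    # Worked example (illustrative values, not from the original code): with
    # initial_learning_rate = 7e-4 and max_global_time_step = 1e8, a quarter of
    # the way through training (global_time_step = 2.5e7) the annealed rate is
    # 7e-4 * (1e8 - 2.5e7) / 1e8 = 5.25e-4, and it reaches 0.0 at the final step.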

    def choose_action(self, pi_values):
        """
        这个是 epsilon-greedy, 需要指定输出行为的分布
        :param pi_values: 获得的策略
        :return: 返回一个动作
        """
        return np.random.choice(range(len(pi_values)), p=pi_values)
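    # Example (illustrative): with pi_values = [0.1, 0.7, 0.2], choose_action
    # returns action 1 with probability 0.7 and action 0 with probability 0.1.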

    def set_start_time(self, start_time):
        """
        设置开始时间
        :param start_time:
        :return:
        """
        self.start_time = start_time

    def process(self,
                sess,
                global_t,
                summary_writer,
                summary_op,
                learning_rate_input,
                score_input):
        """
        开始 local AC 网络的训练过程
        :param sess:
        :param global_t:
        :param summary_writer:
        :param learning_rate_input:
        :param score_input:
        :return:
        """
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # Copy parameters from the global AC network
        sess.run(self.sync)

        start_local_t = self.local_t

        start_lstm_state = self.local_network.lstm_state_out

        # The local AC network rolls out for at most t_max steps (LOCAL_T_MAX)
        for i in range(LOCAL_T_MAX):
            # Run the policy and value on the current game state
            pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
            action = self.choose_action(pi_)
            # Accumulate the trajectory
            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)
            # Only the first local AC network logs at regular intervals
            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            # Execute the action
            self.game_state.process(action)

            # Get the reward; the game state handles stacking 4 consecutive frames
            reward = self.game_state.reward
            terminal = self.game_state.terminal
            # Accumulate the episode reward
            self.episode_reward += reward

            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # Advance the state (s_t1 -> s_t)
            self.game_state.update()
            # As in Algorithm 3 of the paper's appendix
            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))

                self.game_state.reset()
                # Reset the LSTM's carried state
                self.local_network.reset_state()
                break
        # Compute the discounted rewards, by case
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)  # bootstrap from the value of the last state

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # MDP tuples (a, r, s, V); as in the paper, time steps are processed in reverse
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            #
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
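        # Worked example (illustrative numbers): with GAMMA = 0.99, a bootstrap
        # value R = 2.0 and clipped rewards [0, 1, 0] processed latest-first as
        # above, R takes the values 0 + 0.99*2.0 = 1.98, then
        # 1 + 0.99*1.98 = 2.9602, then 0 + 0.99*2.9602 = 2.930598, i.e. the
        # discounted return of each visited state.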

        cur_learning_rate = self._anneal_learning_rate(global_t)
        if terminal_end:
            #
            summary_str = sess.run(summary_op, feed_dict={
                score_input: self.episode_reward,
                learning_rate_input: cur_learning_rate
            })
            summary_writer.add_summary(summary_str, global_t)
            summary_writer.flush()
            # Reset the episode record
            self.episode_reward = 0
        batch_si.reverse()
        batch_a.reverse()
        batch_td.reverse()
        batch_R.reverse()

        sess.run(self.apply_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R,
                     self.local_network.initial_lstm_state: start_lstm_state,
                     self.local_network.step_size: [len(batch_a)],
                     self.learning_rate_input: cur_learning_rate})

        # Wall-clock performance, cf. page 6 of the paper
        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
                global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

        # Number of local steps advanced during this rollout (up to LOCAL_T_MAX)
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
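
A minimal driver sketch for the class above; this is not part of the original
listing. PARALLEL_SIZE, MAX_TIME_STEP, INITIAL_LEARNING_RATE and the TensorBoard
plumbing are assumptions, while the module and constant names are taken from the
other examples on this page.

# Hypothetical driver (sketch): launch several actor-learner threads that all
# update the same global network through the shared grad_applier.
import threading
import time

import tensorflow as tf

from game_ac_network import GameACLSTMNetwork
from a3c_training_thread import A3CTrainingThread
from rmsprop_applier import RMSPropApplier
from constants import ACTION_SIZE, RMSP_ALPHA, RMSP_EPSILON, GRAD_NORM_CLIP

PARALLEL_SIZE = 8              # assumed: number of actor-learner threads
MAX_TIME_STEP = 10 ** 7        # assumed: global step budget
INITIAL_LEARNING_RATE = 7e-4   # assumed: starting learning rate

device = "/cpu:0"
global_network = GameACLSTMNetwork(ACTION_SIZE, -1, device)

learning_rate_input = tf.placeholder("float")
grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=RMSP_ALPHA,
                              momentum=0.0,
                              epsilon=RMSP_EPSILON,
                              clip_norm=GRAD_NORM_CLIP,
                              device=device)

trainers = [A3CTrainingThread(i, global_network, INITIAL_LEARNING_RATE,
                              learning_rate_input, grad_applier,
                              MAX_TIME_STEP, device)
            for i in range(PARALLEL_SIZE)]

sess = tf.Session()
sess.run(tf.global_variables_initializer())

# TensorBoard plumbing expected by process()
score_input = tf.placeholder(tf.float32)
tf.summary.scalar("score", score_input)
summary_op = tf.summary.merge_all()
summary_writer = tf.summary.FileWriter("log", sess.graph)

global_t = 0


def train_function(index):
    global global_t
    trainer = trainers[index]
    trainer.set_start_time(time.time())
    while global_t < MAX_TIME_STEP:
        global_t += trainer.process(sess, global_t, summary_writer, summary_op,
                                    learning_rate_input, score_input)


threads = [threading.Thread(target=train_function, args=(i,))
           for i in range(PARALLEL_SIZE)]
for t in threads:
    t.start()
for t in threads:
    t.join()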
Example #10
def display(experiment_name, rmsp_alpha, rmsp_epsilon, grad_norm_clip,
            agent_type, action_size, rand_seed, checkpoint_dir,
            display_time_sleep, display_episodes, display_log_level,
            display_save_log, show_max):

    # use CPU for display tool
    device = "/cpu:0"

    LOG_FILE = 'log_{}-{}.txt'.format(experiment_name, agent_type)

    if agent_type == 'LSTM':
        global_network = GameACLSTMNetwork(action_size, -1, device)
    else:
        global_network = GameACFFNetwork(action_size, -1, device)

    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=rmsp_epsilon,
                                  clip_norm=grad_norm_clip,
                                  device=device)

    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")

    episode = 0
    terminal = False

    episode_rewards = []
    episode_steps = []
    episode_passed_obsts = []
    print ' '
    print 'DISPLAYING {} EPISODES'.format(display_episodes)
    print '--------------------------------------------------- '

    while not episode == display_episodes:
        episode_reward = 0
        episode_passed_obst = 0

        game_state = GameState(rand_seed, action_size, show_score=True)

        if display_log_level == 'FULL':
            print 'EPISODE {}'.format(episode)

        full_frame = None
        while True:
            pi_values, value = global_network.run_policy_and_value(
                sess, game_state.s_t)
            action = choose_action(pi_values)
            game_state.process(action)
            terminal = game_state.terminal
            episode_step = game_state.steps
            reward = game_state.reward
            passed_obst = game_state.passed_obst
            if len(episode_passed_obsts) == 0 and show_max:
                if passed_obst > 0:
                    full_frame = game_state.full_frame
            elif episode_passed_obst > np.max(
                    episode_passed_obsts) and show_max:
                full_frame = game_state.full_frame

            episode_reward += reward
            episode_passed_obst = passed_obst

            if display_log_level == 'FULL':
                print 'step  /  pi_values: {}  /  value: {}  /  action: {}  /  reward: {}  /  passed_obst: {}'.format(
                    pi_values, value, action, reward, passed_obst)

            time.sleep(display_time_sleep)

            if not terminal:
                game_state.update()
            else:
                break

        episode_rewards.append(episode_reward)
        episode_steps.append(episode_step)
        episode_passed_obsts.append(episode_passed_obst)

        if not display_log_level == 'NONE':
            reward_steps = format(
                float(episode_reward) / float(episode_step), '.4f')
            print "EPISODE: {}  /  STEPS: {}  /  PASSED OBST: {}  /  REWARD: {}  /  REWARD/STEP: {}".format(
                episode, episode_step, passed_obst, episode_reward,
                reward_steps)

        if display_save_log:
            with open(LOG_FILE, "a") as text_file:
                text_file.write('{},{},{},{},{}\n'.format(
                    episode, episode_step, passed_obst, episode_reward,
                    reward_steps))

        episode += 1

    print '--------------------------------------------------- '
    print 'DISPLAY SESSION FINISHED'
    print 'TOTAL EPISODES: {}'.format(display_episodes)
    print ' '
    print 'MIN'
    print 'REWARD: {}  /  STEPS: {}  /  PASSED OBST: {}'.format(
        np.min(episode_rewards), np.min(episode_steps),
        np.min(episode_passed_obsts))
    print ' '
    print 'AVERAGE'
    print 'REWARD: {}  /  STEPS: {}  /  PASSED OBST: {}'.format(
        np.average(episode_rewards), np.average(episode_steps),
        np.average(episode_passed_obsts))
    print ' '
    print 'MAX'
    print 'REWARD: {}  /   STEPS: {}  /   PASSED OBST: {}'.format(
        np.max(episode_rewards), np.max(episode_steps),
        np.max(episode_passed_obsts))

    if show_max and full_frame is not None:
        plt.imshow(full_frame, origin='lower')
        plt.show()
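
A hypothetical invocation of display() for reference; every argument value below
is illustrative only and not taken from the original code.

display(experiment_name='breakout', rmsp_alpha=0.99, rmsp_epsilon=0.1,
        grad_norm_clip=40.0, agent_type='LSTM', action_size=4, rand_seed=1,
        checkpoint_dir='checkpoints', display_time_sleep=0.02,
        display_episodes=10, display_log_level='FULL',
        display_save_log=False, show_max=True)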
Example #11
def run_a3c_test(args):
    """Run A3C testing."""
    GYM_ENV_NAME = args.gym_env.replace('-', '_')

    if args.use_gpu:
        assert args.cuda_devices != ''
        os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    import tensorflow as tf

    if not os.path.exists('results/a3c'):
        os.makedirs('results/a3c')

    if args.folder is not None:
        folder = args.folder
    else:
        folder = 'results/a3c/{}'.format(GYM_ENV_NAME)
        end_str = ''

        if args.use_mnih_2015:
            end_str += '_mnih2015'
        if args.use_lstm:
            end_str += '_lstm'
        if args.unclipped_reward:
            end_str += '_rawreward'
        elif args.log_scale_reward:
            end_str += '_logreward'
        if args.transformed_bellman:
            end_str += '_transformedbell'

        if args.use_transfer:
            end_str += '_transfer'
            if args.not_transfer_conv2:
                end_str += '_noconv2'
            elif args.not_transfer_conv3 and args.use_mnih_2015:
                end_str += '_noconv3'
            elif args.not_transfer_fc1:
                end_str += '_nofc1'
            elif args.not_transfer_fc2:
                end_str += '_nofc2'
        if args.finetune_upper_layers_only:
            end_str += '_tune_upperlayers'
        if args.train_with_demo_num_steps > 0 \
           or args.train_with_demo_num_epochs > 0:
            end_str += '_pretrain_ina3c'
        if args.use_demo_threads:
            end_str += '_demothreads'

        if args.load_pretrained_model:
            if args.use_pretrained_model_as_advice:
                end_str += '_modelasadvice'
            if args.use_pretrained_model_as_reward_shaping:
                end_str += '_modelasshaping'

        if args.padding == 'SAME':
            end_str += '_same'

        folder += end_str

    folder = pathlib.Path(folder)

    demo_memory_cam = None
    demo_cam_human = False
    if args.load_demo_cam:
        if args.demo_memory_folder is not None:
            demo_memory_folder = args.demo_memory_folder
        else:
            demo_memory_folder = 'collected_demo/{}'.format(GYM_ENV_NAME)

        demo_memory_folder = pathlib.Path(demo_memory_folder)

        if args.demo_cam_id is not None:
            demo_cam_human = True
            demo_cam, _, total_rewards_cam, _ = load_memory(
                name=None,
                demo_memory_folder=demo_memory_folder,
                demo_ids=args.demo_cam_id,
                imgs_normalized=False)

            demo_cam = demo_cam[int(args.demo_cam_id)]
            logger.info("loaded demo {} for testing CAM".format(
                args.demo_cam_id))

        else:
            demo_cam_folder = pathlib.Path(args.demo_cam_folder)
            demo_cam = ReplayMemory()
            demo_cam.load(name='test_cam', folder=demo_cam_folder)
            logger.info("loaded demo {} for testing CAM".format(
                str(demo_cam_folder / 'test_cam')))

        demo_memory_cam = np.zeros(
            (len(demo_cam),
             demo_cam.height,
             demo_cam.width,
             demo_cam.phi_length),
            dtype=np.float32)

        for i in range(len(demo_cam)):
            s0, _, _, _, _, _, t1, _ = demo_cam[i]
            demo_memory_cam[i] = np.copy(s0)

        del demo_cam

    device = "/cpu:0"
    gpu_options = None
    if args.use_gpu:
        device = "/gpu:"+os.environ["CUDA_VISIBLE_DEVICES"]
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=args.gpu_fraction)

    initial_learning_rate = args.initial_learn_rate
    logger.info('Initial Learning Rate={}'.format(initial_learning_rate))
    time.sleep(2)

    global_t = 0
    stop_requested = False

    game_state = GameState(env_id=args.gym_env)
    action_size = game_state.env.action_space.n

    config = tf.ConfigProto(
        gpu_options=gpu_options,
        log_device_placement=False,
        allow_soft_placement=True)

    input_shape = (84, 84, 4) if args.padding == 'VALID' else (88, 88, 4)
    if args.use_lstm:
        GameACLSTMNetwork.use_mnih_2015 = args.use_mnih_2015
        global_network = GameACLSTMNetwork(action_size, -1, device)
    else:
        GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015
        global_network = GameACFFNetwork(
            action_size, -1, device, padding=args.padding,
            in_shape=input_shape)

    learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr")

    grad_applier = tf.train.RMSPropOptimizer(
        learning_rate=learning_rate_input,
        decay=args.rmsp_alpha,
        epsilon=args.rmsp_epsilon)

    A3CTrainingThread.log_interval = args.log_interval
    A3CTrainingThread.performance_log_interval = args.performance_log_interval
    A3CTrainingThread.local_t_max = args.local_t_max
    A3CTrainingThread.demo_t_max = args.demo_t_max
    A3CTrainingThread.use_lstm = args.use_lstm
    A3CTrainingThread.action_size = action_size
    A3CTrainingThread.entropy_beta = args.entropy_beta
    A3CTrainingThread.demo_entropy_beta = args.demo_entropy_beta
    A3CTrainingThread.gamma = args.gamma
    A3CTrainingThread.use_mnih_2015 = args.use_mnih_2015
    A3CTrainingThread.env_id = args.gym_env
    A3CTrainingThread.finetune_upper_layers_only = \
        args.finetune_upper_layers_only
    A3CTrainingThread.transformed_bellman = args.transformed_bellman
    A3CTrainingThread.clip_norm = args.grad_norm_clip
    A3CTrainingThread.use_grad_cam = args.use_grad_cam

    if args.unclipped_reward:
        A3CTrainingThread.reward_type = "RAW"
    elif args.log_scale_reward:
        A3CTrainingThread.reward_type = "LOG"
    else:
        A3CTrainingThread.reward_type = "CLIP"

    if args.use_lstm:
        local_network = GameACLSTMNetwork(action_size, 0, device)
    else:
        local_network = GameACFFNetwork(
            action_size, 0, device, padding=args.padding,
            in_shape=input_shape)

    testing_thread = A3CTrainingThread(
        0, global_network, local_network, initial_learning_rate,
        learning_rate_input,
        grad_applier, 0,
        device=device)

    # prepare session
    sess = tf.Session(config=config)

    if args.use_transfer:
        if args.transfer_folder is not None:
            transfer_folder = args.transfer_folder
        else:
            transfer_folder = 'results/pretrain_models/{}'.format(GYM_ENV_NAME)
            end_str = ''

            if args.use_mnih_2015:
                end_str += '_mnih2015'
            end_str += '_l2beta1E-04_batchprop'  # TODO: make this an argument
            transfer_folder += end_str

        transfer_folder = pathlib.Path(transfer_folder)
        transfer_folder /= 'transfer_model'

        if args.not_transfer_conv2:
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                ]

        elif (args.not_transfer_conv3 and args.use_mnih_2015):
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                global_network.W_conv2,
                global_network.b_conv2,
                ]

        elif args.not_transfer_fc1:
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                global_network.W_conv2,
                global_network.b_conv2,
                ]

            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3,
                    global_network.b_conv3,
                    ]

        elif args.not_transfer_fc2:
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                global_network.W_conv2,
                global_network.b_conv2,
                global_network.W_fc1,
                global_network.b_fc1,
                ]

            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3,
                    global_network.b_conv3,
                    ]

        else:
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                global_network.W_conv2,
                global_network.b_conv2,
                global_network.W_fc1,
                global_network.b_fc1,
                global_network.W_fc2,
                global_network.b_fc2,
                ]

            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3,
                    global_network.b_conv3,
                    ]

        global_network.load_transfer_model(
            sess, folder=transfer_folder,
            not_transfer_fc2=args.not_transfer_fc2,
            not_transfer_fc1=args.not_transfer_fc1,
            not_transfer_conv3=(args.not_transfer_conv3
                                and args.use_mnih_2015),
            not_transfer_conv2=args.not_transfer_conv2,
            var_list=transfer_var_list,
            )

    def initialize_uninitialized(sess):
        global_vars = tf.global_variables()
        is_not_initialized = sess.run(
            [tf.is_variable_initialized(var) for var in global_vars])
        not_initialized_vars = [
            v for (v, f) in zip(global_vars, is_not_initialized) if not f]

        if len(not_initialized_vars):
            sess.run(tf.variables_initializer(not_initialized_vars))

    if args.use_transfer:
        initialize_uninitialized(sess)
    else:
        sess.run(tf.global_variables_initializer())

    # init or load checkpoint with saver
    root_saver = tf.train.Saver(max_to_keep=1)
    checkpoint = tf.train.get_checkpoint_state(str(folder))
    if checkpoint and checkpoint.model_checkpoint_path:
        root_saver.restore(sess, checkpoint.model_checkpoint_path)
        logger.info("checkpoint loaded:{}".format(
            checkpoint.model_checkpoint_path))
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step
        global_t = int(tokens[-1])
        logger.info(">>> global step set: {}".format(global_t))
    else:
        logger.warning("Could not find old checkpoint")

    def test_function():
        nonlocal global_t

        if args.use_transfer:
            from_folder = str(transfer_folder).split('/')[-2]
        else:
            from_folder = str(folder).split('/')[-1]

        from_folder = pathlib.Path(from_folder)
        save_folder = 'results/test_model/a3c' / from_folder
        prepare_dir(str(save_folder), empty=False)
        prepare_dir(str(save_folder / 'frames'), empty=False)

        # Evaluate model before training
        if not stop_requested:
            testing_thread.testing_model(
                sess, args.eval_max_steps, global_t, save_folder,
                demo_memory_cam=demo_memory_cam, demo_cam_human=demo_cam_human)

    def signal_handler(signal, frame):
        nonlocal stop_requested
        logger.info('You pressed Ctrl+C!')
        stop_requested = True

        if stop_requested and global_t == 0:
            sys.exit(1)

    test_thread = threading.Thread(target=test_function, args=())

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    test_thread.start()

    print('Press Ctrl+C to stop')

    test_thread.join()

    sess.close()
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,
               options):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.options = options

    if options.use_lstm:
      self.local_network = GameACLSTMNetwork(options.action_size, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(options.action_size, device)

    self.local_network.prepare_loss(options.entropy_beta)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(random.randint(0, 2**16), options, thread_index = thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    self.indent = "         |" * self.thread_index
    self.steps = 0
    self.no_reward_steps = 0
    self.terminate_on_lives_lost = options.terminate_on_lives_lost and (self.thread_index != 0)

    if self.options.train_episode_steps > 0:
      self.max_reward = 0.0
      self.max_episode_reward = 0.0
      self.episode_states = []
      self.episode_actions = []
      self.episode_rewards = []
      self.episode_values = []
      self.episode_liveses = []
      self.episode_scores = Episode_scores(options)
      self.tes = self.options.train_episode_steps
      if self.options.tes_list is not None:
        self.tes = self.options.tes_list[thread_index]
        print("[DIVERSITY]th={}:tes={}".format(thread_index, self.tes))
    self.initial_lives = self.game_state.initial_lives
    self.max_history = int(self.tes * self.options.tes_extend_ratio * 2.1)

    if self.options.record_new_record_dir is not None:
      if self.thread_index == 0:
        if not os.path.exists(self.options.record_new_record_dir):
          os.makedirs(self.options.record_new_record_dir)
      self.episode_screens = []

    if self.options.record_new_room_dir is not None:
      if self.thread_index == 0:
        if not os.path.exists(self.options.record_new_room_dir):
          os.makedirs(self.options.record_new_room_dir)
      self.episode_screens = []

    self.greediness = options.greediness
    self.repeat_action_ratio = options.repeat_action_ratio
    self.prev_action = 0

device = "/cpu:0"
if USE_GPU:
    device = "/gpu:0"

initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH,
                                    INITIAL_ALPHA_LOG_RATE)

global_t = 0

stop_requested = False

global_game = DoomGameState(scenario_path="scenarios/cig.cfg")
if USE_LSTM:
    global_network = GameACLSTMNetwork(global_game.get_action_size(), -1,
                                       device)
else:
    global_network = GameACFFNetwork(global_game.get_action_size(), -1, device)
del global_game

training_threads = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=RMSP_ALPHA,
                              momentum=0.0,
                              epsilon=RMSP_EPSILON,
                              clip_norm=GRAD_NORM_CLIP,
                              device=device)
# -*- coding: utf-8 -*-
import tensorflow as tf
import matplotlib.pyplot as plt

from game_ac_network import GameACFFNetwork, GameACLSTMNetwork
from a3c_training_thread import A3CTrainingThread
from rmsprop_applier import RMSPropApplier

import options
options = options.options

# use CPU for the weight visualization tool
device = "/cpu:0"

if options.use_lstm:
  global_network = GameACLSTMNetwork(options.action_size, -1, device)
else:
  global_network = GameACFFNetwork(options.action_size, device)

training_threads = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate = learning_rate_input,
                              decay = options.rmsp_alpha,
                              momentum = 0.0,
                              epsilon = options.rmsp_epsilon,
                              clip_norm = options.grad_norm_clip,
                              device = device)

sess = tf.Session()
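
A possible continuation of this snippet, not in the original: it restores the
newest checkpoint and plots the first-layer convolution filters. The checkpoint
option name, the W_conv1 attribute and the filter layout are assumptions
borrowed from the other examples.

# Hypothetical continuation: restore a checkpoint and visualize conv1 filters.
init = tf.global_variables_initializer()
sess.run(init)

saver = tf.train.Saver()
checkpoint = tf.train.get_checkpoint_state(options.checkpoint_dir)  # assumed option name
if checkpoint and checkpoint.model_checkpoint_path:
  saver.restore(sess, checkpoint.model_checkpoint_path)
  print("checkpoint loaded:", checkpoint.model_checkpoint_path)

# W_conv1 is assumed to have shape (k, k, in_channels, n_filters)
W_conv1 = sess.run(global_network.W_conv1)
n_filters = W_conv1.shape[3]
cols = 4
rows = (n_filters + cols - 1) // cols
fig, axes = plt.subplots(rows, cols)
for i, ax in enumerate(axes.flat):
  if i < n_filters:
    ax.imshow(W_conv1[:, :, 0, i], cmap='gray', interpolation='nearest')
  ax.axis('off')
plt.show()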
Example #15
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, 11,
                                               device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.gradients)

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0
        self.epSteps = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * \
            (self.max_global_time_step - global_time_step) / \
            self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        states = []
        states2 = []
        actions = []
        comms = []
        rewards = []
        values = []

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, comm_, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t, self.game_state.s2)
            action = self.choose_action(pi_)
            comm = self.choose_action(comm_)

            states.append(self.game_state.s_t)
            states2.append(self.game_state.s2)
            actions.append(action)
            comms.append(comm)
            values.append(value_)

            # process game
            self.game_state.process(action, comm)

            # receive game result
            reward = self.game_state.reward
            self.episode_reward += reward

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))
                print(" R={}".format(reward))

            # clip reward
            #  rewards.append(np.clip(reward, -1, 1))
            rewards.append(reward)

            self.local_t += 1
            self.epSteps += 1

            # s_t1 -> s_t
            self.game_state.update()

            if self.epSteps >= 100:
                self.epSteps = 0
                if (self.thread_index == 0
                        and self.local_t % LOG_INTERVAL == 0):
                    print("score={}".format(self.episode_reward))

                    self._record_score(sess, summary_writer, summary_op,
                                       score_input, self.episode_reward,
                                       global_t)

                self.episode_reward = 0
                self.game_state.reset()
                self.local_network.reset_state()
                break

        R = 0.0
        R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        states2.reverse()
        rewards.reverse()
        values.reverse()
        comms.reverse()

        batch_si = []
        batch_s2 = []
        batch_a = []
        batch_c = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi, ci, s2i) in zip(actions, rewards, states, values,
                                             comms, states2):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            c = np.zeros([5])
            c[ci] = 1

            batch_si.append(si)
            batch_s2.append(s2i)
            batch_a.append(a)
            batch_c.append(c)
            batch_td.append(td)
            batch_R.append(R)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        batch_si.reverse()
        batch_s2.reverse()
        batch_a.reverse()
        batch_c.reverse()
        batch_td.reverse()
        batch_R.reverse()

        sess.run(self.apply_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.comm: batch_c,
                     self.local_network.s2: batch_s2,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R,
                     self.local_network.initial_lstm_state: start_lstm_state,
                     self.local_network.step_size: [len(batch_a)],
                     self.learning_rate_input: cur_learning_rate
                 })

        if (self.thread_index == 0) and \
           (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print("### Performance : {} STEPS in {:.0f} sec. \
                    {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
                global_t, elapsed_time, steps_per_sec,
                steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
Example #16
from constants import ACTION_SIZE

from constants import CHECKPOINT_DIR
from constants import RMSP_EPSILON
from constants import RMSP_ALPHA
from constants import GRAD_NORM_CLIP


def choose_action(pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)


# Use the CPU so this check can run alongside training
device = "/cpu:0"

global_network = GameACLSTMNetwork(ACTION_SIZE, -1, device)

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=RMSP_ALPHA,
                              momentum=0.0,
                              epsilon=RMSP_EPSILON,
                              clip_norm=GRAD_NORM_CLIP,
                              device=device)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

saver = tf.train.Saver()
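
A hypothetical continuation of this snippet: restore the latest checkpoint from
CHECKPOINT_DIR and query the restored policy once. The GameState import and its
(seed) constructor are assumptions carried over from the other examples.

# Hypothetical continuation: restore the checkpoint and sample one action.
from game_state import GameState  # assumed, as in the other examples

checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
if checkpoint and checkpoint.model_checkpoint_path:
    saver.restore(sess, checkpoint.model_checkpoint_path)
    print("checkpoint loaded:", checkpoint.model_checkpoint_path)
else:
    print("Could not find old checkpoint")

game_state = GameState(0)  # assumed GameState(seed) signature
pi_values, value = global_network.run_policy_and_value(sess, game_state.s_t)
print("pi={} V={} action={}".format(pi_values, value, choose_action(pi_values)))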
Example #17
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device=None,
                 pretrained_model=None,
                 pretrained_model_sess=None,
                 advice=False,
                 reward_shaping=False):
        assert self.action_size != -1

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        self.use_pretrained_model_as_advice = advice
        self.use_pretrained_model_as_reward_shaping = reward_shaping

        logger.info("thread_index: {}".format(self.thread_index))
        logger.info("local_t_max: {}".format(self.local_t_max))
        logger.info("use_lstm: {}".format(
            colored(self.use_lstm, "green" if self.use_lstm else "red")))
        logger.info("action_size: {}".format(self.action_size))
        logger.info("entropy_beta: {}".format(self.entropy_beta))
        logger.info("gamma: {}".format(self.gamma))
        logger.info("reward_type: {}".format(self.reward_type))
        logger.info("finetune_upper_layers_only: {}".format(
            colored(self.finetune_upper_layers_only,
                    "green" if self.finetune_upper_layers_only else "red")))
        logger.info("use_pretrained_model_as_advice: {}".format(
            colored(
                self.use_pretrained_model_as_advice,
                "green" if self.use_pretrained_model_as_advice else "red")))
        logger.info("use_pretrained_model_as_reward_shaping: {}".format(
            colored(
                self.use_pretrained_model_as_reward_shaping, "green"
                if self.use_pretrained_model_as_reward_shaping else "red")))
        logger.info("transformed_bellman: {}".format(
            colored(self.transformed_bellman,
                    "green" if self.transformed_bellman else "red")))
        logger.info("clip_norm: {}".format(self.clip_norm))
        logger.info("use_grad_cam: {}".format(
            colored(self.use_grad_cam,
                    "green" if self.use_grad_cam else "red")))

        if self.use_lstm:
            GameACLSTMNetwork.use_mnih_2015 = self.use_mnih_2015
            self.local_network = GameACLSTMNetwork(self.action_size,
                                                   thread_index, device)
        else:
            GameACFFNetwork.use_mnih_2015 = self.use_mnih_2015
            self.local_network = GameACFFNetwork(self.action_size,
                                                 thread_index, device)

        with tf.device(device):
            self.local_network.prepare_loss(entropy_beta=self.entropy_beta,
                                            critic_lr=0.5)
            local_vars = self.local_network.get_vars
            if self.finetune_upper_layers_only:
                local_vars = self.local_network.get_vars_upper
            var_refs = [v._ref() for v in local_vars()]

            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs)

        global_vars = global_network.get_vars
        if self.finetune_upper_layers_only:
            global_vars = global_network.get_vars_upper

        with tf.device(device):
            if self.clip_norm is not None:
                self.gradients, grad_norm = tf.clip_by_global_norm(
                    self.gradients, self.clip_norm)
            self.gradients = list(zip(self.gradients, global_vars()))
            self.apply_gradients = grad_applier.apply_gradients(self.gradients)

            #self.apply_gradients = grad_applier.apply_gradients(
            #    global_vars(),
            #    self.gradients)

        self.sync = self.local_network.sync_from(
            global_network, upper_layers_only=self.finetune_upper_layers_only)

        self.game_state = GameState(env_id=self.env_id,
                                    display=False,
                                    no_op_max=30,
                                    human_demo=False,
                                    episode_life=True)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
        self.episode_steps = 0

        # variable controlling log output
        self.prev_local_t = 0

        self.is_demo_thread = False

        with tf.device(device):
            if self.use_grad_cam:
                self.action_meaning = (
                    self.game_state.env.unwrapped.get_action_meanings())
                self.local_network.build_grad_cam_grads()

        self.pretrained_model = pretrained_model
        self.pretrained_model_sess = pretrained_model_sess
        self.psi = 0.9 if self.use_pretrained_model_as_advice else 0.0
        self.advice_ctr = 0
        self.shaping_ctr = 0
        self.last_rho = 0.

        if self.use_pretrained_model_as_advice or self.use_pretrained_model_as_reward_shaping:
            assert self.pretrained_model is not None
Example #18
from game_ac_network import GameACLSTMNetwork

from constants import ACTION_SIZE
from constants import CHECKPOINT_DIR
from constants import USE_GPU
from game_state import GameState

device = "/cpu:0"
if USE_GPU:
    device = "/gpu:0"

global_t = 0

stop_requested = False

global_network = GameACLSTMNetwork(ACTION_SIZE, -1, 11, device)

training_threads = []

# prepare session
sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                        allow_soft_placement=True))

init = tf.global_variables_initializer()
sess.run(init)

# summary for tensorboard


def choose_action(pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)
def make_network():
    if USE_LSTM:
        return GameACLSTMNetwork(ACTION_SIZE, -1, device)
    else:
        return GameACFFNetwork(ACTION_SIZE, device)
Example #20
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, training, cooperative, delay_delta):

        self.delay_delta = delay_delta
        logging.info(" ".join(
            map(str,
                ("delay_delta", delay_delta, "cooperative", cooperative))))
        self.training = training
        self.cooperative = cooperative
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACLSTMNetwork(thread_index, device)
        self.local_network.prepare_loss()

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

            self.apply_gradients = grad_applier.apply_gradients(
                zip(self.gradients, global_network.get_vars()))

        self.sync = self.local_network.sync_from(global_network)
        self.episode_count = 0

        self.backup_vars = self.local_network.backup_vars()
        self.restore_backup = self.local_network.restore_backup()

        self.initial_learning_rate = initial_learning_rate

    def inverse_sigmoid(self, x):
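        # Decreasing logistic in x, shifted by delay_delta and with steepness
        # SIGMOID_ALPHA; referenced by some of the commented-out PCC-style reward
        # variants in process().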
        return (1 + math.exp(-self.delay_delta)) / (
            1 + math.exp(SIGMOID_ALPHA * (x - self.delay_delta)))

    def reset_state_and_reinitialize(self, sess):
        self.local_network.reset_state()
        # action, value_ = self.local_network.run_action_and_value(sess, [0.0]*STATE_SIZE)

    def get_network_vars(self):
        return self.local_network.get_vars()

    def _anneal_learning_rate(self, global_time_step):
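        # Annealing is effectively disabled: the linear decay below is kept commented
        # out and the initial learning rate is returned unchanged.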
        # learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
        # if learning_rate < 0.0:
        #   learning_rate = 0.0
        # return learning_rate
        return self.initial_learning_rate

    def _record_score(self, sess, summary_writer, summary_op, summary_inputs,
                      things, global_t):
        # print("window in _record_score", self.windows, self.time_differences)
        feed_dict = {}
        for key in things.keys():
            feed_dict[summary_inputs[key]] = things[key]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def start_anew(self):
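        # Returns True when stepping through the buffered actions in strides of the
        # clipped congestion window lands exactly on the end of the buffer, i.e. a
        # complete window's worth of steps has been collected and a new period starts.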
        # print("self.windows", self.windows)
        assert (len(self.windows) > 0)
        current_index = 0
        while current_index < len(self.windows):
            # print("current_index", current_index)
            if current_index + math.floor(
                    A3CTrainingThread.get_actual_window(
                        self.windows[current_index] +
                        self.actions[current_index])) == len(self.windows):
                return True
            current_index = current_index + math.floor(
                A3CTrainingThread.get_actual_window(
                    self.windows[current_index] + self.actions[current_index]))
            # print("current_index afterwards", current_index)

        return False

    def start_anew_index(self):
        assert (len(self.windows) > 0)
        current_index = 0
        while current_index < len(self.windows):
            current_index = current_index + math.floor(
                A3CTrainingThread.get_actual_window(
                    self.windows[current_index] + self.actions[current_index]))
        # print("current_index in start_anew", current_index)
        return current_index

    def action_step(self, sess, state, tickno, window):
        # print(self.thread_index, "in action")
        # Run this still with the old weights, before syncing them
        # print("state", state)
        assert (np.all(np.isfinite(np.array(state, dtype=np.float32))))

        # print(self.thread_index, "state", state)
        if self.training:
            self.estimated_values.append(
                self.local_network.run_value(sess, state))

            # if len(self.actions) % LOCAL_T_MAX == 0:
            if not (len(self.start_lstm_states) == 0) == (len(self.actions)
                                                          == 0):
                print("Oh no, something went pretty wrong:",
                      self.start_lstm_states, self.actions)
            assert ((len(self.start_lstm_states) == 0) == (len(
                self.actions) == 0))
            if ('LOCAL_T_MAX' in globals()
                    and len(self.actions) % globals()["LOCAL_T_MAX"]
                    == 0) or (not 'LOCAL_T_MAX' in globals() and
                              (len(self.actions) == 0 or self.start_anew())):
                # print("Starting new period")
                self.time_differences.append(None)
                # Sync for the next iteration
                sess.run(self.sync)
                self.start_lstm_states.append(
                    (self.local_network.lstm_state_out_action,
                     self.local_network.lstm_state_out_value,
                     self.local_network.lstm_state_out_duration))
                self.variable_snapshots.append(
                    sess.run(self.local_network.get_vars()))

        if self.training:
            action, value_ = self.local_network.run_action_and_value(
                sess, state)
        else:
            assert (False)
            action = self.local_network.run_action(sess, state)

        # logging.debug(" ".join(map(str,(self.thread_index,"pi_values:",pi_))))

        if self.training:
            self.states.append(state)
            self.ticknos.append(tickno)
            self.windows.append(window)
            self.actions.append(action)
            self.values.append(value_)
        # if self.local_t % LOG_INTERVAL == 0:
        #   logging.debug("{}: pi={}".format(self.thread_index, pi_))
        #   logging.debug("{}: V={}".format(self.thread_index, value_))
        # print(self.thread_index, action[0])
        return action

    def reward_step(self, sess, global_t, summary_writer, summary_op,
                    summary_inputs, reward_throughput, reward_delay, duration,
                    sent):
        # print(self.thread_index, "in reward")
        assert (reward_throughput >= 0)
        assert (reward_delay >= 0)
        # print("duration", duration)
        assert (duration >= 0)
        assert (sent >= 0)

        assert (len(self.rewards) <= 2 * MAX_WINDOW)

        self.rewards.append((reward_throughput, reward_delay, duration, sent))

        # if len(self.rewards)>=LOCAL_T_MAX or (len([item for item in self.actions[:LOCAL_T_MAX] if item is not None]) == len(self.rewards) and len(self.rewards) > 0 and self.time_differences[0] is not None):
        if ('LOCAL_T_MAX' in globals() and
            (len(self.rewards) >= globals()["LOCAL_T_MAX"] or
             (len([
                 item for item in self.actions[:globals()["LOCAL_T_MAX"]]
                 if item is not None
             ]) == len(self.rewards) and len(self.rewards) > 0
              and self.time_differences[0] is not None))
            ) or (
                not 'LOCAL_T_MAX' in globals() and
                (len(self.rewards) >= math.floor(
                    A3CTrainingThread.get_actual_window(self.windows[0] +
                                                        self.actions[0])) or
                 (len([
                     item for item in self.actions[:math.floor(
                         A3CTrainingThread.get_actual_window(self.windows[0] +
                                                             self.actions[0]))]
                     if item is not None
                 ]) == len(self.rewards) and len(self.rewards) > 0
                  and self.time_differences[0] is not None))):
            if not 'LOCAL_T_MAX' in globals():
                assert (len(self.rewards) == math.floor(
                    A3CTrainingThread.get_actual_window(self.windows[0] +
                                                        self.actions[0])
                ) or (len([
                    item for item in self.actions[:math.floor(
                        A3CTrainingThread.get_actual_window(self.windows[0] +
                                                            self.actions[0]))]
                    if item is not None
                ]) == len(self.rewards) and len(self.rewards) > 0
                      and self.time_differences[0] is not None))
            # print(self.thread_index, "rewards", self.rewards, "actions", self.actions, "time_diffs", self.time_differences)
            # assert(len(self.rewards) <= LOCAL_T_MAX)
            # print(len([item for item in self.actions[:LOCAL_T_MAX] if item is not None]), len(self.rewards[:LOCAL_T_MAX]))

            # assert(len([item for item in self.actions[:LOCAL_T_MAX] if item is not None]) == len(self.rewards[:LOCAL_T_MAX]))
            if not len([
                    item for item in self.actions[:len(self.rewards)]
                    if item is not None
            ]) == len(self.rewards[:len(self.rewards)]):
                print("actions", self.actions, "rewards", self.rewards)
            assert (len([
                item for item in self.actions[:len(self.rewards)]
                if item is not None
            ]) == len(self.rewards[:len(self.rewards)]))
            result = self.process(sess, global_t, summary_writer, summary_op,
                                  summary_inputs, self.time_differences[0])
            return result
        else:
            return 0

    def final_step(self, sess, global_t, summary_writer, summary_op,
                   summary_inputs, actions_to_remove, time_difference, window):
        # print(self.thread_index, "self.time_differences", self.time_differences)
        # print("self.actions", len(self.actions))
        # print("self.states", len(self.states))
        # print("self.values", len(self.values))
        # print("self.rewards", len(self.rewards))
        # print("self.estimated_values", len(self.estimated_values))
        # print("self.time_differences", len(self.time_differences))
        # print("self.start_lstm_states", len(self.start_lstm_states))
        # print("self.variable_snapshots", len(self.variable_snapshots))
        # self.actions = self.actions[:-actions_to_remove]
        # self.states = self.states[:-actions_to_remove]
        # self.values = self.values[:-actions_to_remove]
        # self.estimated_values = self.estimated_values[:-actions_to_remove+1] # Sure that you have to remove one less?
        # print("Final step is called")

        if self.training:
            if len(self.actions) > 0:
                self.time_differences = self.time_differences[:-1]
                self.time_differences.append(time_difference)
                # self.windows = self.windows[:-1]
                # self.windows.append(window)

                if 'LOCAL_T_MAX' in globals():
                    nones_to_add = [None] * (
                        (LOCAL_T_MAX -
                         (len(self.actions) % LOCAL_T_MAX)) % LOCAL_T_MAX)
                else:
                    # Sure this makes sense?
                    nones_to_add = [None] * (self.start_anew_index() -
                                             len(self.actions))
                self.actions += nones_to_add
                self.states += nones_to_add
                self.values += nones_to_add
                self.estimated_values += nones_to_add
                self.windows += nones_to_add
                self.ticknos += nones_to_add
            # TODO: Is this useful? I guess only the `local_t' is actually needed...
            else:
                self.local_t = 0
                self.episode_count += 1
                self.episode_reward_throughput = 0
                self.episode_reward_delay = 0
                self.episode_reward_sent = 0

        # If, for some strange reason, absolutely nothing happened in this episode, don't do anything...
        # Or if you're actually in testing mode :)
        # if len(self.rewards)>0:
        #   time_diff = self.process(sess, global_t, summary_writer, summary_op, summary_inputs, time_difference)
        # else:
        #   time_diff = 0

        # self.states = []
        # self.actions = []
        # self.rewards = []
        # self.values = []
        # self.estimated_values = []
        # self.start_lstm_states = []
        # self.variable_snapshots = []
        # FIXME: Not resetting state any longer!!! Is that bad?
        sess.run(self.sync)
        self.reset_state_and_reinitialize(sess)

    @staticmethod
    def get_actual_window(x):
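        # Clip the requested congestion window x to the range [MIN_WINDOW, MAX_WINDOW].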
        return max(min(x, MAX_WINDOW), MIN_WINDOW)

    def process(self,
                sess,
                global_t,
                summary_writer,
                summary_op,
                summary_inputs,
                time_difference=None):
        # print(self.thread_index, "in process")
        assert (len(self.rewards) > 0)

        # print(len(self.rewards))

        if not len(self.start_lstm_states) <= len(self.actions):
            print(len(self.start_lstm_states), len(self.actions))
        assert (len(self.start_lstm_states) <= len(self.actions))

        if self.local_t <= 0:
            self.start_time = time.time()

        final = time_difference is not None

        start_local_t = self.local_t

        # logging.debug(" ".join(map(str,(self.thread_index, "In process: len(rewards)", len(self.rewards), "len(states)", len(self.states), "len(actions)", len(self.actions), "len(values)", len(self.values)))))

        if 'LOCAL_T_MAX' in globals():
            actions = self.actions[:LOCAL_T_MAX]
            ticknos = self.ticknos[:LOCAL_T_MAX]
            windows = self.windows[:LOCAL_T_MAX]
            states = self.states[:LOCAL_T_MAX]
            rewards = self.rewards[:LOCAL_T_MAX]
            values = self.values[:LOCAL_T_MAX]
        else:
            # actions = self.actions[:len(self.rewards)]
            # ticknos = self.ticknos[:len(self.rewards)]
            # windows = self.windows[:len(self.rewards)]
            # states = self.states[:len(self.rewards)]
            # values = self.values[:len(self.rewards)]
            # rewards = self.rewards[:len(self.rewards)]

            actions = self.actions[:math.floor(
                A3CTrainingThread.get_actual_window(self.windows[0] +
                                                    self.actions[0]))]
            ticknos = self.ticknos[:math.floor(
                A3CTrainingThread.get_actual_window(self.windows[0] +
                                                    self.actions[0]))]
            windows = self.windows[:math.floor(
                A3CTrainingThread.get_actual_window(self.windows[0] +
                                                    self.actions[0]))]
            states = self.states[:math.floor(
                A3CTrainingThread.get_actual_window(self.windows[0] +
                                                    self.actions[0]))]
            values = self.values[:math.floor(
                A3CTrainingThread.get_actual_window(self.windows[0] +
                                                    self.actions[0]))]
            rewards = self.rewards[:math.floor(
                A3CTrainingThread.get_actual_window(self.windows[0] +
                                                    self.actions[0]))]

        actions = [item for item in actions if item is not None]
        ticknos = [item for item in ticknos if item is not None]
        windows = [item for item in windows if item is not None]
        states = [item for item in states if item is not None]
        rewards = [item for item in rewards if item is not None]
        values = [item for item in values if item is not None]

        assert (len(actions) > 0)
        assert (len(ticknos) > 0)
        assert (len(windows) > 0)
        assert (len(states) > 0)
        assert (len(rewards) > 0)
        assert (len(values) > 0)
        if not (len(actions) == len(ticknos) == len(windows) == len(states) ==
                len(rewards) == len(values)):
            print(len(self.actions), len(self.ticknos), len(self.windows),
                  len(self.states), len(self.rewards), len(self.values))
            print(self.actions, self.ticknos, self.windows, self.states,
                  self.rewards, self.values)
            print(len(actions), len(ticknos), len(windows), len(states),
                  len(rewards), len(values))
            print(actions, ticknos, windows, states, rewards, values)
        assert (len(actions) == len(ticknos) == len(windows) == len(states) ==
                len(rewards) == len(values))

        # if not len(self.actions) == len(self.ticknos) == len(self.windows) == len(self.states) == len(self.values) == len(self.estimated_values):
        #   print("In thread:", self.thread_index, "rewards:", len(self.rewards), ";", len(self.actions), len(self.ticknos), len(self.windows), len(self.states), len(self.values), len(self.estimated_values))
        # print("In thread:", self.thread_index, "rewards:", len(self.rewards), ";", len(self.actions), "lstm_states:", len(self.start_lstm_states))
        assert (len(self.actions) == len(self.ticknos) == len(self.windows) ==
                len(self.states) == len(self.values) == len(
                    self.estimated_values))
        assert (len(self.time_differences) == len(self.start_lstm_states) ==
                len(self.variable_snapshots))

        # logging.debug(" ".join(map(str,(self.thread_index, "In process: rewards", rewards, "states", states, "actions", actions, "values", values))))

        # get estimated value of step n+1
        # assert((not len(self.estimated_values) <= len(rewards)) or final)
        # print("self.estimated_values", self.estimated_values)
        # print("Spam and eggs")
        if (len(self.estimated_values) > len(rewards)
                and self.estimated_values[len(rewards)] is not None
                and not final):
            R_packets, R_duration, R_sent = self.estimated_values[len(rewards)]
        else:
            R_packets, R_duration, R_sent = self.estimated_values[len(rewards) - 1]

        R_packets_initial, R_duration_initial, R_sent_initial = R_packets, R_duration, R_sent

        R_duration = 1 / R_duration
        # R_packets, R_accumulated_delay, R_duration, R_sent = (R_packets)/(1-GAMMA), (R_accumulated_delay)/(1-GAMMA), (R_duration)/(1-GAMMA), (R_sent)/(1-GAMMA)
        # logging.debug(" ".join(map(str,("exp(R_packets)", R_packets, "exp(R_accumulated_delay)", R_accumulated_delay, "exp(R_duration)", R_duration))))
        if not (R_duration > 0):
            print("R_duration", R_duration)
        if not np.isfinite(R_duration):
            R_duration = 0.0
        assert (np.isfinite(R_duration))  # Pretty dumb
        assert (np.isfinite(R_packets))
        # assert(np.isfinite(R_accumulated_delay))
        assert (np.isfinite(R_sent))

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()
        windows.reverse()
        # logging.debug(" ".join(map(str,("values", values))))

        batch_si = []
        batch_ai = []
        batch_td = []
        batch_R_duration = []
        batch_R_packets = []
        # batch_R_accumulated_delay = []
        batch_R_sent = []

        # accumulate the training batch (states, actions, TD terms, return targets)
        for (ai, ri, si, Vi, wi) in zip(actions, rewards, states, values,
                                        windows):
            # FIXME: Make sure that it actually works with how the roll-off factor gets normalized.
            # assert(False)
            # The GAMMA_FACTOR increases the influence that following observations have on this one.

            # GAMMA = (1 - 2/(A3CTrainingThread.get_actual_window(wi+ai) + 1))
            GAMMA = 0.99

            # R_duration = ((1-GAMMA)*ri[2] + GAMMA*R_duration)
            # R_packets = ((1-GAMMA)*ri[0] + GAMMA*R_packets)
            # R_sent = ((1-GAMMA)*ri[3] + GAMMA*R_sent)
            # R_accumulated_delay = ((1-GAMMA)*ri[1] + GAMMA*R_accumulated_delay)

            R_duration = ((1 - GAMMA) * ri[2] + GAMMA * R_duration)
            R_packets = ((1 - GAMMA) * ri[0] + GAMMA * R_packets)
            R_sent = ((1 - GAMMA) * ri[3] + GAMMA * R_sent)
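            # These updates are exponentially weighted moving averages of the observed
            # rewards (weight 1 - GAMMA on the newest sample), not the usual discounted
            # A3C return R = r_i + GAMMA * R.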
            # R_accumulated_delay = ((1-GAMMA)*ri[1] + GAMMA*R_accumulated_delay)

            # R_delay = R_accumulated_delay/R_packets
            # td_delay = -(np.log(R_accumulated_delay/R_packets/DELAY_MULTIPLIER) - np.log(Vi[1]/Vi[0]/DELAY_MULTIPLIER))
            # td -= self.delay_delta*(R_accumulated_delay/R_packets - Vi[1]/Vi[0])
            # td_delay = -(R_accumulated_delay/R_packets - Vi[1]/Vi[0])

            # Doesn't work...
            # td = R_packets/R_duration/(R_accumulated_delay/R_packets+self.delay_delta) - Vi[0]/Vi[2]/(Vi[1]/Vi[0]+self.delay_delta)

            # td = R_packets - Vi[0] - self.delay_delta*(R_sent - Vi[3]) # - self.delay_delta*(R_accumulated_delay/R_packets - Vi[1]/Vi[0])
            # td = R_packets/R_duration - Vi[0]/Vi[2] - self.delay_delta*(R_accumulated_delay/R_packets - Vi[1]/Vi[0])
            # td = R_packets/R_duration - Vi[0]/(Vi[2]) - self.delay_delta*(R_accumulated_delay/R_packets - Vi[1]/Vi[0]) - (R_sent/R_duration - Vi[3]/(Vi[2]))
            # td = inverse_sigmoid(self.delay_delta, R_sent/(R_packets+R_sent))*R_packets/R_duration - inverse_sigmoid(self.delay_delta, Vi[3]/(Vi[0]+Vi[3]))*Vi[0]/Vi[2] - (R_sent/R_duration - Vi[3]/(Vi[2]))

            # td = inverse_sigmoid(self.delay_delta, R_sent/(R_packets+R_sent) - 0.05)*R_packets/R_duration - inverse_sigmoid(self.delay_delta, Vi[3]/(Vi[0]+Vi[3]) - 0.05)*Vi[0]/Vi[2] - (R_sent/R_duration - Vi[3]/(Vi[2])) - (R_accumulated_delay/R_packets - Vi[1]/Vi[0])

            # if environ.get('reward_type') == "PCC":
            #   # PCC
            #   td = self.inverse_sigmoid(((R_sent - R_packets)/R_sent))*R_packets/R_duration - self.inverse_sigmoid(((Vi[2]-Vi[0])/Vi[2]))*Vi[0]/(1/Vi[1]) - ((R_sent - R_packets)/R_duration - (Vi[2] - Vi[0])/(1/Vi[1])) #- (R_accumulated_delay/R_packets - Vi[1]/Vi[0])

            # if environ.get('reward_type') is None or environ.get('reward_type') == "no_cutoff":
            # PCC without cutoff
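            # td is the advantage-like difference between the PCC-style utility computed
            # from the smoothed rewards (throughput minus delay_delta times the rate of
            # lost packets, both per unit of duration) and the same utility computed
            # from the value head's estimates Vi.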
            td = R_packets / R_duration - Vi[0] / (
                1 / Vi[1]) - self.delay_delta * (
                    (R_sent - R_packets) / R_duration - (Vi[2] - Vi[0]) /
                    (1 / Vi[1])
                )  #- (R_accumulated_delay/R_packets - Vi[1]/Vi[0])

            # elif environ.get('reward_type') == "modified":
            #   # PCC modified
            #   td = (1 - (R_sent - R_packets)/R_sent)*R_packets/R_duration - (1 - (Vi[2]-Vi[0])/Vi[2])*Vi[0]/(1/Vi[1]) - ((R_sent - R_packets)/R_duration - (Vi[2] - Vi[0])/(1/Vi[1])) #- (R_accumulated_delay/R_packets - Vi[1]/Vi[0])

            # td = R_packets*(1-GAMMA)*inverse_sigmoid(SIGMOID_ALPHA * R_sent/(R_packets+R_sent) - self.delay_delta) - Vi[0]*inverse_sigmoid(SIGMOID_ALPHA * Vi[3]/(Vi[0]+Vi[3]) - self.delay_delta) - (R_sent*(1-GAMMA) - Vi[3]) - (R_accumulated_delay/R_packets - Vi[1]/Vi[0])

            # PCC modified
            # td = R_packets*inverse_sigmoid(SIGMOID_ALPHA * R_sent/(R_packets+R_sent) - self.delay_delta) - Vi[0]*inverse_sigmoid(SIGMOID_ALPHA * Vi[3]/(Vi[0]+Vi[3]) - self.delay_delta) - (R_sent - Vi[3])# - (R_accumulated_delay/R_packets - Vi[1]/Vi[0])

            # td = R_packets/R_duration/(R_accumulated_delay/R_packets) - Vi[0]/Vi[2]/(Vi[1]/Vi[0]) - (R_accumulated_delay/R_packets - Vi[1]/Vi[0])

            # td = (np.log(R_packets/R_duration) - np.log(Vi[0]/Vi[2])) - self.delay_delta*(R_accumulated_delay/R_packets - Vi[1]/Vi[0])

            # R_packets, R_accumulated_delay, R_duration, R_sent = (R_packets)/(1-GAMMA), (R_accumulated_delay)/(1-GAMMA), (R_duration)/(1-GAMMA), (R_sent)/(1-GAMMA)

            batch_si.append(si)
            batch_ai.append(ai)
            batch_td.append(td)
            batch_R_duration.append(1.0 / R_duration)
            batch_R_packets.append(R_packets)
            # batch_R_accumulated_delay.append(R_accumulated_delay)
            batch_R_sent.append(R_sent)

            # batch_R_duration.append(R_duration/(1-GAMMA))
            # batch_R_packets.append(R_packets/(1-GAMMA))
            # batch_R_accumulated_delay.append(R_accumulated_delay/(1-GAMMA))
            # batch_R_sent.append(R_sent/(1-GAMMA))

            # logging.debug(" ".join(map(str,("batch_td_throughput[-1]", batch_td_throughput[-1], "batch_td_delay[-1]", batch_td_delay[-1], "batch_R_packets[-1]", batch_R_packets[-1], "batch_R_accumulated_delay[-1]", batch_R_accumulated_delay[-1], "batch_R_duration[-1]", batch_R_duration[-1]))))

            self.episode_reward_throughput += ri[0]
            self.episode_reward_sent += ri[3]
            self.episode_reward_delay += ri[1]

        old_local_t = self.local_t
        self.local_t += len(rewards)

        # # if final or self.local_t % LOG_INTERVAL == 0:
        # print(self.thread_index, "windows", len(self.windows), "\nticknos", len(self.ticknos), "\nstates", len(self.states), "\nactions", len(self.actions), "\nrewards", len(self.rewards), "\nvalues", len(self.values), "\nestimated_values", len(self.estimated_values))
        # print(self.thread_index, "windows", self.windows, "\nticknos", self.ticknos, "\nstates", self.states, "\nactions", self.actions, "\nrewards", self.rewards, "\nvalues", self.values, "\nestimated_values", self.estimated_values)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # logging.info(" ".join(map(str,("All the batch stuff", "batch_si", batch_si, "batch_ai", batch_ai,"batch_R_packets", batch_R_packets, "batch_R_accumulated_delay", batch_R_accumulated_delay, "batch_R_duration", batch_R_duration))))

        self.backup_vars()
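        # Back up the current local weights; the gradient step below is run against the
        # variable snapshot taken when this batch started (fed in through var_dict),
        # and restore_backup() further down puts the backed-up weights back.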

        batch_si.reverse()
        batch_ai.reverse()
        batch_td.reverse()
        batch_R_duration.reverse()
        batch_R_packets.reverse()
        # batch_R_accumulated_delay.reverse()
        batch_R_sent.reverse()
        windows.reverse()

        # print([A3CTrainingThread.get_actual_window(w+a) for w, a in zip(windows, batch_ai)])
        feed_dict = {
            self.local_network.s:
            batch_si,
            self.local_network.a:
            batch_ai,
            self.local_network.td:
            batch_td,
            self.local_network.w: [
                A3CTrainingThread.get_actual_window(w + a)
                for w, a in zip(windows, batch_ai)
            ],
            self.local_network.r_duration:
            batch_R_duration,
            self.local_network.r_packets:
            batch_R_packets,
            self.local_network.r_sent:
            batch_R_sent,
            self.local_network.initial_lstm_state_action:
            self.start_lstm_states[0][0],
            self.local_network.initial_lstm_state_value:
            self.start_lstm_states[0][1],
            self.local_network.initial_lstm_state_duration:
            self.start_lstm_states[0][2],
            self.local_network.step_size: [len(batch_ai)],
            self.learning_rate_input:
            cur_learning_rate
        }
        var_dict = dict(
            zip(self.local_network.get_vars(), self.variable_snapshots[0]))
        feed_dict.update(var_dict)

        sess.run(self.apply_gradients, feed_dict=feed_dict)

        # if len(ticknos) == 0:
        #   print(self.thread_index, "actions", self.actions, "rewards", self.rewards, "values", self.values, "estimated_values", self.estimated_values, "ticknos", self.ticknos)

        # if final or self.local_t % LOG_INTERVAL == 0:
        if final or (self.local_t >= math.floor(self.local_t / LOG_INTERVAL) *
                     LOG_INTERVAL and old_local_t <
                     math.floor(self.local_t / LOG_INTERVAL) * LOG_INTERVAL):
            # if final:
            # if ticknos[-1]-ticknos[0] > 0 and self.episode_reward_throughput > 0:
            if self.episode_reward_throughput > 0:
                # print(ticknos)
                # print(self.episode_reward_throughput, ticknos[0], ticknos[-1])
                # normalized_final_score_throughput = self.episode_reward_throughput/(ticknos[-1]-ticknos[0])
                # logging.info("{}: self.episode_reward_throughput={}, time_difference={}".format(self.thread_index, self.episode_reward_throughput, time_difference))
                normalized_final_score_delay = self.episode_reward_delay / self.episode_reward_throughput
                loss_score = (
                    self.episode_reward_sent -
                    self.episode_reward_throughput) / self.episode_reward_sent
                # print(self.windows)

                # logging.info("{}: score_throughput={}, score_delay={}, measured throughput beginning={}, measured delay beginning={}, measured throughput end={}, measured delay end={}".format(self.thread_index, normalized_final_score_throughput, normalized_final_score_delay, batch_R_packets[0]/batch_R_duration[0], batch_R_accumulated_delay[0]/batch_R_packets[0], batch_R_packets[-1]/batch_R_duration[-1], batch_R_accumulated_delay[-1]/batch_R_packets[-1]))
                # logging.info("{}: score_delay={}, measured throughput beginning={}, measured delay beginning={}, measured throughput end={}, measured delay end={} {}".format(self.thread_index, normalized_final_score_delay, batch_R_packets[0]/batch_R_duration[0], batch_R_accumulated_delay[0]/batch_R_packets[0], batch_R_packets[-1]/batch_R_duration[-1], batch_R_accumulated_delay[-1]/batch_R_packets[-1], ("final:"+str(final)+", delta:"+str(self.delay_delta)+"; "+" ".join(map(str,("R_packets", R_packets_initial, "R_accumulated_delay", R_accumulated_delay_initial, "R_duration", R_duration_initial))), "state", batch_si[0], "action", batch_ai[0][0])))
                logging.info(
                    "{}: score_delay={}, measured throughput beginning={}, {}".
                    format(self.thread_index, normalized_final_score_delay,
                           batch_R_packets[0] / (1 / batch_R_duration[0]),
                           ("final:" + str(final) + ", delta:" +
                            str(self.delay_delta) + "; " + " ".join(
                                map(str, ("R_packets", R_packets_initial,
                                          "R_duration", R_duration_initial,
                                          "R_sent", R_sent_initial))), "state",
                            batch_si[0], "action", batch_ai[0])))

                # time_difference > 0 because of a bug in Unicorn.cc that makes it possible for time_difference to be smaller than 0.

                # elapsed_time = time.time() - self.start_time
                # steps_per_sec = self.local_t / elapsed_time
                # logging.info("### {}: Performance: {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(self.thread_index, self.local_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

                # print([[A3CTrainingThread.get_actual_window(w+a) for w, a in zip(windows, batch_ai)][0]])
                feed_dict = {
                    self.local_network.s: [batch_si[0]],
                    self.local_network.a: [batch_ai[0]],
                    self.local_network.td: [batch_td[0]],
                    self.local_network.w: [[
                        A3CTrainingThread.get_actual_window(w + a)
                        for w, a in zip(windows, batch_ai)
                    ][0]],
                    self.local_network.r_duration: [batch_R_duration[0]],
                    self.local_network.r_packets: [batch_R_packets[0]],
                    self.local_network.r_sent: [batch_R_sent[0]],
                    self.local_network.initial_lstm_state_action:
                    self.start_lstm_states[0][0],
                    self.local_network.initial_lstm_state_value:
                    self.start_lstm_states[0][1],
                    self.local_network.initial_lstm_state_duration:
                    self.start_lstm_states[0][2],
                    self.local_network.step_size: [1]
                }
                feed_dict.update(var_dict)

                entropy, actor_loss, value_loss, total_loss, window_increase, std = self.local_network.run_loss(
                    sess, feed_dict)

                # print(entropy, actor_loss, value_loss, total_loss, window_increase, std)

                things = {
                    "estimated_throughput":
                    batch_R_packets[0] / (1 / batch_R_duration[0]),
                    "estimated_loss_rate":
                    (batch_R_sent[0] - batch_R_packets[0]) / batch_R_sent[0],
                    "R_duration":
                    batch_R_duration[0],
                    "R_packets":
                    batch_R_packets[0],
                    "R_sent":
                    batch_R_sent[0],
                    "v_estimated_throughput":
                    values[0][0] / (1 / values[0][1]),
                    "v_estimated_loss_rate":
                    (values[0][2] - values[0][0]) / values[0][2],
                    "v_duration":
                    values[0][1],
                    "v_packets":
                    values[0][0],
                    "v_sent":
                    values[0][2],
                    "score_delay":
                    normalized_final_score_delay,
                    "score_lost":
                    loss_score,
                    "actor_loss":
                    actor_loss.item(),
                    "value_loss":
                    value_loss,
                    "entropy":
                    entropy.item(),
                    "total_loss":
                    total_loss,
                    "window_increase":
                    window_increase.item(),
                    "window":
                    windows[0],
                    "std":
                    std.item(),
                    "lstm_state_action_mean":
                    np.mean(self.start_lstm_states[0][0]),
                    "lstm_state_action_std":
                    np.std(self.start_lstm_states[0][0]),
                    "lstm_state_value_mean":
                    np.mean(self.start_lstm_states[0][1]),
                    "lstm_state_value_std":
                    np.std(self.start_lstm_states[0][1]),
                    "lstm_state_duration_mean":
                    np.mean(self.start_lstm_states[0][2]),
                    "lstm_state_duration_std":
                    np.std(self.start_lstm_states[0][2]),
                    # "speed": steps_per_sec
                }
                # logging.debug(" ".join(map(str,("things", things))))
                self._record_score(sess, summary_writer, summary_op,
                                   summary_inputs, things, ticknos[0])

            # if final:
            self.episode_count += 1
            self.local_t = 0
            self.episode_reward_throughput = 0
            self.episode_reward_delay = 0
            self.episode_reward_sent = 0

        self.restore_backup()

        if 'LOCAL_T_MAX' in globals():
            self.actions = self.actions[LOCAL_T_MAX:]
            self.ticknos = self.ticknos[LOCAL_T_MAX:]
            self.windows = self.windows[LOCAL_T_MAX:]
            self.states = self.states[LOCAL_T_MAX:]
            self.values = self.values[LOCAL_T_MAX:]
            self.rewards = self.rewards[LOCAL_T_MAX:]
            self.estimated_values = self.estimated_values[LOCAL_T_MAX:]
        else:
            items_to_remove = math.floor(
                A3CTrainingThread.get_actual_window(self.windows[0] +
                                                    self.actions[0]))
            self.actions = self.actions[items_to_remove:]
            self.ticknos = self.ticknos[items_to_remove:]
            self.windows = self.windows[items_to_remove:]
            self.states = self.states[items_to_remove:]
            self.values = self.values[items_to_remove:]
            self.estimated_values = self.estimated_values[items_to_remove:]
            self.rewards = self.rewards[items_to_remove:]
        self.time_differences = self.time_differences[1:]
        self.start_lstm_states = self.start_lstm_states[1:]
        self.variable_snapshots = self.variable_snapshots[1:]

        if final:
            assert (len(self.rewards) <= 0)

        return diff_local_t
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, action_size, gamma, local_t_max, entropy_beta,
                 agent_type, performance_log_interval, log_level, random_seed):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input  # differs per worker
        self.max_global_time_step = max_global_time_step  # 40 million steps

        self.action_size = action_size  #2
        self.gamma = gamma  # 0.99
        self.local_t_max = local_t_max  # 256
        self.agent_type = agent_type  #FF
        self.performance_log_interval = performance_log_interval
        self.log_level = log_level

        # initialize the worker's network
        if self.agent_type == 'LSTM':
            self.local_network = GameACLSTMNetwork(self.action_size,
                                                   thread_index, device)
        else:
            self.local_network = GameACFFNetwork(self.action_size,
                                                 thread_index, device)
        # create the loss-related variables and ops
        self.local_network.prepare_loss(entropy_beta)

        with tf.device(device):
            # fetch the worker network's parameters
            #[self.W_conv1, self.b_conv1, self.W_conv2, self.b_conv2,self.W_fc1, self.b_fc1,self.W_fc2, self.b_fc2,self.W_fc3, self.b_fc3]
            var_refs = []
            variables = self.local_network.get_vars()
            for v in variables:
                var_refs.append(v)
            # compute the gradients
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)
        # apply the worker's gradients to the global network
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.gradients)

        # op to pull (sync) parameters from the global network
        self.sync = self.local_network.sync_from(global_network)

        # initialize the game environment
        np.random.seed(random_seed)
        self.game_state = GameState(random_seed * thread_index,
                                    self.action_size)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate
        self.learn_rate = self.initial_learning_rate

        # reset some counters
        self.reset_counters()

        self.episode = 0

        # variable controlling log output
        self.prev_local_t = 0
from rmsprop_applier import RMSPropApplier

import options
options = options.options

def choose_action(pi_values):
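  # Draw options.num_experiments samples from the policy distribution and return the
  # action index drawn most often; subtracting epsneg keeps the probabilities from
  # summing to slightly more than 1, which np.random.multinomial would reject.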
  pi_values -= np.finfo(np.float32).epsneg
  action_samples = np.random.multinomial(options.num_experiments, pi_values)
  return action_samples.argmax(0)


# use CPU for display tool
device = "/cpu:0"

if options.use_lstm:
  global_network = GameACLSTMNetwork(options.action_size, -1, device)
else:
  global_network = GameACFFNetwork(options.action_size, device)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

saver = tf.train.Saver()
model_checkpoint_path = None
checkpoint = tf.train.get_checkpoint_state(options.checkpoint_dir)
if checkpoint is None:
  checkpoint = tf.train.get_checkpoint_state(os.path.dirname(options.checkpoint_dir))
  model_checkpoint_path = os.path.join(os.path.dirname(options.checkpoint_dir), os.path.basename(options.checkpoint_dir))

def visualize(experiment_name, rmsp_alpha, rmsp_epsilon, grad_norm_clip,
              agent_type, action_size, rand_seed, checkpoint_dir):

    # use CPU for weight visualize tool
    device = "/cpu:0"

    if agent_type == 'LSTM':
        global_network = GameACLSTMNetwork(action_size, -1, device)
    else:
        global_network = GameACFFNetwork(action_size, -1, device)

    training_threads = []

    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=rmsp_epsilon,
                                  clip_norm=grad_norm_clip,
                                  device=device)

    game = GameState(rand_seed, action_size)
    game.process(0)
    x_t = game.x_t

    plt.imshow(x_t, interpolation="nearest", cmap=plt.cm.gray)

    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")

    W_conv1 = sess.run(global_network.W_conv1)

    # show graph of W_conv1
    fig, axes = plt.subplots(4,
                             16,
                             figsize=(12, 6),
                             subplot_kw={
                                 'xticks': [],
                                 'yticks': []
                             })
    fig.subplots_adjust(hspace=0.1, wspace=0.1)

    for ax, i in zip(axes.flat, range(4 * 16)):
        inch = i // 16
        outch = i % 16
        img = W_conv1[:, :, inch, outch]
        ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
        ax.set_title(str(inch) + "," + str(outch))

    plt.show()

    W_conv2 = sess.run(global_network.W_conv2)

    # show graph of W_conv2
    fig, axes = plt.subplots(2,
                             32,
                             figsize=(27, 6),
                             subplot_kw={
                                 'xticks': [],
                                 'yticks': []
                             })
    fig.subplots_adjust(hspace=0.1, wspace=0.1)

    for ax, i in zip(axes.flat, range(2 * 32)):
        inch = i // 32
        outch = i % 32
        img = W_conv2[:, :, inch, outch]
        ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
        ax.set_title(str(inch) + "," + str(outch))

    plt.show()

    arr = sess.run(global_network.get_vars())

    s = tf.placeholder("float", [None, 84, 84, 4])

    b_conv1 = sess.run(global_network.b_conv1)
    b_conv2 = sess.run(global_network.b_conv2)

    inp_1 = tf.nn.conv2d(s, W_conv1, strides=[1, 4, 4, 1], padding="VALID")
    h_conv1 = tf.nn.relu(inp_1 + b_conv1)

    inp_2 = tf.nn.conv2d(h_conv1,
                         W_conv2,
                         strides=[1, 2, 2, 1],
                         padding="VALID")
    h_conv2 = tf.nn.relu(inp_2 + b_conv2)

    s_t = game.s_t

    getActivations(sess, s, h_conv1, s_t, 16)
    getActivations(sess, s, h_conv2, s_t, 32)
Exemple #24
0
def run_a3c(args):
    """
    python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015

    python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 --use-transfer --not-transfer-fc2 --transfer-folder=<>

    python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 --use-transfer --not-transfer-fc2 --transfer-folder=<> --load-pretrained-model --onevsall-mtl --pretrained-model-folder=<> --use-pretrained-model-as-advice --use-pretrained-model-as-reward-shaping
    """
    from game_ac_network import GameACFFNetwork, GameACLSTMNetwork
    from a3c_training_thread import A3CTrainingThread
    if args.use_gpu:
        assert args.cuda_devices != ''
        os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    import tensorflow as tf

    def log_uniform(lo, hi, rate):
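        # Interpolate between log(lo) and log(hi) and exponentiate, so a rate drawn
        # uniformly from [0, 1] gives a log-uniform sample in [lo, hi];
        # e.g. log_uniform(1e-4, 1e-2, 0.5) is approximately 1e-3.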
        log_lo = math.log(lo)
        log_hi = math.log(hi)
        v = log_lo * (1 - rate) + log_hi * rate
        return math.exp(v)

    if not os.path.exists('results/a3c'):
        os.makedirs('results/a3c')

    if args.folder is not None:
        folder = 'results/a3c/{}_{}'.format(args.gym_env.replace('-', '_'),
                                            args.folder)
    else:
        folder = 'results/a3c/{}'.format(args.gym_env.replace('-', '_'))
        end_str = ''

        if args.use_mnih_2015:
            end_str += '_mnih2015'
        if args.use_lstm:
            end_str += '_lstm'
        if args.unclipped_reward:
            end_str += '_rawreward'
        elif args.log_scale_reward:
            end_str += '_logreward'
        if args.transformed_bellman:
            end_str += '_transformedbell'

        if args.use_transfer:
            end_str += '_transfer'
            if args.not_transfer_conv2:
                end_str += '_noconv2'
            elif args.not_transfer_conv3 and args.use_mnih_2015:
                end_str += '_noconv3'
            elif args.not_transfer_fc1:
                end_str += '_nofc1'
            elif args.not_transfer_fc2:
                end_str += '_nofc2'
        if args.finetune_upper_layers_only:
            end_str += '_tune_upperlayers'
        if args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs > 0:
            end_str += '_pretrain_ina3c'
        if args.use_demo_threads:
            end_str += '_demothreads'

        if args.load_pretrained_model:
            if args.use_pretrained_model_as_advice:
                end_str += '_modelasadvice'
            if args.use_pretrained_model_as_reward_shaping:
                end_str += '_modelasshaping'
        folder += end_str

    if args.append_experiment_num is not None:
        folder += '_' + args.append_experiment_num

    if False:
        from common.util import LogFormatter
        fh = logging.FileHandler('{}/a3c.log'.format(folder), mode='w')
        fh.setLevel(logging.DEBUG)
        formatter = LogFormatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    demo_memory = None
    num_demos = 0
    max_reward = 0.
    if args.load_memory or args.load_demo_cam:
        if args.demo_memory_folder is not None:
            demo_memory_folder = args.demo_memory_folder
        else:
            demo_memory_folder = 'collected_demo/{}'.format(
                args.gym_env.replace('-', '_'))

    if args.load_memory:
        # FIXME: use new load_memory function
        demo_memory, actions_ctr, max_reward = load_memory(
            args.gym_env, demo_memory_folder,
            imgs_normalized=True)  #, create_symmetry=True)
        action_freq = [
            actions_ctr[a] for a in range(demo_memory[0].num_actions)
        ]
        num_demos = len(demo_memory)

    demo_memory_cam = None
    if args.load_demo_cam:
        demo_cam, _, total_rewards_cam, _ = load_memory(
            name=None,
            demo_memory_folder=demo_memory_folder,
            demo_ids=args.demo_cam_id,
            imgs_normalized=False)

        demo_cam = demo_cam[int(args.demo_cam_id)]
        demo_memory_cam = np.zeros((len(demo_cam), demo_cam.height,
                                    demo_cam.width, demo_cam.phi_length),
                                   dtype=np.float32)
        for i in range(len(demo_cam)):
            s0 = (demo_cam[i])[0]
            demo_memory_cam[i] = np.copy(s0)
        del demo_cam
        logger.info("loaded demo {} for testing CAM".format(args.demo_cam_id))

    device = "/cpu:0"
    gpu_options = None
    if args.use_gpu:
        device = "/gpu:" + os.environ["CUDA_VISIBLE_DEVICES"]
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=args.gpu_fraction)

    initial_learning_rate = args.initial_learn_rate
    logger.info('Initial Learning Rate={}'.format(initial_learning_rate))
    time.sleep(2)

    global_t = 0
    pretrain_global_t = 0
    pretrain_epoch = 0
    rewards = {'train': {}, 'eval': {}}
    best_model_reward = -(sys.maxsize)

    stop_requested = False

    game_state = GameState(env_id=args.gym_env)
    action_size = game_state.env.action_space.n
    game_state.close()
    del game_state.env
    del game_state

    config = tf.ConfigProto(gpu_options=gpu_options,
                            log_device_placement=False,
                            allow_soft_placement=True)

    pretrained_model = None
    pretrained_model_sess = None
    if args.load_pretrained_model:
        if args.onevsall_mtl:
            from game_class_network import MTLBinaryClassNetwork as PretrainedModelNetwork
        elif args.onevsall_mtl_linear:
            from game_class_network import MTLMultivariateNetwork as PretrainedModelNetwork
        else:
            from game_class_network import MultiClassNetwork as PretrainedModelNetwork
            logger.error("Not supported yet!")
            assert False

        if args.pretrained_model_folder is not None:
            pretrained_model_folder = args.pretrained_model_folder
        else:
            pretrained_model_folder = '{}_classifier_use_mnih_onevsall_mtl'.format(
                args.gym_env.replace('-', '_'))
        PretrainedModelNetwork.use_mnih_2015 = args.use_mnih_2015
        pretrained_model = PretrainedModelNetwork(action_size, -1, device)
        pretrained_model_sess = tf.Session(config=config,
                                           graph=pretrained_model.graph)
        pretrained_model.load(
            pretrained_model_sess,
            '{}/{}_checkpoint'.format(pretrained_model_folder,
                                      args.gym_env.replace('-', '_')))

    if args.use_lstm:
        GameACLSTMNetwork.use_mnih_2015 = args.use_mnih_2015
        global_network = GameACLSTMNetwork(action_size, -1, device)
    else:
        GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015
        global_network = GameACFFNetwork(action_size, -1, device)

    training_threads = []

    learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr")

    grad_applier = tf.train.RMSPropOptimizer(learning_rate=learning_rate_input,
                                             decay=args.rmsp_alpha,
                                             epsilon=args.rmsp_epsilon)

    A3CTrainingThread.log_interval = args.log_interval
    A3CTrainingThread.performance_log_interval = args.performance_log_interval
    A3CTrainingThread.local_t_max = args.local_t_max
    A3CTrainingThread.demo_t_max = args.demo_t_max
    A3CTrainingThread.use_lstm = args.use_lstm
    A3CTrainingThread.action_size = action_size
    A3CTrainingThread.entropy_beta = args.entropy_beta
    A3CTrainingThread.demo_entropy_beta = args.demo_entropy_beta
    A3CTrainingThread.gamma = args.gamma
    A3CTrainingThread.use_mnih_2015 = args.use_mnih_2015
    A3CTrainingThread.env_id = args.gym_env
    A3CTrainingThread.finetune_upper_layers_only = args.finetune_upper_layers_only
    A3CTrainingThread.transformed_bellman = args.transformed_bellman
    A3CTrainingThread.clip_norm = args.grad_norm_clip
    A3CTrainingThread.use_grad_cam = args.use_grad_cam

    if args.unclipped_reward:
        A3CTrainingThread.reward_type = "RAW"
    elif args.log_scale_reward:
        A3CTrainingThread.reward_type = "LOG"
    else:
        A3CTrainingThread.reward_type = "CLIP"

    n_shapers = args.parallel_size  #int(args.parallel_size * .25)
    mod = args.parallel_size // n_shapers
    for i in range(args.parallel_size):
        is_reward_shape = False
        is_advice = False
        if i % mod == 0:
            is_reward_shape = args.use_pretrained_model_as_reward_shaping
            is_advice = args.use_pretrained_model_as_advice
        training_thread = A3CTrainingThread(
            i,
            global_network,
            initial_learning_rate,
            learning_rate_input,
            grad_applier,
            args.max_time_step,
            device=device,
            pretrained_model=pretrained_model,
            pretrained_model_sess=pretrained_model_sess,
            advice=is_advice,
            reward_shaping=is_reward_shape)
        training_threads.append(training_thread)

    # prepare session
    sess = tf.Session(config=config)

    if args.use_transfer:
        if args.transfer_folder is not None:
            transfer_folder = args.transfer_folder
        else:
            transfer_folder = 'results/pretrain_models/{}'.format(
                args.gym_env.replace('-', '_'))
            end_str = ''
            if args.use_mnih_2015:
                end_str += '_mnih2015'
            end_str += '_l2beta1E-04_batchprop'  #TODO: make this an argument
            transfer_folder += end_str

        transfer_folder += '/transfer_model'

        if args.not_transfer_conv2:
            transfer_var_list = [
                global_network.W_conv1, global_network.b_conv1
            ]
        elif (args.not_transfer_conv3 and args.use_mnih_2015):
            transfer_var_list = [
                global_network.W_conv1, global_network.b_conv1,
                global_network.W_conv2, global_network.b_conv2
            ]
        elif args.not_transfer_fc1:
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                global_network.W_conv2,
                global_network.b_conv2,
            ]
            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3, global_network.b_conv3
                ]
        elif args.not_transfer_fc2:
            transfer_var_list = [
                global_network.W_conv1, global_network.b_conv1,
                global_network.W_conv2, global_network.b_conv2,
                global_network.W_fc1, global_network.b_fc1
            ]
            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3, global_network.b_conv3
                ]
        else:
            transfer_var_list = [
                global_network.W_conv1, global_network.b_conv1,
                global_network.W_conv2, global_network.b_conv2,
                global_network.W_fc1, global_network.b_fc1,
                global_network.W_fc2, global_network.b_fc2
            ]
            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3, global_network.b_conv3
                ]

        global_network.load_transfer_model(
            sess,
            folder=transfer_folder,
            not_transfer_fc2=args.not_transfer_fc2,
            not_transfer_fc1=args.not_transfer_fc1,
            not_transfer_conv3=(args.not_transfer_conv3
                                and args.use_mnih_2015),
            not_transfer_conv2=args.not_transfer_conv2,
            var_list=transfer_var_list)

    def initialize_uninitialized(sess):
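        # Initialize only the variables that are not initialized yet (e.g. those not
        # loaded by the transfer model above), leaving restored weights untouched.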
        global_vars = tf.global_variables()
        is_not_initialized = sess.run(
            [tf.is_variable_initialized(var) for var in global_vars])
        not_initialized_vars = [
            v for (v, f) in zip(global_vars, is_not_initialized) if not f
        ]

        if len(not_initialized_vars):
            sess.run(tf.variables_initializer(not_initialized_vars))

    if args.use_transfer:
        initialize_uninitialized(sess)
    else:
        sess.run(tf.global_variables_initializer())

    # summary writer for tensorboard
    summary_op = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(
        'results/log/a3c/{}/'.format(args.gym_env.replace('-', '_')) +
        folder[12:], sess.graph)

    # init or load checkpoint with saver
    root_saver = tf.train.Saver(max_to_keep=1)
    saver = tf.train.Saver(max_to_keep=6)
    best_saver = tf.train.Saver(max_to_keep=1)
    checkpoint = tf.train.get_checkpoint_state(folder)
    if checkpoint and checkpoint.model_checkpoint_path:
        root_saver.restore(sess, checkpoint.model_checkpoint_path)
        logger.info("checkpoint loaded:{}".format(
            checkpoint.model_checkpoint_path))
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step
        global_t = int(tokens[-1])
        logger.info(">>> global step set: {}".format(global_t))
        # set wall time
        wall_t_fname = folder + '/' + 'wall_t.' + str(global_t)
        with open(wall_t_fname, 'r') as f:
            wall_t = float(f.read())
        with open(folder + '/pretrain_global_t', 'r') as f:
            pretrain_global_t = int(f.read())
        with open(folder + '/model_best/best_model_reward',
                  'r') as f_best_model_reward:
            best_model_reward = float(f_best_model_reward.read())
        rewards = pickle.load(
            open(
                folder + '/' + args.gym_env.replace('-', '_') +
                '-a3c-rewards.pkl', 'rb'))
    else:
        logger.warning("Could not find old checkpoint")
        # set wall time
        wall_t = 0.0
        prepare_dir(folder, empty=True)
        prepare_dir(folder + '/model_checkpoints', empty=True)
        prepare_dir(folder + '/model_best', empty=True)
        prepare_dir(folder + '/frames', empty=True)

    lock = threading.Lock()
    test_lock = False
    if global_t == 0:
        test_lock = True

    last_temp_global_t = global_t
    ispretrain_markers = [False] * args.parallel_size
    num_demo_thread = 0
    ctr_demo_thread = 0

    def train_function(parallel_index):
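        # Worker loop run by each parallel thread: optional pretraining on
        # demo memory, an initial evaluation, periodic checkpointing, and
        # regular A3C updates.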
        nonlocal global_t, pretrain_global_t, pretrain_epoch, \
            rewards, test_lock, lock, \
            last_temp_global_t, ispretrain_markers, num_demo_thread, \
            ctr_demo_thread
        training_thread = training_threads[parallel_index]

        training_thread.set_summary_writer(summary_writer)

        # mark this thread as a demo thread only when demo memory is loaded
        # and demo threads are enabled
        training_thread.is_demo_thread = (args.load_memory
                                          and args.use_demo_threads)
        if (training_thread.is_demo_thread
                or args.train_with_demo_num_steps > 0
                or args.train_with_demo_num_epochs > 0):
            training_thread.pretrain_init(demo_memory)

        if global_t == 0 and (
                args.train_with_demo_num_steps > 0
                or args.train_with_demo_num_epochs > 0) and parallel_index < 2:
            ispretrain_markers[parallel_index] = True
            training_thread.replay_mem_reset()

            # Pretraining with demo memory
            logger.info("t_idx={} pretrain starting".format(parallel_index))
            while ispretrain_markers[parallel_index]:
                if stop_requested:
                    return
                if (pretrain_global_t > args.train_with_demo_num_steps
                        and pretrain_epoch > args.train_with_demo_num_epochs):
                    # At end of pretraining, reset state
                    training_thread.replay_mem_reset()
                    training_thread.episode_reward = 0
                    training_thread.local_t = 0
                    if args.use_lstm:
                        training_thread.local_network.reset_state()
                    ispretrain_markers[parallel_index] = False
                    logger.info(
                        "t_idx={} pretrain ended".format(parallel_index))
                    break

                diff_pretrain_global_t, _ = training_thread.demo_process(
                    sess, pretrain_global_t)
                for _ in range(diff_pretrain_global_t):
                    pretrain_global_t += 1
                    if pretrain_global_t % 10000 == 0:
                        logger.debug(
                            "pretrain_global_t={}".format(pretrain_global_t))

                pretrain_epoch += 1
                if pretrain_epoch % 1000 == 0:
                    logger.debug("pretrain_epoch={}".format(pretrain_epoch))

            # Waits for all threads to finish pretraining
            while not stop_requested and any(ispretrain_markers):
                time.sleep(0.01)

        # Evaluate model before training
        if not stop_requested and global_t == 0:
            with lock:
                if parallel_index == 0:
                    test_reward, test_steps, test_episodes = \
                        training_threads[0].testing(
                            sess,
                            args.eval_max_steps,
                            global_t,
                            folder,
                            demo_memory_cam=demo_memory_cam)
                    rewards['eval'][global_t] = (test_reward, test_steps,
                                                 test_episodes)
                    saver.save(
                        sess,
                        folder + '/model_checkpoints/' +
                        '{}_checkpoint'.format(args.gym_env.replace('-', '_')),
                        global_step=global_t)
                    save_best_model(test_reward)
                    test_lock = False
            # all threads wait until evaluation finishes
            while not stop_requested and test_lock:
                time.sleep(0.01)

        # set start_time
        start_time = time.time() - wall_t
        training_thread.set_start_time(start_time)
        episode_end = True
        use_demo_thread = False
        while True:
            if stop_requested:
                return
            if global_t >= (args.max_time_step * args.max_time_step_fraction):
                return

            if (args.use_demo_threads
                    and global_t < args.max_steps_threads_as_demo
                    and episode_end and num_demo_thread < 16):
                #if num_demo_thread < 2:
                demo_rate = 1.0 * (args.max_steps_threads_as_demo -
                                   global_t) / args.max_steps_threads_as_demo
                if demo_rate < 0.0333:
                    demo_rate = 0.0333

                if np.random.random() <= demo_rate and num_demo_thread < 16:
                    ctr_demo_thread += 1
                    training_thread.replay_mem_reset(
                        D_idx=ctr_demo_thread % num_demos)
                    num_demo_thread += 1
                    logger.info(
                        "idx={} as demo thread started ({}/16) rate={}".format(
                            parallel_index, num_demo_thread, demo_rate))
                    use_demo_thread = True

            if use_demo_thread:
                diff_global_t, episode_end = training_thread.demo_process(
                    sess, global_t)
                if episode_end:
                    num_demo_thread -= 1
                    use_demo_thread = False
                    logger.info("idx={} demo thread concluded ({}/16)".format(
                        parallel_index, num_demo_thread))
            else:
                diff_global_t, episode_end = training_thread.process(
                    sess, global_t, rewards)

            for _ in range(diff_global_t):
                global_t += 1
                if global_t % args.eval_freq == 0:
                    temp_global_t = global_t
                    lock.acquire()
                    try:
                        # guard against multiple threads triggering the
                        # same evaluation step
                        if last_temp_global_t == temp_global_t:
                            logger.info("Threading race problem averted!")
                            continue
                        test_lock = True
                        test_reward, test_steps, n_episodes = training_thread.testing(
                            sess,
                            args.eval_max_steps,
                            temp_global_t,
                            folder,
                            demo_memory_cam=demo_memory_cam)
                        rewards['eval'][temp_global_t] = (test_reward,
                                                          test_steps,
                                                          n_episodes)
                        if temp_global_t % (
                            (args.max_time_step * args.max_time_step_fraction)
                                // 5) == 0:
                            saver.save(sess,
                                       folder + '/model_checkpoints/' +
                                       '{}_checkpoint'.format(
                                           args.gym_env.replace('-', '_')),
                                       global_step=temp_global_t,
                                       write_meta_graph=False)
                        if test_reward > best_model_reward:
                            save_best_model(test_reward)
                        test_lock = False
                        last_temp_global_t = temp_global_t
                    finally:
                        lock.release()
                if global_t % (
                    (args.max_time_step * args.max_time_step_fraction) //
                        5) == 0:
                    saver.save(
                        sess,
                        folder + '/model_checkpoints/' +
                        '{}_checkpoint'.format(args.gym_env.replace('-', '_')),
                        global_step=global_t,
                        write_meta_graph=False)
                # all threads wait until evaluation finishes
                while not stop_requested and test_lock:
                    time.sleep(0.01)

    def signal_handler(sig, frame):
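        # Request a graceful stop on SIGINT/SIGTERM; exit immediately if no
        # training steps have been taken yet.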
        nonlocal stop_requested
        logger.info('You pressed Ctrl+C!')
        stop_requested = True

        if stop_requested and global_t == 0:
            sys.exit(1)

    def save_best_model(test_reward):
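        # Record the best evaluation reward seen so far and save a separate
        # checkpoint of the best-performing model.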
        nonlocal best_model_reward
        best_model_reward = test_reward
        with open(folder + '/model_best/best_model_reward',
                  'w') as f_best_model_reward:
            f_best_model_reward.write(str(best_model_reward))
        best_saver.save(
            sess, folder + '/model_best/' +
            '{}_checkpoint'.format(args.gym_env.replace('-', '_')))

    train_threads = []
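    # Launch one training thread per parallel worker; the main thread installs
    # the signal handlers and then waits for all workers to finish.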
    for i in range(args.parallel_size):
        train_threads.append(
            threading.Thread(target=train_function, args=(i, )))

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # set start time
    start_time = time.time() - wall_t

    for t in train_threads:
        t.start()

    print('Press Ctrl+C to stop')

    for t in train_threads:
        t.join()

    logger.info('Now saving data. Please wait')

    # write wall time
    wall_t = time.time() - start_time
    wall_t_fname = folder + '/' + 'wall_t.' + str(global_t)
    with open(wall_t_fname, 'w') as f:
        f.write(str(wall_t))
    with open(folder + '/pretrain_global_t', 'w') as f:
        f.write(str(pretrain_global_t))

    root_saver.save(
        sess,
        folder + '/{}_checkpoint_a3c'.format(args.gym_env.replace('-', '_')),
        global_step=global_t)

    pickle.dump(
        rewards,
        open(
            folder + '/' + args.gym_env.replace('-', '_') + '-a3c-rewards.pkl',
            'wb'), pickle.HIGHEST_PROTOCOL)
    logger.info('Data saved!')

    sess.close()
Exemple #25
0

if settings.mode not in ('display', 'visualize'):
    device = "/cpu:0"
    if settings.use_gpu:
        device = "/gpu:0"

    initial_learning_rates = log_uniform(settings.initial_alpha_low,
                                         settings.initial_alpha_high,
                                         settings.parallel_agent_size)
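    # one log-uniformly sampled initial learning rate per parallel agent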
    global_t = 0

    stop_requested = False

    if settings.agent_type == 'LSTM':
        global_network = GameACLSTMNetwork(settings.action_size, -1, device)
    else:
        global_network = GameACFFNetwork(settings.action_size, -1, device)

    training_threads = []

    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=settings.rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=settings.rmsp_epsilon,
                                  clip_norm=settings.grad_norm_clip,
                                  device=device)
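    # A single RMSPropApplier is created here and reused for every worker,
    # so the RMSProp statistics are shared across threads (shared RMSProp).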

    for i in range(settings.parallel_agent_size):
from game_state import GameState
from game_ac_network import GameACFFNetwork, GameACLSTMNetwork, GameACDilatedNetwork

from constants import ACTION_SIZE
from constants import PARALLEL_SIZE
from constants import CHECKPOINT_DIR
from constants import USE_GPU
from constants import NETWORK_TYPE

from constants import TESTING_DAYS

# use CPU for display tool
device = "/cpu:0"

if NETWORK_TYPE == 'LSTM':
    global_network = GameACLSTMNetwork(ACTION_SIZE, -1, device)
elif NETWORK_TYPE == 'DILATED':
    global_network = GameACDilatedNetwork(ACTION_SIZE, device)
elif NETWORK_TYPE == 'CONV':
    global_network = GameACFFNetwork(ACTION_SIZE, device)
else:
    raise SystemExit('NETWORK_TYPE must be LSTM, CONV or DILATED.')

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

saver = tf.train.Saver()
checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
if checkpoint and checkpoint.model_checkpoint_path:
    saver.restore(sess, checkpoint.model_checkpoint_path)
    def __init__(self,
                 thread_index,
                 global_network,
                 pinitial_learning_rate,
                 plearning_rate_input,
                 pgrad_applier,
                 vinitial_learning_rate,
                 vlearning_rate_input,
                 vgrad_applier,
                 max_global_time_step,
                 device,
                 task_index=""):

        self.thread_index = thread_index
        self.plearning_rate_input = plearning_rate_input
        self.vlearning_rate_input = vlearning_rate_input
        self.max_global_time_step = max_global_time_step
        self.game_state = GameState()
        state = self.game_state.reset()
        self.game_state.reset_gs(state)
        self.action_size = self.game_state.action_size
        self.state_size = self.game_state.state_size
        self.local_max_iter = self.game_state.local_max_iter

        if USE_LSTM:
            self.local_network = GameACLSTMNetwork(self.action_size,
                                                   self.state_size,
                                                   self.game_state.action_low,
                                                   self.game_state.action_high,
                                                   thread_index, device)
        else:
            self.local_network = GameACFFNetwork(self.action_size,
                                                 self.state_size,
                                                 self.game_state.action_low,
                                                 self.game_state.action_high,
                                                 thread_index, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            pvar_refs = [v._ref() for v in self.local_network.get_pvars()]
            self.policy_gradients = tf.gradients(
                self.local_network.policy_loss,
                pvar_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)
            vvar_refs = [v._ref() for v in self.local_network.get_vvars()]
            self.value_gradients = tf.gradients(
                self.local_network.value_loss,
                vvar_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)

        self.apply_policy_gradients = pgrad_applier.apply_gradients(
            self.local_network.get_pvars(), self.policy_gradients)
        self.apply_value_gradients = vgrad_applier.apply_gradients(
            self.local_network.get_vvars(), self.value_gradients)

        self.local_t = 0

        self.pinitial_learning_rate = pinitial_learning_rate
        self.vinitial_learning_rate = vinitial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0
Exemple #28
0
from constants import ACTION_SIZE
from constants import PARALLEL_SIZE
from constants import MAX_TIME_STEP
from constants import CHECKPOINT_DIR
from constants import RMSP_EPSILON
from constants import RMSP_ALPHA
from constants import GRAD_NORM_CLIP
from constants import USE_GPU
from constants import USE_LSTM

# use CPU for weight visualize tool
device = "/cpu:0"

if USE_LSTM:
  global_network = GameACLSTMNetwork(ACTION_SIZE, -1, device)
else:
  global_network = GameACFFNetwork(ACTION_SIZE, -1, device)

training_threads = []

learning_rate_input = tf.placeholder(PRECISION)

grad_applier = RMSPropApplier(learning_rate = learning_rate_input,
                              decay = RMSP_ALPHA,
                              momentum = 0.0,
                              epsilon = RMSP_EPSILON,
                              clip_norm = GRAD_NORM_CLIP,
                              device = device)

sess = tf.Session()
Exemple #29
0
                      [str(i.name)
                       for i in not_initialized_vars]))))  # only for testing
        if not_initialized_vars:
            sess.run(tf.variables_initializer(not_initialized_vars))


# initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW,
#                                     INITIAL_ALPHA_HIGH,
#                                     INITIAL_ALPHA_LOG_RATE)

initial_learning_rate = INITIAL_RATE

global_t = 0

if cooperative:
    global_network = GameACLSTMNetwork(0, device)
else:
    num_threads = environ.get('num_threads')
    assert num_threads is not None
    num_threads = int(num_threads)
    logging.info(" ".join(map(str, ("num_threads", num_threads))))

    global_network = []
    for i in range(1, num_threads + 1):
        global_network.append(GameACLSTMNetwork(-i, device))

learning_rate_input = tf.placeholder(PRECISION)

# grad_applier = RMSPropApplier(learning_rate = learning_rate_input,
#                               decay = RMSP_ALPHA,
#                               momentum = 0.0,
#                               epsilon = RMSP_EPSILON,