Example 1
    def _fill_experience(self, sess):
        """
    Fill experience buffer until buffer is full.
    """
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        pi_, _ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
        action = self.choose_action(pi_)

        new_state, reward, terminal, pixel_change = self.environment.process(
            action)

        #print('action:', action, terminal)

        frame = ExperienceFrame(prev_state, reward, action, terminal,
                                pixel_change, last_action, last_reward)
        self.experience.add_frame(frame)

        if terminal:
            self.environment.reset()
        if self.experience.is_full():
            self.environment.reset()
            print("Replay buffer filled")
Example 2
    def fill_experience(self):
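        """Take one greedy step with the A3C policy head and store the resulting
        ExperienceFrame in the replay buffer; resets the environment on terminal
        states or once the buffer is full."""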
        prev_state = self.env.last_state
        last_action = self.env.last_action
        last_reward = self.env.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)
        with torch.no_grad():
            state = torch.from_numpy(self.env.last_state).unsqueeze(0)
            lar = torch.from_numpy(last_action_reward).unsqueeze(0)
            _, pi, (self.hx, self.cx) = self.model(task_type='a3c',
                                                   states=state,
                                                   hx=self.hx,
                                                   cx=self.cx,
                                                   last_action_rewards=lar)

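            # Greedy action: argmax over the policy output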
            action_index = pi.max(1)[1].view(1, 1).item()

        new_state, reward, terminal, pixel_change = self.env.step(
            action_index)  # the new state is stored as an array

        frame = ExperienceFrame(prev_state, reward, action_index, terminal,
                                pixel_change, last_action, last_reward)
        self.memory.add_frame(frame)

        if terminal:
            self.env.reset()
        if self.memory.is_full():
            self.env.reset()
            print("Replay buffer filled")
        self.done = terminal
Example 3
    def action_test(self):
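        """Evaluation step: run the A3C head on the current observation and the
        last action/reward, then act greedily on the policy output."""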
        with torch.no_grad():
            self.update_lstm_state()
            state = torch.from_numpy(self.env.last_state).unsqueeze(0)

            last_action = self.env.last_action
            last_reward = np.clip(self.env.last_reward, -1, 1)
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)
            lar = torch.from_numpy(last_action_reward)

            v, pi, (self.hx,
                    self.cx) = self.model(task_type='a3c',
                                          states=state,
                                          hx=self.hx,
                                          cx=self.cx,
                                          last_action_rewards=lar.unsqueeze(0))
        prob = F.softmax(pi, dim=1)
        action = prob.max(1)[1].data.cpu().numpy()
        state, self.reward, self.done, pixel_change = self.env.step(action[0])
        self.info = 5
        self.state = torch.from_numpy(state).float()
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                self.state = self.state.cuda()
        self.eps_len += 1
        return self
Example 4
    def fill_experience(self):
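        """Fill the replay buffer: sample one action from the softmax policy
        (GPU-aware), step the environment, and store the resulting
        ExperienceFrame together with its pixel-change observation."""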
        prev_state = self.env.last_state
        last_action = self.env.last_action
        last_reward = self.env.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)
        with torch.no_grad():
            state = torch.from_numpy(self.env.last_state['rgb']).unsqueeze(0)
            lar = torch.from_numpy(last_action_reward).unsqueeze(0)
            # whether to gpu
            if self.gpu_id >= 0:
                with torch.cuda.device(self.gpu_id):
                    state = state.cuda()
                    lar = lar.cuda()

            _, logits, (self.hx, self.cx) = self.model(task_type='a3c',
                                                       states=state,
                                                       hx=self.hx,
                                                       cx=self.cx,
                                                       last_action_reward=lar)

            action_index = self.choose_action(
                pi_values=F.softmax(logits, 1).cpu().numpy()[0])

            obs, reward, terminal, _ = self.env.step(action_index)  # stored as arrays

            frame = ExperienceFrame(prev_state['rgb'], reward, action_index,
                                    terminal, obs['pixel_change'], last_action,
                                    last_reward)
            self.replay_buffer.add_frame(frame)

            if terminal:
                self.env.reset()
            else:
                # Carry the LSTM state forward, detached from the graph
                self.hx = self.hx.detach()
                self.cx = self.cx.detach()
            if self.replay_buffer.is_full():
                self.env.reset()
                print("Replay buffer filled")
Example 5
    def a3c_process(self):
        """
        在 on-policy 下运行程序
        :return:
        """
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []  # state values V(s)
        actions_prob = []

        terminal_end = False

        # t_max times loop
        for _ in range(self.args.num_steps):
            # Prepare last action reward
            last_action = self.env.last_action
            last_reward = self.env.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)
            state = torch.from_numpy(self.env.last_state).unsqueeze(0)
            lar = torch.from_numpy(last_action_reward)

            v, pi, (self.hx,
                    self.cx) = self.model(task_type='a3c',
                                          states=state,
                                          hx=self.hx,
                                          cx=self.cx,
                                          last_action_rewards=lar.unsqueeze(0))

            action_index = pi.max(1)[1].view(1, 1).item()

            states.append(torch.from_numpy(self.env.last_state))
            actions_prob.append(torch.squeeze(pi, dim=0))
            last_action_rewards.append(lar)
            actions.append(action_index)
            values.append(v)

            prev_state = self.env.last_state

            new_state, reward, terminal, pixel_change = self.env.step(
                action_index)
            frame = ExperienceFrame(prev_state, reward, action_index, terminal,
                                    pixel_change, last_action, last_reward)

            # Store to experience
            self.memory.add_frame(frame)

            # self.episode_reward += reward

            rewards.append(reward)

            self.update_lstm_state()
            if terminal:
                terminal_end = True
                self.env.reset()
                break

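        # Bootstrap the return R from the value of the last state unless the rollout ended on a terminal state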
        R = torch.zeros(1, 1)
        if not terminal_end:
            state = torch.from_numpy(new_state).unsqueeze(0)
            lar = torch.from_numpy(frame.get_action_reward(
                self.action_size)).unsqueeze(0)
            value, _, _ = self.model(task_type='a3c',
                                     states=state,
                                     hx=self.hx,
                                     cx=self.cx,
                                     last_action_rewards=lar)
            R = value.data
        # Build the n-step returns and advantages
        actions.reverse()
        rewards.reverse()
        values.reverse()

        batch_a = []
        batch_adv = []
        batch_R = []

        for (ai, ri, Vi) in zip(actions, rewards, values):
            R = ri + self.args.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size], dtype=np.float32)
            a[ai] = 1.0

            batch_a.append(torch.from_numpy(a))
            batch_adv.append(adv)
            batch_R.append(R)

        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()
        # Convert to tensors

        return batch_a, batch_adv, batch_R, last_action_rewards, states, actions_prob, values
Example 6
    def _process_base(self, sess, global_t, summary_writer, summary_op,
                      score_input):
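        """Run one t_max rollout of the base A3C task, store frames to the
        experience buffer, and return the batch of states, actions, advantages
        and discounted returns plus the initial LSTM state."""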
        # [Base A3C]
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        start_lstm_state = self.local_network.base_lstm_state_out

        # t_max times loop
        for _ in range(self.local_t_max):
            # Prepare last action reward
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)

            pi_, value_ = self.local_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward)

            action = self.choose_action(pi_)

            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            prev_state = self.environment.last_state

            # Process game
            new_state, reward, terminal, pixel_change = self.environment.process(
                action)
            frame = ExperienceFrame(prev_state, reward, action, terminal,
                                    pixel_change, last_action, last_reward)

            # Store to experience
            self.experience.add_frame(frame)

            self.episode_reward += reward

            rewards.append(reward)

            self.local_t += 1

            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_base_value(
                sess, new_state, frame.get_action_reward(self.action_size))

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_adv = []
        batch_R = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1.0

            batch_si.append(si)
            batch_a.append(a)
            batch_adv.append(adv)
            batch_R.append(R)

        batch_si.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state
Example 7
    def process_a3c(self):
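        """Roll out up to 20 on-policy steps, store each transition in the replay
        buffer, and return the combined A3C policy/value loss together with the
        episode score (None if no episode finished)."""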
        rewards = []
        log_probs = []  # per-action log-probabilities
        entropies = []
        values = []
        action_one_hot = []
        # adv = []  # GAE / advantage estimates
        terminal_end = False  # whether sampling stopped at a terminal state
        episode_score = None  # set when an episode finishes, so the score can be reported
        for t in range(20):
            state = torch.from_numpy(self.env.last_state['rgb']).unsqueeze(
                dim=0)  # batch = 1

            last_action = self.env.last_action
            last_reward = self.env.last_reward
            last_action_reward = torch.from_numpy(
                ExperienceFrame.concat_action_and_reward(
                    last_action, self.action_size, last_reward)).unsqueeze(0)
            # whether to gpu
            if self.gpu_id >= 0:
                with torch.cuda.device(self.gpu_id):
                    state = state.cuda()
                    last_action_reward = last_action_reward.cuda()

            value, logits, (self.hx, self.cx) = self.model(
                'a3c',
                state,
                hx=self.hx,
                cx=self.cx,
                last_action_reward=last_action_reward)
            prob = F.softmax(logits, dim=1)  # shape: (batch, action_size)
            # Clamp before the log to avoid NaN gradients from zero probabilities
            log_prob = torch.log(prob.clamp(1e-20, 1.0))
            entropy = -(log_prob * prob).sum(1)
            # Sample an action from the current policy (no gradient needed)
            with torch.no_grad():
                action_index = self.choose_action(
                    pi_values=F.softmax(logits, 1).cpu().numpy()[0])

            prev_state = self.env.last_state['rgb']

            observation, reward, terminal, _ = self.env.step(action_index)

            # Periodically log the policy and value estimates
            if self.rank == 0 and self.local_t % 100 == 0:
                print("pi={}".format(prob.detach().cpu().numpy()))
                print(" V={}".format(value.detach().cpu().numpy()))

            self.local_t += 1
            # Add the transition to the replay buffer
            frame = ExperienceFrame(prev_state, reward, action_index, terminal,
                                    observation['pixel_change'], last_action,
                                    last_reward)

            # Store to experience
            self.replay_buffer.add_frame(frame)

            entropies.append(entropy)
            values.append(value)
            rewards.append(reward)
            log_probs.append(log_prob)

            a = torch.zeros(self.action_size, dtype=torch.float32)
            # a = np.zeros([self.action_size], dtype=np.float32)
            a[action_index] = 1.0
            if self.gpu_id >= 0:
                with torch.cuda.device(self.gpu_id):
                    a = a.cuda()
            action_one_hot.append(a)

            self.episodic_score += reward
            if terminal:
                print('Score: {0}'.format(self.episodic_score))
                episode_score = self.episodic_score
                self.episodic_score = 0
                terminal_end = True
                self.env.reset()
                self.reset()
                break
            else:
                self.hx = self.hx.detach()
                self.cx = self.cx.detach()
        # Compute the bootstrapped return R
        R = torch.zeros(1, 1)
        if not terminal_end:
            with torch.no_grad():
                # Bootstrap from the value estimate of the last observed state
                state = torch.from_numpy(observation['rgb']).unsqueeze(0)
                lar = torch.from_numpy(
                    frame.get_action_reward(self.action_size)).unsqueeze(0)

                # whether to gpu
                if self.gpu_id >= 0:
                    with torch.cuda.device(self.gpu_id):
                        state = state.cuda()
                        lar = lar.cuda()

                value, _, (_, _) = self.model(task_type='a3c',
                                              states=state,
                                              hx=self.hx,
                                              cx=self.cx,
                                              last_action_reward=lar)
                # This is V(s_t; theta_v'); detach it so it cannot flow into the actor's gradient
                R = value.detach()
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                R = R.cuda()
        # values.append(R)  # for bootstrapping
        # Accumulate the A3C losses
        policy_loss = 0
        value_loss = 0
        # Walk the rollout backwards
        for i in reversed(range(len(rewards))):  # i = t-1, ..., t_start
            R = self.args.gamma * R + rewards[i]  # R <- r_t + gamma * R
            # Advantage A = R - V(s_i; theta_v'); it carries the value-head gradient,
            # so it is detached in the policy-gradient term below
            adv = R - values[i]
            # Critic loss: 0.5 * MSE(R, V), so the value head learns at half the actor's rate
            value_loss += 0.5 * self.l2_loss(R, values[i])
            log_prob_a = (log_probs[i] * action_one_hot[i]).sum(1)  # log pi(a_i | s_i; theta')
            # Entropy bonus (beta = 0.001) is subtracted to encourage exploration
            policy_loss += -log_prob_a * adv.detach() - entropies[i] * 0.001
        return value_loss + policy_loss, episode_score
Example 8
  def _process_a3c(self, sess):
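    """Session-based (TensorFlow) variant of the base A3C rollout: collect up to
    LOCAL_T_MAX steps, store frames to the experience buffer, and build the
    training batch."""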
    states = []
    last_action_rewards = []
    actions = []
    rewards = []
    values = []
    terminal_end = False
    start_lstm_state = self.base_lstm_state_out
    for _ in range(LOCAL_T_MAX):
      last_action = self.env.last_action
      last_reward = self.env.last_reward
      last_action_reward = ExperienceFrame.concat_action_and_reward(last_action,
                                                                    self.action_n,
                                                                    last_reward)
      pi_, value_ = self.run_base_policy_and_value(sess,
                                                                self.env.last_state,
                                                                last_action_reward)
      action = choose_action(pi_)
      states.append(self.env.last_state)
      last_action_rewards.append(last_action_reward)
      actions.append(action)
      values.append(value_)

      prev_state = self.env.last_state

      # Process game
      new_state, reward, terminal, pixel_change = self.env.process(action)
      frame = ExperienceFrame(prev_state, reward, action, terminal, pixel_change,
                              last_action, last_reward)

      # Store to experience
      self.experience.add_frame(frame)

      self.episode_reward += reward

      rewards.append(reward)

      if terminal:
        terminal_end = True
        print("score={}".format(self.episode_reward))
          
        self.episode_reward = 0
        self.env.reset()
        self.reset_state()
        break

    R = 0.0
    if not terminal_end:
      R = self.run_base_value(sess, new_state, frame.get_last_action_reward(self.action_n))

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_adv = []
    batch_R = []

    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      adv = R - Vi
      a = np.zeros([self.action_n])
      a[ai] = 1.0

      batch_si.append(si)
      batch_a.append(a)
      batch_adv.append(adv)
      batch_R.append(R)

    batch_si.reverse()
    batch_a.reverse()
    batch_adv.reverse()
    batch_R.reverse()
    
    return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state