def _fill_experience(self, sess):
    """
    Fill experience buffer until buffer is full.
    """
    prev_state = self.environment.last_state
    last_action = self.environment.last_action
    last_reward = self.environment.last_reward
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward)

    pi_, _ = self.local_network.run_base_policy_and_value(
        sess, self.environment.last_state, last_action_reward)
    action = self.choose_action(pi_)

    new_state, reward, terminal, pixel_change = self.environment.process(
        action)
    # print('action:', action, terminal)

    frame = ExperienceFrame(prev_state, reward, action, terminal,
                            pixel_change, last_action, last_reward)
    self.experience.add_frame(frame)

    if terminal:
        self.environment.reset()
    if self.experience.is_full():
        self.environment.reset()
        print("Replay buffer filled")
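`ExperienceFrame.concat_action_and_reward` is used throughout this section but not defined here; a minimal sketch, assuming (as in the UNREAL setup) it returns a one-hot encoding of the last action concatenated with the scalar last reward:

import numpy as np

# Sketch of the assumed behavior of ExperienceFrame.concat_action_and_reward:
# a one-hot action vector of length `action_size`, followed by the reward.
def concat_action_and_reward(action, action_size, reward):
    action_reward = np.zeros([action_size + 1], dtype=np.float32)
    action_reward[action] = 1.0        # one-hot last action
    action_reward[-1] = float(reward)  # scalar last reward in the final slot
    return action_reward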
def fill_experience(self):
    prev_state = self.env.last_state
    last_action = self.env.last_action
    last_reward = self.env.last_reward
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward)

    with torch.no_grad():
        state = torch.from_numpy(self.env.last_state).unsqueeze(0)
        lar = torch.from_numpy(last_action_reward).unsqueeze(0)
        _, pi, (self.hx, self.cx) = self.model(task_type='a3c',
                                               states=state,
                                               hx=self.hx,
                                               cx=self.cx,
                                               last_action_rewards=lar)
        # Greedy action: index of the highest policy logit.
        action_index = pi.max(1)[1].view(1, 1).item()

    new_state, reward, terminal, pixel_change = self.env.step(action_index)

    # Store the transition as an ExperienceFrame.
    frame = ExperienceFrame(prev_state, reward, action_index, terminal,
                            pixel_change, last_action, last_reward)
    self.memory.add_frame(frame)

    if terminal:
        self.env.reset()
    if self.memory.is_full():
        self.env.reset()
        print("Replay buffer filled")

    self.done = terminal
def action_test(self):
    with torch.no_grad():
        self.update_lstm_state()
        state = torch.from_numpy(self.env.last_state).unsqueeze(0)
        last_action = self.env.last_action
        last_reward = np.clip(self.env.last_reward, -1, 1)
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)
        lar = torch.from_numpy(last_action_reward)
        v, pi, (self.hx, self.cx) = self.model(task_type='a3c',
                                               states=state,
                                               hx=self.hx,
                                               cx=self.cx,
                                               last_action_rewards=lar.unsqueeze(0))
        prob = F.softmax(pi, dim=1)
        # Greedy action at test time.
        action = prob.max(1)[1].data.cpu().numpy()

    state, self.reward, self.done, pixel_change = self.env.step(action[0])
    self.info = 5
    self.state = torch.from_numpy(state).float()
    if self.gpu_id >= 0:
        with torch.cuda.device(self.gpu_id):
            self.state = self.state.cuda()
    self.eps_len += 1
    return self
def fill_experience(self):
    prev_state = self.env.last_state
    last_action = self.env.last_action
    last_reward = self.env.last_reward
    last_action_reward = ExperienceFrame.concat_action_and_reward(
        last_action, self.action_size, last_reward)

    with torch.no_grad():
        state = torch.from_numpy(self.env.last_state['rgb']).unsqueeze(0)
        lar = torch.from_numpy(last_action_reward).unsqueeze(0)
        # Move inputs to the GPU if one is configured.
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                state = state.cuda()
                lar = lar.cuda()
        _, logits, (self.hx, self.cx) = self.model(task_type='a3c',
                                                   states=state,
                                                   hx=self.hx,
                                                   cx=self.cx,
                                                   last_action_reward=lar)
        action_index = self.choose_action(
            pi_values=F.softmax(logits, 1).cpu().numpy()[0])

    obs, reward, terminal, _ = self.env.step(action_index)

    # Store the transition as an ExperienceFrame.
    frame = ExperienceFrame(prev_state['rgb'], reward, action_index, terminal,
                            obs['pixel_change'], last_action, last_reward)
    self.replay_buffer.add_frame(frame)

    if terminal:
        self.env.reset()
    else:
        # Update (detach) the LSTM state so gradients do not flow across steps.
        self.hx = self.hx.detach()
        self.cx = self.cx.detach()
    if self.replay_buffer.is_full():
        self.env.reset()
        print("Replay buffer filled")
def a3c_process(self):
    """
    Run one on-policy rollout (A3C base task).
    :return: batch_a, batch_adv, batch_R, last_action_rewards, states, actions_prob, values
    """
    states = []
    last_action_rewards = []
    actions = []
    rewards = []
    values = []  # V
    actions_prob = []

    terminal_end = False

    # t_max times loop
    for _ in range(self.args.num_steps):
        # Prepare last action reward
        last_action = self.env.last_action
        last_reward = self.env.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        state = torch.from_numpy(self.env.last_state).unsqueeze(0)
        lar = torch.from_numpy(last_action_reward)
        v, pi, (self.hx, self.cx) = self.model(task_type='a3c',
                                               states=state,
                                               hx=self.hx,
                                               cx=self.cx,
                                               last_action_rewards=lar.unsqueeze(0))
        # Greedy action from the policy output.
        action_index = pi.max(1)[1].view(1, 1).item()

        states.append(torch.from_numpy(self.env.last_state))
        actions_prob.append(torch.squeeze(pi, dim=0))
        last_action_rewards.append(lar)
        actions.append(action_index)
        values.append(v)

        prev_state = self.env.last_state
        new_state, reward, terminal, pixel_change = self.env.step(action_index)

        frame = ExperienceFrame(prev_state, reward, action_index, terminal,
                                pixel_change, last_action, last_reward)
        # Store to experience
        self.memory.add_frame(frame)

        # self.episode_reward += reward
        rewards.append(reward)
        self.update_lstm_state()

        if terminal:
            terminal_end = True
            self.env.reset()
            break

    R = torch.zeros(1, 1)
    if not terminal_end:
        # Bootstrap from the value of the last observed state.
        state = torch.from_numpy(new_state).unsqueeze(0)
        lar = torch.from_numpy(frame.get_action_reward(
            self.action_size)).unsqueeze(0)
        value, _, _ = self.model(task_type='a3c',
                                 states=state,
                                 hx=self.hx,
                                 cx=self.cx,
                                 last_action_rewards=lar)
        R = value.data

    # Build the n-step returns and advantages (backwards in time).
    actions.reverse()
    rewards.reverse()
    values.reverse()

    batch_a = []
    batch_adv = []
    batch_R = []

    for (ai, ri, Vi) in zip(actions, rewards, values):
        R = ri + self.args.gamma * R
        adv = R - Vi
        a = np.zeros([self.action_size], dtype=np.float32)
        a[ai] = 1.0

        batch_a.append(torch.from_numpy(a))
        batch_adv.append(adv)
        batch_R.append(R)

    # Restore chronological order.
    batch_a.reverse()
    batch_adv.reverse()
    batch_R.reverse()

    return batch_a, batch_adv, batch_R, last_action_rewards, states, actions_prob, values
def _process_base(self, sess, global_t, summary_writer, summary_op,
                  score_input):
    # [Base A3C]
    states = []
    last_action_rewards = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    start_lstm_state = self.local_network.base_lstm_state_out

    # t_max times loop
    for _ in range(self.local_t_max):
        # Prepare last action reward
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        pi_, value_ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)

        action = self.choose_action(pi_)

        states.append(self.environment.last_state)
        last_action_rewards.append(last_action_reward)
        actions.append(action)
        values.append(value_)

        if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
            print("pi={}".format(pi_))
            print(" V={}".format(value_))

        prev_state = self.environment.last_state

        # Process game
        new_state, reward, terminal, pixel_change = self.environment.process(
            action)
        frame = ExperienceFrame(prev_state, reward, action, terminal,
                                pixel_change, last_action, last_reward)

        # Store to experience
        self.experience.add_frame(frame)

        self.episode_reward += reward
        rewards.append(reward)
        self.local_t += 1

        if terminal:
            terminal_end = True
            print("score={}".format(self.episode_reward))

            self._record_score(sess, summary_writer, summary_op, score_input,
                               self.episode_reward, global_t)

            self.episode_reward = 0
            self.environment.reset()
            self.local_network.reset_state()
            break

    R = 0.0
    if not terminal_end:
        R = self.local_network.run_base_value(
            sess, new_state, frame.get_action_reward(self.action_size))

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_adv = []
    batch_R = []

    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
        R = ri + self.gamma * R
        adv = R - Vi
        a = np.zeros([self.action_size])
        a[ai] = 1.0

        batch_si.append(si)
        batch_a.append(a)
        batch_adv.append(adv)
        batch_R.append(R)

    batch_si.reverse()
    batch_a.reverse()
    batch_adv.reverse()
    batch_R.reverse()

    return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state
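The backward pass over `(actions, rewards, values)` above implements plain n-step returns and advantages. A standalone sketch of the same computation (the function name and arguments are illustrative, not from the original code):

# Equivalent to the backward loop above: `rewards` and `values` are in time
# order, `bootstrap` is V(s_{t+n}), or 0.0 if the rollout hit a terminal state.
def n_step_returns(rewards, values, bootstrap, gamma):
    returns, advantages = [], []
    R = bootstrap
    for r, v in zip(reversed(rewards), reversed(values)):
        R = r + gamma * R          # R <- r_t + gamma * R
        returns.append(R)
        advantages.append(R - v)   # advantage = R - V(s_t)
    returns.reverse()
    advantages.reverse()
    return returns, advantages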
def process_a3c(self):
    rewards = []
    log_probs = []  # per-action log-probabilities (taken action selected later)
    entropies = []
    values = []
    action_one_hot = []
    # adv = []  # advantage estimates (computed later from R - V)
    terminal_end = False  # whether the rollout ended at a terminal state
    episode_score = None  # set only when an episode finishes, for logging

    for t in range(20):
        state = torch.from_numpy(self.env.last_state['rgb']).unsqueeze(dim=0)  # batch = 1
        last_action = self.env.last_action
        last_reward = self.env.last_reward
        last_action_reward = torch.from_numpy(
            ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)).unsqueeze(0)

        # Move inputs to the GPU if one is configured.
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                state = state.cuda()
                last_action_reward = last_action_reward.cuda()

        value, logits, (self.hx, self.cx) = self.model(
            'a3c', state, hx=self.hx, cx=self.cx,
            last_action_reward=last_action_reward)
        prob = F.softmax(logits, dim=1)  # (batch, 6)
        log_prob = torch.log(
            prob.clamp(1e-20, 1.0))  # F.log_softmax(logits, dim=1) gave NaNs
        entropy = -(log_prob * prob).sum(1)  # policy entropy H(pi(.|s_t))

        # Take an action.
        with torch.no_grad():
            action_index = self.choose_action(
                pi_values=F.softmax(logits, 1).cpu().numpy()[0])

        prev_state = self.env.last_state['rgb']
        observation, reward, terminal, _ = self.env.step(action_index)

        # Logging.
        if self.rank == 0 and self.local_t % 100 == 0:
            print("pi={}".format(prob.detach().cpu().numpy()))
            print(" V={}".format(value.detach().cpu().numpy()))
        self.local_t += 1

        # Add to the replay buffer.
        frame = ExperienceFrame(prev_state, reward, action_index, terminal,
                                observation['pixel_change'], last_action,
                                last_reward)
        # Store to experience
        self.replay_buffer.add_frame(frame)

        entropies.append(entropy)
        values.append(value)
        rewards.append(reward)
        log_probs.append(log_prob)
        a = torch.zeros(self.action_size, dtype=torch.float32)
        # a = np.zeros([self.action_size], dtype=np.float32)
        a[action_index] = 1.0
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                a = a.cuda()
        action_one_hot.append(a)

        self.episodic_score += reward

        if terminal:
            print('Score: {0}'.format(self.episodic_score))
            episode_score = self.episodic_score
            self.episodic_score = 0
            terminal_end = True
            self.env.reset()
            self.reset()
            break
        else:
            self.hx = self.hx.detach()
            self.cx = self.cx.detach()

    # Compute the bootstrapped return R.
    R = torch.zeros(1, 1)
    if not terminal_end:
        with torch.no_grad():
            # Bootstrap from the value of the last observed state.
            state = torch.from_numpy(observation['rgb']).unsqueeze(0)
            lar = torch.from_numpy(
                frame.get_action_reward(self.action_size)).unsqueeze(0)
            if self.gpu_id >= 0:
                with torch.cuda.device(self.gpu_id):
                    state = state.cuda()
                    lar = lar.cuda()
            value, _, (_, _) = self.model(task_type='a3c',
                                          states=state,
                                          hx=self.hx,
                                          cx=self.cx,
                                          last_action_reward=lar)
            # This is V(s_t; theta'_v); detach so it does not enter the actor's gradient.
            R = value.detach()
    if self.gpu_id >= 0:
        with torch.cuda.device(self.gpu_id):
            R = R.cuda()
    # values.append(R)  # for bootstrapping

    # Accumulate the losses.
    policy_loss = 0
    value_loss = 0

    # Walk backwards through the rollout.
    for i in reversed(range(len(rewards))):  # i = t-1, ..., t_start
        R = self.args.gamma * R + rewards[i]  # R <- r_t + gamma * R
        # Advantage R - V(s_i; theta'_v); it carries the gradient for theta_v,
        # so it is detached in the policy term below.
        adv = R - values[i]
        # 0.5 * MSE(R, V); effectively half the actor's learning rate.
        value_loss += 0.5 * self.l2_loss(R, values[i])
        log_prob_a = (log_probs[i] * action_one_hot[i]).sum(1)  # log pi(a_i|s_i; theta')
        # Entropy bonus is subtracted (entropy_beta = 0.001) so minimizing the
        # loss maximizes entropy and encourages exploration.
        policy_loss += -log_prob_a * adv.detach() - entropies[i] * 0.001

    return value_loss + policy_loss, episode_score
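For reference, the per-step terms accumulated in process_a3c correspond to the standard A3C objective; the summary below is illustrative and restates the code, not an addition to it:

# Per-step loss terms (R_t is the n-step bootstrapped return, beta = 0.001):
#   value_loss_t  = 0.5 * (R_t - V(s_t))^2
#   policy_loss_t = -log pi(a_t|s_t) * (R_t - V(s_t)).detach() - beta * H(pi(.|s_t))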
def _process_a3c(self, sess):
    states = []
    last_action_rewards = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    start_lstm_state = self.base_lstm_state_out

    for _ in range(LOCAL_T_MAX):
        last_action = self.env.last_action
        last_reward = self.env.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_n, last_reward)
        pi_, value_ = self.run_base_policy_and_value(sess,
                                                     self.env.last_state,
                                                     last_action_reward)
        action = choose_action(pi_)

        states.append(self.env.last_state)
        last_action_rewards.append(last_action_reward)
        actions.append(action)
        values.append(value_)

        prev_state = self.env.last_state

        # Process game
        new_state, reward, terminal, pixel_change = self.env.process(action)
        frame = ExperienceFrame(prev_state, reward, action, terminal,
                                pixel_change, last_action, last_reward)

        # Store to experience
        self.experience.add_frame(frame)

        self.episode_reward += reward
        rewards.append(reward)

        if terminal:
            terminal_end = True
            print("score={}".format(self.episode_reward))
            self.episode_reward = 0
            self.env.reset()
            self.reset_state()
            break

    R = 0.0
    if not terminal_end:
        R = self.run_base_value(sess, new_state,
                                frame.get_last_action_reward(self.action_n))

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_adv = []
    batch_R = []

    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
        R = ri + GAMMA * R
        adv = R - Vi
        a = np.zeros([self.action_n])
        a[ai] = 1.0

        batch_si.append(si)
        batch_a.append(a)
        batch_adv.append(adv)
        batch_R.append(R)

    batch_si.reverse()
    batch_a.reverse()
    batch_adv.reverse()
    batch_R.reverse()

    return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state
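`choose_action` is called above (both as a method and as a free function) but not defined in this section; a minimal sketch, assuming it samples an action index from the policy's probability vector:

import numpy as np

# Hypothetical sketch of choose_action: sample an action index from the
# policy's probability distribution (pi_values sums to 1 over the actions).
def choose_action(pi_values):
    return np.random.choice(len(pi_values), p=pi_values)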