def train(cfg, envs):
    env = gym.make(cfg.env)  # a single env
    env.seed(10)
    state_dim = envs.observation_space.shape[0]
    action_dim = envs.action_space.n
    model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(cfg.device)
    optimizer = optim.Adam(model.parameters())
    frame_idx = 0
    test_rewards = []
    test_ma_rewards = []
    state = envs.reset()
    while frame_idx < cfg.max_frames:
        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0
        # rollout trajectory
        for _ in range(cfg.n_steps):
            state = torch.FloatTensor(state).to(cfg.device)
            dist, value = model(state)
            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())
            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(cfg.device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(cfg.device))
            state = next_state
            frame_idx += 1
            if frame_idx % 100 == 0:
                test_reward = np.mean([test_env(env, model) for _ in range(10)])
                print(f"frame_idx:{frame_idx}, test_reward:{test_reward}")
                test_rewards.append(test_reward)
                if test_ma_rewards:
                    test_ma_rewards.append(0.9 * test_ma_rewards[-1] + 0.1 * test_reward)
                else:
                    test_ma_rewards.append(test_reward)
                # plot(frame_idx, test_rewards)
        next_state = torch.FloatTensor(next_state).to(cfg.device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks)
        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)
        advantage = returns - values
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()
        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return test_rewards, test_ma_rewards
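# NOTE: `train` above calls a `test_env(env, model)` helper that is defined elsewhere in
# the source. The sketch below is only an assumption about its shape: play one episode in
# the single evaluation env, sampling actions from the policy, and return the undiscounted
# episode reward. It assumes the same old-gym 4-tuple step API used above; the body here
# is illustrative, not the original definition.
def test_env(env, model):
    device = next(model.parameters()).device  # reuse whatever device the model lives on
    state = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        state_t = torch.FloatTensor(state).unsqueeze(0).to(device)  # add a batch dimension
        dist, _ = model(state_t)                 # policy distribution; the value is unused here
        action = dist.sample().cpu().numpy()[0]  # sample a discrete action
        state, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward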
class A2C:
    def __init__(self, state_dim, action_dim, cfg) -> None:
        self.gamma = cfg.gamma
        self.device = cfg.device
        self.model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters())

    def compute_returns(self, next_value, rewards, masks):
        R = next_value
        returns = []
        for step in reversed(range(len(rewards))):
            R = rewards[step] + self.gamma * R * masks[step]
            returns.insert(0, R)
        return returns
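# NOTE: every snippet in this section builds an `ActorCritic` network whose forward pass
# returns (action distribution, state value), but its definition is not part of this
# section. The class below is a minimal sketch consistent with that usage, a Categorical
# policy head plus a scalar value head, and is not necessarily the original implementation.
# The width argument is named `hidden_dim` so it accepts both the positional calls above
# and the `hidden_dim=` keyword call in the variant below.
import torch.nn as nn
from torch.distributions import Categorical

class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim):
        super().__init__()
        self.actor = nn.Sequential(          # policy head: state -> action probabilities
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1),
        )
        self.critic = nn.Sequential(         # value head: state -> V(s)
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, x):
        dist = Categorical(self.actor(x))    # supports sample(), log_prob(), entropy()
        value = self.critic(x)
        return dist, value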
class A2C:
    def __init__(self, state_dim, action_dim, cfg):
        self.gamma = 0.99
        self.model = ActorCritic(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(cfg.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=cfg.lr)

    def choose_action(self, state):
        # sample an action from the policy distribution for the given state tensor
        dist, value = self.model(state)
        action = dist.sample()
        return action

    def compute_returns(self, next_value, rewards, masks):
        R = next_value
        returns = []
        for step in reversed(range(len(rewards))):
            R = rewards[step] + self.gamma * R * masks[step]
            returns.insert(0, R)
        return returns

    def update(self):
        pass  # the update step is left as a placeholder in this variant
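# A quick standalone check of the n-step return recursion used by compute_returns in both
# A2C classes above (R_t = r_t + gamma * R_{t+1} * mask_t, bootstrapped from next_value).
# The numbers are illustrative and not taken from the source.
gamma = 0.99
rewards = [1.0, 1.0, 1.0]
masks = [1.0, 1.0, 0.0]   # the last transition ends the episode, so bootstrapping is cut off
R = 5.0                   # next_value, the critic's estimate for the state after the rollout
returns = []
for step in reversed(range(len(rewards))):
    R = rewards[step] + gamma * R * masks[step]
    returns.insert(0, R)
print(returns)            # approximately [2.9701, 1.99, 1.0]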