Example #1
def main():
    # initialize the game
    env = gym.make('Pendulum-v0').unwrapped
    print(env.observation_space)
    print(env.observation_space.high)
    print(env.observation_space.low)
    print(env.action_space)
    # load hyperparameters
    args = init_hyper_para()
    # randomly initialize the critic network
    state_dim = env.reset().shape[0]
    action_dim = env.action_space.shape[0]
    # if we have a saved model, load it
    if os.path.exists(
            '/home/likang/PycharmProjects/myddpg/bin/Models/critic.ckpt'):
        critic_net = torch.load(
            '/home/likang/PycharmProjects/myddpg/bin/Models/critic.ckpt')
    else:  # initialize the model
        critic_net = net.CriticNetwork(
            state_dim=state_dim, action_dim=action_dim).to(
                device)  # need to init paras according to the gym game
    # randomly initialize the actor network (also called the policy network)
    if os.path.exists(
            '/home/likang/PycharmProjects/myddpg/bin/Models/actor.ckpt'):
        actor_net = torch.load(
            '/home/likang/PycharmProjects/myddpg/bin/Models/actor.ckpt')
    else:
        actor_net = net.ActorNetwork(state_dim=state_dim,
                                     action_dim=action_dim).to(device)
    # initialize the optimizers
    optimizer_critic = opt.Adam(critic_net.parameters(), lr=0.001)
    optimizer_actor = opt.Adam(actor_net.parameters(), lr=0.001)
    # initialize the target critic network as a copy of the critic network
    target_critic_net = copy.deepcopy(critic_net)
    # initialize the target actor network as a copy of the actor network
    target_actor_net = copy.deepcopy(actor_net)
    # init the memory buffer
    memory = Memory(args.capacity)
    # initialize a random process N for action exploration
    ounoise = OUNoise(env.action_space.shape[0])  # init random process
    # enter the training loop
    for ep in range(args.num_ep):
        print(["ep: ", ep])
        # reset random process
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
            0, args.exploration_end -
            ep) / args.exploration_end + args.final_noise_scale
        ounoise.reset()
        # initialize a state s1
        state = env.reset()  # the state is wrapped into a 2-D tensor here
        state = torch.tensor([state], dtype=torch.float32).to(device)
        for t in range(MAX_STEP):
            print(['time step: ', t])
            # select an action according to the actor network (also called the policy network)
            action = actor_net.select_action(state, ounoise)
            # execute the action, observe the new state s_{i+1}
            # and the reward from the environment
            next_state, reward, done, _ = env.step([action.item()])
            # store the transition {s_i, a_i, r_i, s_{i+1}} into memory
            next_state = torch.tensor([next_state],
                                      device=device,
                                      dtype=torch.float32)
            reward = torch.tensor([[reward]],
                                  device=device,
                                  dtype=torch.float32)
            memory.push(state, action, reward, next_state)
            state = next_state
            # print([state, action, reward, next_state])
            del action, reward, next_state
            # get a batch_size transitions.
            # (s_i, a_i, r_i, s_{i+1}) in Algorithm1 of DDPG
            transitions = memory.sample(args.batch_size)
            s1 = torch.cat([tran.state for tran in transitions])
            s2 = torch.cat([tran.next_state for tran in transitions])
            r1 = torch.cat([tran.reward for tran in transitions])
            a1 = torch.cat([tran.action for tran in transitions])
            update_critic_net(s1, s2, r1, a1, target_actor_net,
                              target_critic_net, critic_net, optimizer_critic,
                              args)
            # update actor policy network
            update_actor_net(s1, actor_net, critic_net, optimizer_actor)
            # update target critic network
            # theta^{Q'}, see algorithm1 of DDPG
            for target_param, source_param in zip(
                    target_critic_net.parameters(), critic_net.parameters()):
                target_param.data.copy_(args.tau * source_param +
                                        (1 - args.tau) * target_param)
            # update target actor network
            # theta^{mu'}, see algorithm1 of DDPG
            for target_param, source_param in zip(
                    target_actor_net.parameters(), actor_net.parameters()):
                target_param.data.copy_(args.tau * source_param +
                                        (1 - args.tau) * target_param)
            # show image
            plt.imshow(env.render('rgb_array'))
            time.sleep(0.001)
            # finish
            if done:
                break
            del transitions
        gc.collect()

        if ep % 10 == 0:  # save model
            torch.save(critic_net, './Models/' + 'critic.ckpt')
            torch.save(actor_net, './Models/' + 'actor.ckpt')
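
Example #1 delegates the two gradient steps of the DDPG algorithm to update_critic_net and update_actor_net, which are not shown. The following is only a rough sketch of what such helpers typically look like, not the author's implementation; it assumes args carries a gamma discount factor and that calling the networks directly (critic_net(state, action), actor_net(state)) runs their forward passes:

import torch
import torch.nn.functional as F


def update_critic_net(s1, s2, r1, a1, target_actor_net, target_critic_net,
                      critic_net, optimizer_critic, args):
    # targets y_i = r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1})), computed without gradients
    with torch.no_grad():
        next_actions = target_actor_net(s2)
        y = r1 + args.gamma * target_critic_net(s2, next_actions)
    # minimize the TD error of the online critic
    q_values = critic_net(s1, a1)
    critic_loss = F.mse_loss(q_values, y)
    optimizer_critic.zero_grad()
    critic_loss.backward()
    optimizer_critic.step()


def update_actor_net(s1, actor_net, critic_net, optimizer_actor):
    # deterministic policy gradient: maximize Q(s, mu(s)) by minimizing its negative
    actor_loss = -critic_net(s1, actor_net(s1)).mean()
    optimizer_actor.zero_grad()
    actor_loss.backward()
    optimizer_actor.step()
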
Example #2
def main(cfg):
    random.seed(cfg.exp.seed)
    np.random.seed(cfg.exp.seed)
    torch.manual_seed(cfg.exp.seed)
    torch.backends.cudnn.deterministic = cfg.exp.torch_deterministic

    # so that the environment automatically resets
    env = SyncVectorEnv([
        lambda: RecordEpisodeStatistics(gym.make('CartPole-v1'))
    ])

    actor, critic = Actor(), Critic()
    actor_optim = Adam(actor.parameters(), eps=1e-5, lr=cfg.params.actor_lr)
    critic_optim = Adam(critic.parameters(), eps=1e-5, lr=cfg.params.critic_lr)
    memory = Memory(mini_batch_size=cfg.params.mini_batch_size, batch_size=cfg.params.batch_size)
    obs = env.reset()
    global_rewards = []

    NUM_UPDATES = (cfg.params.total_timesteps // cfg.params.batch_size) * cfg.params.epochs
    cur_timestep = 0

    def calc_factor(cur_timestep: int) -> float:
        """Calculates the factor to be multiplied with the learning rate to update it."""
        update_number = cur_timestep // cfg.params.batch_size
        total_updates = cfg.params.total_timesteps // cfg.params.batch_size
        fraction = 1.0 - update_number / total_updates
        return fraction

    actor_scheduler = LambdaLR(actor_optim, lr_lambda=calc_factor, verbose=True)
    critic_scheduler = LambdaLR(critic_optim, lr_lambda=calc_factor, verbose=True)

    while cur_timestep < cfg.params.total_timesteps:
        # keep playing the game
        obs = torch.as_tensor(obs, dtype=torch.float32)
        with torch.no_grad():
            dist = actor(obs)
            action = dist.sample()
            log_prob = dist.log_prob(action)
            value = critic(obs)
        action = action.cpu().numpy()
        value = value.cpu().numpy()
        log_prob = log_prob.cpu().numpy()
        obs_, reward, done, info = env.step(action)
        
        if done[0]:
            tqdm.write(f'Reward: {info[0]["episode"]["r"]}, Avg Reward: {np.mean(global_rewards[-10:]):.3f}')
            global_rewards.append(info[0]['episode']['r'])
            wandb.log({'Avg_Reward': np.mean(global_rewards[-10:]), 'Reward': info[0]['episode']['r']})

        memory.remember(obs.squeeze(0).cpu().numpy(), action.item(), log_prob.item(), reward.item(), done.item(), value.item())
        obs = obs_
        cur_timestep += 1

        # if the current timestep is a multiple of the batch size, then we need to update the model
        if cur_timestep % cfg.params.batch_size == 0:
            for epoch in tqdm(range(cfg.params.epochs), desc=f'Num updates: {cfg.params.epochs * (cur_timestep // cfg.params.batch_size)} / {NUM_UPDATES}'):
                # sample a batch of experiences from memory
                old_states, old_actions, old_log_probs, old_rewards, old_dones, old_values, batch_indices = memory.sample()
                old_log_probs = torch.tensor(old_log_probs, dtype=torch.float32)
                old_actions = torch.tensor(old_actions, dtype=torch.float32)
                advantage = calculate_advantage(old_rewards, old_values, old_dones, gae_gamma=cfg.params.gae_gamma, gae_lambda=cfg.params.gae_lambda)
                
                advantage = torch.tensor(advantage, dtype=torch.float32)
                old_rewards = torch.tensor(old_rewards, dtype=torch.float32)
                old_values = torch.tensor(old_values, dtype=torch.float32)

                # for each mini-batch of the batch, normalize the advantage and compute the PPO losses
                for mini_batch_index in batch_indices:
                    # remember: Normalization of advantage is done on mini batch, not the entire batch
                    advantage[mini_batch_index] = (advantage[mini_batch_index] - advantage[mini_batch_index].mean()) / (advantage[mini_batch_index].std() + 1e-8)

                    dist = actor(torch.tensor(old_states[mini_batch_index], dtype=torch.float32).unsqueeze(0))
                    # actions = dist.sample()
                    log_probs = dist.log_prob(old_actions[mini_batch_index]).squeeze(0)
                    entropy = dist.entropy().squeeze(0)

                    log_ratio = log_probs - old_log_probs[mini_batch_index]
                    ratio = torch.exp(log_ratio)

                    with torch.no_grad():
                        # approx_kl = ((ratio-1)-log_ratio).mean()
                        approx_kl = ((old_log_probs[mini_batch_index] - log_probs)**2).mean()
                        wandb.log({'Approx_KL': approx_kl})

                    actor_loss = -torch.min(
                        ratio * advantage[mini_batch_index],
                        torch.clamp(ratio, 1 - cfg.params.actor_loss_clip, 1 + cfg.params.actor_loss_clip) * advantage[mini_batch_index]
                    ).mean()

                    values = critic(torch.tensor(old_states[mini_batch_index], dtype=torch.float32).unsqueeze(0)).squeeze(-1)
                    returns = old_values[mini_batch_index] + advantage[mini_batch_index]

                    critic_loss = torch.max(
                        (values - returns)**2,
                        (old_values[mini_batch_index] + torch.clamp(
                            values - old_values[mini_batch_index], -cfg.params.critic_loss_clip, cfg.params.critic_loss_clip
                            ) - returns
                        )**2
                    ).mean()
                    # critic_loss = F.mse_loss(values, returns)

                    wandb.log({'Actor_Loss': actor_loss.item(), 'Critic_Loss': critic_loss.item(), 'Entropy': entropy.mean().item()})
                    loss = actor_loss + 0.25 * critic_loss - 0.01 * entropy.mean()
                    actor_optim.zero_grad()
                    critic_optim.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(actor.parameters(), cfg.params.max_grad_norm)
                    nn.utils.clip_grad_norm_(critic.parameters(), cfg.params.max_grad_norm)

                    actor_optim.step()
                    critic_optim.step()

            memory.reset()
            actor_scheduler.step(cur_timestep)
            critic_scheduler.step(cur_timestep)

            y_pred, y_true = old_values.cpu().numpy(), (old_values + advantage).cpu().numpy()
            var_y = np.var(y_true)
            explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
            wandb.log({'Explained_Var': explained_var})

    if cfg.exp.save_weights:
        torch.save(actor.state_dict(), Path(f'{hydra.utils.get_original_cwd()}/{cfg.exp.model_dir}/actor.pth'))
        torch.save(critic.state_dict(), Path(f'{hydra.utils.get_original_cwd()}/{cfg.exp.model_dir}/critic.pth'))
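
Example #2 relies on a calculate_advantage helper that is not shown. Below is a minimal sketch of Generalized Advantage Estimation matching the keyword arguments used above (gae_gamma, gae_lambda). It assumes rewards, values, and done flags are per-step sequences from a single environment and that the bootstrap value past the end of the batch is zero; it is an illustration, not the author's implementation:

import numpy as np


def calculate_advantage(rewards, values, dones, gae_gamma=0.99, gae_lambda=0.95):
    """Generalized Advantage Estimation: A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}."""
    advantages = np.zeros(len(rewards), dtype=np.float32)
    last_advantage = 0.0
    for t in reversed(range(len(rewards))):
        # bootstrap with the next stored value; zero past the end of the batch or after a terminal step
        next_value = values[t + 1] if t + 1 < len(values) else 0.0
        next_non_terminal = 1.0 - float(dones[t])
        delta = rewards[t] + gae_gamma * next_value * next_non_terminal - values[t]
        last_advantage = delta + gae_gamma * gae_lambda * next_non_terminal * last_advantage
        advantages[t] = last_advantage
    return advantages
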