Ejemplo n.º 1
0
def train_agent_sac(agent_class, env_name, cwd, net_dim, max_step, max_memo,
                    max_epoch, batch_size, gamma, reward_scale, **_kwargs):
    """Run the SAC-flavoured training loop for an agent on a gym environment.

    The memory is first filled by ``uniform_exploration``.  Each epoch then
    gathers experience via ``agent.update_memory``, optimises via
    ``agent.update_parameter`` and reports the outcome to a ``Recorder``.
    The loop stops after ``max_epoch`` epochs, as soon as
    ``recorder.check_reward`` returns a truthy value, or when the user
    interrupts with Ctrl-C.  In every case the recorder output is saved and
    plotted afterwards.

    Args:
        agent_class: Callable invoked as
            ``agent_class(env, state_dim, action_dim, net_dim)``.
        env_name: Environment id handed to ``gym.make`` (also given to the
            recorder).
        cwd: Passed to ``check_reward``, ``show_and_save`` and
            ``draw_plot_with_npy`` (presumably the output directory).
        net_dim: Forwarded to the agent constructor.
        max_step: Forwarded to the recorder, the exploration helper and the
            agent's update methods.
        max_memo: First argument of ``MemoryArray``.
        max_epoch: Upper bound on the number of training epochs.
        batch_size: Forwarded to ``agent.update_parameter``.
        gamma: Forwarded to exploration and ``agent.update_memory``.
        reward_scale: Forwarded to exploration and ``agent.update_memory``.
        **_kwargs: Accepted and ignored, so callers may pass a larger
            argument dictionary.

    Returns:
        Always ``True``.
    """
    env = gym.make(env_name)
    env_info = get_env_info(env)
    state_dim, action_dim, max_action, target_reward = env_info

    agent = agent_class(env, state_dim, action_dim, net_dim)
    replay = MemoryArray(max_memo, state_dim, action_dim)
    tracker = Recorder(
        agent, max_step, max_action, target_reward, env_name, show_gap=64,
    )

    # Seed the memory before the agent starts learning.
    uniform_exploration(
        env, max_step, max_action, gamma, reward_scale, replay, action_dim,
    )

    try:
        for _ in range(max_epoch):
            # Experience collection needs no gradients (saves GPU memory).
            with torch.no_grad():
                collected_rewards, collected_steps = agent.update_memory(
                    env, replay, max_step, max_action, reward_scale, gamma,
                )

            actor_loss, critic_loss = agent.update_parameter(
                replay, max_step, batch_size,
            )

            # Reporting/evaluation needs no gradients either.
            with torch.no_grad():
                tracker.show_reward(
                    collected_rewards, collected_steps, actor_loss, critic_loss,
                )
                if tracker.check_reward(cwd, actor_loss, critic_loss):
                    break
    except KeyboardInterrupt:
        # A manual stop still falls through to saving and plotting below.
        print("raise KeyboardInterrupt while training.")
    # NOTE: an AssertionError handler (BipedWalker bug, 2020-03-03) was
    # disabled by the original author for this trainer.

    elapsed = tracker.show_and_save(env_name, cwd)

    # No explicit model/memory save here; the original note says the
    # Recorder keeps the max-reward agent.
    draw_plot_with_npy(cwd, elapsed)
    return True
Ejemplo n.º 2
0
def train_agent(agent_class, env_name, cwd, net_dim, max_step, max_memo, max_epoch,
                batch_size, update_gap, gamma, exp_noise, pol_noise, reward_scale,
                **_kwargs):
    """Run the generic training loop for an agent on a gym environment.

    A previously saved model and memory are loaded from ``cwd`` before
    training (``is_save=False``).  Each epoch interacts with the environment
    via ``agent.inactive_in_env``, refreshes the memory indices, updates the
    networks and reports to a ``Recorder``.  After the loop the recorder
    output and the memory are saved and a plot is drawn.

    Args:
        agent_class: Callable invoked as
            ``agent_class(state_dim, action_dim, net_dim)``.
        env_name: Environment id handed to ``gym.make``.
        cwd: Passed to the load/save helpers, ``check_reward`` and the
            plotting helper (presumably the working directory).
        net_dim: Forwarded to the agent constructor.
        max_step: Forwarded to the recorder and ``inactive_in_env``.
        max_memo: First argument of ``Memories``.
        max_epoch: Upper bound on the number of training epochs.
        batch_size, update_gap, gamma, pol_noise: Forwarded to
            ``agent.update_parameter``.
        exp_noise: Forwarded to ``agent.inactive_in_env``.
        reward_scale: Used as ``size`` of ``RewardNormalization``.
        **_kwargs: Accepted and ignored.

    Returns:
        ``False`` if a loss became NaN or an ``AssertionError`` was raised
        during training (nothing is saved in that case); ``True`` otherwise.
    """
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward = get_env_info(env)

    agent = agent_class(state_dim, action_dim, net_dim)
    agent.save_or_load_model(cwd, is_save=False)

    # A falsy ``max_action`` marks a discrete action space, whose action
    # occupies a single memory slot.
    if max_action:
        action_width = action_dim
    else:
        action_width = 1
    # presumably laid out as reward, mask, state, action, next state — confirm
    row_width = 1 + 1 + state_dim + action_width + state_dim
    buffer = Memories(max_memo, memo_dim=row_width)
    buffer.save_or_load_memo(cwd, is_save=False)

    tracker = Recorder(agent, max_step, max_action, target_reward, env_name)
    reward_norm = RewardNormalization(
        n_max=target_reward, n_min=tracker.reward_avg, size=reward_scale,
    )

    try:
        for epoch in range(max_epoch):
            # Interaction needs no gradients (keeps GPU memory low).
            with torch.no_grad():
                rewards, steps = agent.inactive_in_env(
                    env, buffer, max_step, exp_noise, max_action, reward_norm,
                )
                buffer.refresh_indices()

            loss_actor, loss_critic = agent.update_parameter(
                buffer, sum(steps), batch_size, pol_noise, update_gap, gamma,
            )

            diverged = np.isnan(loss_actor) or np.isnan(loss_critic)
            if diverged:
                print("ValueError: loss value should not be 'nan'. Please run again.")
                return False

            # Reporting/evaluation needs no gradients either.
            with torch.no_grad():
                tracker.show_reward(epoch, rewards, steps, loss_actor, loss_critic)
                if tracker.check_reward(cwd, loss_actor, loss_critic):
                    break

    except KeyboardInterrupt:
        # A manual stop still falls through to saving and plotting below.
        print("raise KeyboardInterrupt while training.")
    except AssertionError:  # for BipedWalker BUG 2020-03-03
        print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
        return False

    elapsed = tracker.show_and_save(env_name, cwd)

    # The model is not saved here; the original note says the Recorder keeps
    # the max-reward agent.  Only the memory is persisted.
    buffer.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, elapsed)
    return True
Ejemplo n.º 3
0
def train_agent_ppo(agent_class, env_name, cwd, net_dim, max_step, max_memo, max_epoch,  # env
                    batch_size, gamma,
                    **_kwargs):  # 2020-0430
    """Run the PPO-flavoured training loop for an agent on a gym environment.

    A previously saved model is loaded from ``cwd`` before training
    (``is_save=False``).  Each epoch collects a fresh batch of experience with
    ``agent.inactive_in_env_ppo``, updates the networks with
    ``agent.update_parameter_ppo`` and reports to a ``Recorder``.  The loop
    stops after ``max_epoch`` epochs, when ``recorder.check_reward`` returns a
    truthy value, or on Ctrl-C; the recorder output is then saved and plotted.

    Args:
        agent_class: Callable invoked as
            ``agent_class(state_dim, action_dim, net_dim)``.
        env_name: Environment id handed to ``gym.make``.
        cwd: Passed to the model loader, ``check_reward``, ``show_and_save``
            and the plotting helper (presumably the working directory).
        net_dim: Forwarded to the agent constructor.
        max_step: Forwarded to the recorder and ``inactive_in_env_ppo``.
        max_memo: Forwarded to ``inactive_in_env_ppo``.
        max_epoch: Upper bound on the number of training epochs; also the
            denominator of the ``ep_ratio`` schedule.
        batch_size: Forwarded to ``agent.update_parameter_ppo``.
        gamma: Forwarded to ``agent.update_parameter_ppo``.
        **_kwargs: Accepted and ignored.

    Returns:
        ``False`` if a loss became NaN or an ``AssertionError`` was raised
        during training (nothing is saved in that case); ``True`` otherwise.
    """
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward = get_env_info(env)

    agent = agent_class(state_dim, action_dim, net_dim)
    agent.save_or_load_model(cwd, is_save=False)

    # The same normaliser instance is shared by the recorder and the
    # experience collection.
    state_norm = AutoNormalization((state_dim,), clip=6.0)
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name,
                        state_norm=state_norm)

    try:
        for epoch in range(max_epoch):
            with torch.no_grad():  # collection needs no gradients
                rewards, steps, memory = agent.inactive_in_env_ppo(
                    env, max_step, max_memo, max_action, state_norm)

            # ep_ratio decays linearly from 1 towards 0 over the epochs.
            l_total, l_value = agent.update_parameter_ppo(
                memory, batch_size, gamma, ep_ratio=1 - epoch / max_epoch)

            if np.isnan(l_total) or np.isnan(l_value):
                print("ValueError: loss value should not be 'nan'. Please run again.")
                return False

            with torch.no_grad():  # reporting needs no gradients
                # NOTE(review): the losses are passed as (l_value, l_total),
                # the reverse of the order they are unpacked in above —
                # confirm this matches the Recorder's expected order.
                recorder.show_reward(epoch, rewards, steps, l_value, l_total)
                is_solved = recorder.check_reward(cwd, l_value, l_total)
                if is_solved:
                    # The leftover debug print that used to sit here was
                    # removed; the sibling trainers exit silently as well.
                    break

    except KeyboardInterrupt:
        # A manual stop still falls through to saving and plotting below.
        print("raise KeyboardInterrupt while training.")
    except AssertionError:  # for BipedWalker BUG 2020-03-03
        print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
        return False

    train_time = recorder.show_and_save(env_name, cwd)

    # No explicit model save here; the original note says the Recorder keeps
    # the max-reward agent.
    draw_plot_with_npy(cwd, train_time)
    return True