Code example #1
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

"""define actor and critic"""
if args.linear:
    hidden_size=()
else:
    hidden_size=(64,)

if args.model_path is None:
    # running_state (an observation normalizer) is assumed to be defined earlier in the script
    policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std, hidden_size=hidden_size)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)

"""create agent"""
agent = Agent(env, policy_net, device, running_state=running_state, render=args.render, num_threads=args.num_threads)


def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    with torch.no_grad():
        values = value_net(states)

    """get advantage estimation from the trajectories"""
Code example #2
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_mgr = DiscretePolicy(state_dim, 7)
        policy_wrk = Policy(state_dim + subgoal_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std)
    value_mgr = Value(state_dim)
    value_wrk = Value(state_dim + subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

# optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
# optim_policy_w = torch.optim.Adam(policy_wrk.parameters(), lr=0.01)
# optim_value_m = torch.optim.Adam(value_mgr.parameters(), lr=0.01)
# optim_value_w = torch.optim.Adam(value_wrk.parameters(), lr=0.01)
"""create agent"""
agent = Agent(env,
              policy_mgr,
              policy_wrk,
              device,
              running_state=running_state,
              render=args.render,
              num_threads=args.num_threads)
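How the two policies interact inside Agent is not shown in this excerpt. Purely as an illustration (hierarchical_step, the one-hot subgoal encoding, and the assumption that the networks map a state to subgoal logits / an action are mine, not the project's API), a manager/worker rollout step typically looks like this: the manager samples one of the 7 discrete subgoals from the state, and the worker acts on the state concatenated with a subgoal_dim-sized encoding of that choice.

import torch
import torch.nn.functional as F

def hierarchical_step(policy_mgr, policy_wrk, state, subgoal_dim):
    # Hypothetical rollout step (illustration only, not the project's Agent code).
    # Assumes policy_mgr maps a state to subgoal logits and policy_wrk maps its
    # input to an action (or action-distribution parameters).
    subgoal_logits = policy_mgr(state)
    subgoal = torch.distributions.Categorical(logits=subgoal_logits).sample()
    subgoal_code = F.one_hot(subgoal, num_classes=subgoal_dim).float()
    wrk_input = torch.cat([state, subgoal_code], dim=-1)  # state + subgoal encoding
    action = policy_wrk(wrk_input)
    return subgoal, action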
Code example #3
File: learn_model.py  Project: Quanticnova/td-reg
def learn_model(args):

    print("RL result will be saved at %s" % args.rl_filename)
    print("RL model will be saved at %s" % args.rl_model_filename)
    if use_gpu:
        print("Using CUDA.")

    torch.manual_seed(args.rl_seed)
    if use_gpu:
        torch.cuda.manual_seed_all(args.rl_seed)
        torch.backends.cudnn.deterministic = True
    np.random.seed(args.rl_seed)
    random.seed(args.rl_seed)

    env = gym.make(args.env_name)
    env.seed(args.rl_seed)

    env_test = gym.make(args.env_name)
    env_test.seed(args.rl_seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    a_bound = float(env.action_space.high[0])  # scalar action bound
    a_low = float(env.action_space.low[0])
    assert a_bound == -a_low  # assume a symmetric action range

    ## Binary flag for manually clipping actions in the step function after adding Gaussian noise.
    clip = (args.env_name == "LunarLanderContinuous-v2"
            or args.env_name == "BipedalWalker-v2")

    print(env.observation_space)
    print(env.action_space)
    """define actor and critic"""
    policy_net = Policy(state_dim,
                        action_dim,
                        log_std=args.log_std,
                        a_bound=a_bound,
                        hidden_size=args.hidden_size,
                        activation=args.activation).to(device)
    value_net = Value(state_dim,
                      hidden_size=args.hidden_size,
                      activation=args.activation).to(device)

    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate_v)
    decayed_lambda_td = args.lambda_td

    def update_params_c(batch, i_iter):
        states = torch.from_numpy(np.stack(batch.state)).float().to(device)
        actions = torch.from_numpy(np.stack(batch.action)).float().to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device)
        masks = torch.from_numpy(np.stack(batch.mask).astype(
            np.float32)).to(device)
        """get advantage estimation from the trajectories"""
        values = value_net(states).data
        advantages, lambda_returns, mc_returns = estimate_advantages(
            rewards, masks, values, args.gamma, args.tau)

        if args.lamret:
            returns = lambda_returns
        else:
            returns = mc_returns
        """perform critic update"""
        #gae_step(value_net, optimizer_value, states, lambda_returns, args.l2_reg)  # full batch GD
        gae_step_epoch(value_net, optimizer_value, states, returns,
                       args.l2_reg)  # Stochastic GD

    """ Function to update the parameters of value and policy networks"""

    def update_params_p(batch, i_iter):

        nonlocal decayed_lambda_td

        states = torch.from_numpy(np.stack(batch.state)).float().to(device)
        actions = torch.from_numpy(np.stack(batch.action)).float().to(device)
        next_states = torch.from_numpy(np.stack(
            batch.next_state)).float().to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device)
        masks = torch.from_numpy(np.stack(batch.mask).astype(
            np.float32)).to(device)
        """get advantage estimation from the trajectories, this is done after gae_step update"""
        values = value_net(states).data
        advantages, lambda_returns, mc_returns = estimate_advantages(
            rewards, masks, values, gamma=args.gamma, tau=args.tau)

        if args.method_name == "TRPO-RET-MC":
            returns = mc_returns.detach(
            )  # detach() does not matter since we back prop policy network only.
        elif args.method_name == "TRPO-RET-GAE":
            returns = lambda_returns.detach(
            )  # detach() does not matter actually.
        else:
            returns = 0  # returns is not used for TRPO and TRPO-TD.

        # standardize the advantages (with or without subtracting the mean)
        if args.mgae:
            advantages = (advantages - advantages.mean()) / advantages.std()  # mean-std version
        else:
            advantages = advantages / advantages.std()  # std-only version

        trpo_step_td(policy_net=policy_net, value_net=value_net,
                     states=states, actions=actions, next_states=next_states,
                     rewards=rewards, masks=masks, gamma=args.gamma,
                     advantages=advantages, max_kl=args.max_kl, damping=args.damping,
                     lambda_td=decayed_lambda_td, method_name=args.method_name,
                     returns=returns, mtd=args.mtd)
        """ decay the td_reg parameter after update """
        decayed_lambda_td = decayed_lambda_td * args.decay_td

    """create agent"""
    agent = Agent(env, policy_net, render=False)
    agent_test = Agent(env_test,
                       policy_net,
                       mean_action=True,
                       render=args.render)
    """ The actual learning loop"""
    for i_iter in range(args.rl_max_iter_num):
        """ Save the learned policy model """
        if (i_iter % args.rl_save_model_interval == 0 and args.rl_save_model_interval > 0) \
                or i_iter + 1 == args.rl_max_iter_num or i_iter == 0:

            policy_net = policy_net.to(device_cpu)
            value_net = value_net.to(device_cpu)

            pickle.dump((policy_net, value_net),
                        open(args.rl_model_filename + ("_I%d.p" % (i_iter)),
                             'wb'))

            policy_net = policy_net.to(device)
            value_net = value_net.to(device)
        """ Test the policy before update """
        if i_iter % args.log_interval == 0 or i_iter + 1 == args.rl_max_iter_num:
            _, log_test = agent_test.collect_samples_test(max_num_episodes=20,
                                                          render=args.render,
                                                          clip=clip)
        """generate multiple trajectories that reach the minimum batch_size"""
        t0 = time.time()
        batch, log = agent.collect_samples_train(
            args.min_batch_size, render=False,
            clip=clip)  # this is on-policy samples
        t1 = time.time()
        """ update parameters """
        t0_d = time.time()
        update_params_c(batch, i_iter)  # critic update
        update_params_p(batch, i_iter)  # actor update
        t1_d = time.time()
        """ Print out result to stdout and save it to a text file for later usage"""
        if i_iter % args.log_interval == 0:

            result_text = t_format("Iter %6d (%2.2fs)+(%2.2fs)" %
                                   (i_iter, t1 - t0, t1_d - t0_d))
            result_text += " | [R] " + t_format(
                "Avg: %.2f (%.2f)" % (log['avg_reward'], log['std_reward']), 2)
            result_text += " | [R_test] " + t_format("min: %.2f" % log_test['min_reward'], 1) + t_format("max: %.2f" % log_test['max_reward'], 1) \
                            + t_format("Avg: %.2f (%.2f)" % (log_test['avg_reward'], log_test['std_reward']), 2)
            print(result_text)

            with open(args.rl_filename, 'a') as f:
                print(result_text, file=f)
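For reference, learn_model expects a fully populated args namespace. A hypothetical invocation is sketched below; every name matches an attribute read inside the function, but all values are illustrative, so the project's actual argument parser (names, defaults, and types) should be consulted instead.

from argparse import Namespace

# Illustrative settings only; not the project's defaults.
args = Namespace(
    env_name="HalfCheetah-v2", rl_seed=1, render=False,
    rl_filename="results/halfcheetah_trpo_td.txt",
    rl_model_filename="models/halfcheetah_trpo_td",
    log_std=-0.5, hidden_size=(64, 64), activation="tanh",
    learning_rate_v=3e-4, l2_reg=1e-3,
    gamma=0.99, tau=0.95, lamret=True,
    method_name="TRPO-RET-GAE", lambda_td=0.1, decay_td=0.99, mtd=False, mgae=True,
    max_kl=1e-2, damping=1e-1,
    min_batch_size=2000, rl_max_iter_num=500,
    rl_save_model_interval=100, log_interval=1,
)
learn_model(args)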