Example #1
env.seed(args.seed)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        # NOTE: this branch still builds a single flat policy and never
        # defines the policy_mgr/policy_wrk pair moved to the device below,
        # so only the continuous branch is usable as written
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_mgr = DiscretePolicy(state_dim, 7)  # manager picks one of 7 subgoals
        policy_wrk = Policy(state_dim + subgoal_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std)
    value_mgr = Value(state_dim)
    value_wrk = Value(state_dim + subgoal_dim)
else:
    # NOTE: the checkpoint is unpickled into policy_net/value_net, which
    # likewise do not match the manager/worker names used below
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

# optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
# optim_policy_w = torch.optim.Adam(policy_wrk.parameters(), lr=0.01)
# optim_value_m = torch.optim.Adam(value_mgr.parameters(), lr=0.01)
# optim_value_w = torch.optim.Adam(value_wrk.parameters(), lr=0.01)
"""create agent"""
agent = Agent(env,
              policy_mgr,
              policy_wrk,
              device,
              running_state=running_state,
              render=args.render,
              num_threads=args.num_threads)
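
To make the manager/worker wiring above concrete: policy_mgr emits one of 7 discrete subgoals, and policy_wrk consumes the state with that subgoal appended (hence the state_dim + subgoal_dim input widths). The sketch below is not part of the original example; the sizes and the random draws are illustrative assumptions standing in for a real observation and a real manager decision.

import torch

state_dim, subgoal_dim = 17, 7                          # illustrative sizes
state = torch.randn(state_dim)                          # stand-in observation
subgoal_idx = torch.randint(subgoal_dim, (1,)).item()   # stand-in manager pick
subgoal = torch.zeros(subgoal_dim)
subgoal[subgoal_idx] = 1.0                              # one-hot encode the subgoal
worker_input = torch.cat([state, subgoal])              # shape: (state_dim + subgoal_dim,)
assert worker_input.shape[0] == state_dim + subgoal_dim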
Example #2
is_disc_action = len(env_dummy.action_space.shape) == 0
# keep running mean/std estimates of the observation and clip the
# normalized values to [-5, 5] (clip=5)
running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim,
                            env_dummy.action_space.shape[0],
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=0.01)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=0.01)
"""create agent"""
agent = Agent(env_factory,
              policy_net,
              device,
              running_state=running_state,
              render=args.render,
              num_threads=args.num_threads)


def update_params(batch):
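    ...  # body truncated in this excerpt; Example #3 below contains a full update_params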
Example #3
def train(**kwargs):
    config = {
        "lr": kwargs['lr'],
        "gamma": kwargs['gamma']
    }
    dtype = torch.float64
    torch.set_default_dtype(dtype)
    device = torch.device('cuda', index=args.gpu_index) if torch.cuda.is_available() else torch.device('cpu')
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu_index)

    """environment"""
    env = gym.make(args.env_name)
    state_dim = env.observation_space.shape[0]
    is_disc_action = len(env.action_space.shape) == 0
    running_state = ZFilter((state_dim,), clip=5)
    # running_reward = ZFilter((1,), demean=False, clip=10)

    """seeding"""
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    env.seed(args.seed)

    # """define actor and critic"""
    if args.model_path is None:
        if is_disc_action:
            policy_net = DiscretePolicy(state_dim, env.action_space.n)
        else:
            policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
        value_net = Value(state_dim)
    else:
        policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
    policy_net.to(device)
    value_net.to(device)

    # optimization epoch number and batch size for PPO
    optim_epochs = 10
    optim_batch_size = 64

    """create agent"""
    agent = Agent(env, policy_net, device, running_state=running_state, render=args.render, num_threads=args.num_threads)
    def update_params(batch, i_iter, config):
        states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)

        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values, config['gamma'], args.tau, device)

        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / optim_batch_size))
        for _ in range(optim_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = torch.LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(i * optim_batch_size, min((i + 1) * optim_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

                ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1, states_b, actions_b, returns_b,
                         advantages_b, fixed_log_probs_b, args.clip_epsilon, args.l2_reg)

    def main_loop(config):
        for i_iter in range(args.max_iter_num):
            """generate multiple trajectories that reach the minimum batch_size"""
            batch, log = agent.collect_samples(args.min_batch_size)
            t0 = time.time()
            update_params(batch, i_iter, config)
            t1 = time.time()

            if i_iter % args.log_interval == 0:
                print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                    i_iter, log['sample_time'], t1-t0, log['min_reward'], log['max_reward'], log['avg_reward']))

            if args.save_model_interval > 0 and (i_iter+1) % args.save_model_interval == 0:
                to_device(torch.device('cpu'), policy_net, value_net)
                pickle.dump((policy_net, value_net, running_state),
                            open(os.path.join(assets_dir(), 'learned_models/{}_ppo.p'.format(args.env_name)), 'wb'))
                to_device(device, policy_net, value_net)

        #     """clean up gpu memory"""
            torch.cuda.empty_cache()
        return agent.evaluate()

    print(config)
    print(args)
    return main_loop(config)
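
A minimal invocation sketch, assuming the module-level args object used throughout has already been populated by the script's argparse setup; 'lr' and 'gamma' are the only keys train reads from kwargs, and the values below are placeholders rather than tuned settings:

if __name__ == '__main__':
    # train() returns whatever agent.evaluate() yields after training
    result = train(lr=3e-4, gamma=0.99)
    print(result)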