Example No. 1
def update_params(batch_mgr, batch_wrk):
    states_mgr = torch.from_numpy(np.stack(batch_mgr.state)).to(dtype).to(device)
    subgoals = torch.from_numpy(np.stack(batch_mgr.action)).to(dtype).to(device)
    rewards_mgr = torch.from_numpy(np.stack(batch_mgr.reward)).to(dtype).to(device)
    masks_mgr = torch.from_numpy(np.stack(batch_mgr.mask)).to(dtype).to(device)

    states_wrk = torch.from_numpy(np.stack(batch_wrk.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch_wrk.action)).to(dtype).to(device)
    rewards_wrk = torch.from_numpy(np.stack(batch_wrk.reward)).to(dtype).to(device)
    masks_wrk = torch.from_numpy(np.stack(batch_wrk.mask)).to(dtype).to(device)

    with torch.no_grad():
        values_mgr = value_mgr(states_mgr)
        values_wrk = value_wrk(states_wrk)

    """get advantage estimation from the trajectories"""
    advantages_mgr, returns_mgr = estimate_advantages(rewards_mgr, masks_mgr, values_mgr, args.gamma, args.tau, device)
    advantages_wrk, returns_wrk = estimate_advantages(rewards_wrk, masks_wrk, values_wrk, args.gamma, args.tau, device)

    #print (torch.sum(torch.isnan(advantages_mgr)*1.0), torch.sum(torch.isnan(returns_mgr)*1.0))
    #print (torch.sum(torch.isnan(advantages_wrk)*1.0), torch.sum(torch.isnan(returns_wrk)*1.0))

    """perform TRPO update"""
    policy_loss_m, value_loss_m = a2c_step(policy_mgr, value_mgr, optim_policy_m, optim_value_m, states_mgr, subgoals, returns_mgr, advantages_mgr, args.l2_reg)
    policy_loss_w, value_loss_w = a2c_step(policy_wrk, value_wrk, optim_policy_w, optim_value_w, states_wrk, actions, returns_wrk, advantages_wrk, args.l2_reg)

    return policy_loss_m, policy_loss_w
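
Every example in this listing delegates the actual gradient update to an `a2c_step` helper that is not part of the snippets (Examples 1 to 5 call it with the same positional signature; Example 7 passes an extra agent index). The following is only a minimal sketch of what such a helper could look like, consistent with how it is called here; the `get_log_prob(states, actions)` method on the policy and the returned loss values are assumptions, not code from the original repositories.

import torch

def a2c_step(policy_net, value_net, optimizer_policy, optimizer_value,
             states, actions, returns, advantages, l2_reg):
    # Critic: regress predicted values onto the empirical returns,
    # with an L2 penalty on the value network's weights.
    values_pred = value_net(states)
    value_loss = (values_pred - returns).pow(2).mean()
    for param in value_net.parameters():
        value_loss = value_loss + param.pow(2).sum() * l2_reg
    optimizer_value.zero_grad()
    value_loss.backward()
    optimizer_value.step()

    # Actor: advantage-weighted log-likelihood (vanilla policy gradient).
    log_probs = policy_net.get_log_prob(states, actions)  # assumed method
    policy_loss = -(log_probs * advantages).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()

    return policy_loss.item(), value_loss.item()

In this sketch the critic is fit to the returns produced by `estimate_advantages`, while the policy gradient is weighted by the corresponding advantages; the two loss values match what Example No. 1 unpacks and returns.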
Example No. 2
def update_params(batch):
    states = Tensor(batch.state)
    actions = ActionTensor(batch.action)
    rewards = Tensor(batch.reward)
    masks = Tensor(batch.mask)
    values = value_net(Variable(states, volatile=True)).data
    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values,
                                              args.gamma, args.tau, Tensor)
    """perform TRPO update"""
    a2c_step(policy_net, value_net, optimizer_policy, optimizer_value, states,
             actions, returns, advantages, args.l2_reg)
Example No. 3
def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    with torch.no_grad():
        values = value_net(states)
    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values,
                                              args.gamma, args.tau, device)
    """perform TRPO update"""
    a2c_step(policy_net, value_net, optimizer_policy, optimizer_value, states,
             actions, returns, advantages, args.l2_reg)
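
All of the examples obtain `advantages` and `returns` from an `estimate_advantages` helper parameterized by a discount `gamma` and a GAE coefficient `tau`, but its body is not included in any snippet. The sketch below is one plausible implementation based on Generalized Advantage Estimation, assuming the batch is a flat concatenation of trajectories in which `mask == 0` marks the last step of an episode.

import torch

def estimate_advantages(rewards, masks, values, gamma, tau, device):
    # Generalized Advantage Estimation over a flat batch of concatenated
    # trajectories; masks[i] == 0 marks the final step of an episode.
    batch_size = rewards.size(0)
    rewards = rewards.view(batch_size).cpu()
    masks = masks.view(batch_size).cpu()
    values = values.view(batch_size, 1).cpu()
    deltas = torch.zeros_like(values)
    advantages = torch.zeros_like(values)

    prev_value = 0.0
    prev_advantage = 0.0
    for i in reversed(range(batch_size)):
        # TD residual; the mask zeroes out the bootstrap term at episode ends.
        deltas[i] = rewards[i] + gamma * prev_value * masks[i] - values[i]
        advantages[i] = deltas[i] + gamma * tau * prev_advantage * masks[i]
        prev_value = values[i, 0].item()
        prev_advantage = advantages[i, 0].item()

    returns = values + advantages
    # Normalizing the advantages is a common stabilization choice.
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages.to(device), returns.to(device)

The backward loop accumulates the TD residuals with weight gamma * tau, and the zero entries of `masks` cut the recursion at episode boundaries; the final normalization is optional and not something the calling code requires.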
Example No. 4
def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state))
    actions = torch.from_numpy(np.stack(batch.action))
    rewards = torch.from_numpy(np.stack(batch.reward))
    masks = torch.from_numpy(np.stack(batch.mask).astype(np.float64))
    if use_gpu:
        states, actions, rewards, masks = states.cuda(), actions.cuda(
        ), rewards.cuda(), masks.cuda()
    values = value_net(Variable(states, volatile=True)).data
    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values,
                                              args.gamma, args.tau, use_gpu)
    """perform TRPO update"""
    a2c_step(policy_net, value_net, optimizer_policy, optimizer_value, states,
             actions, returns, advantages, args.l2_reg)
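
Examples 2 and 4 use the pre-0.4 PyTorch idiom `Variable(states, volatile=True)` to run the value network without building an autograd graph. On current PyTorch that flag has been removed; the equivalent, which Examples 3 and 5 already use, is:

with torch.no_grad():
    values = value_net(states)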
Example No. 5
def update_params(batch):

    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)

    with torch.no_grad():
        values = value_net(states)

    advantages, returns = estimate_advantages(rewards, masks, values,
                                              exp_args["config"]["gamma"],
                                              exp_args["config"]["tau"],
                                              device)

    a2c_step(policy_net, value_net, optimizer_policy, optimizer_value, states,
             actions, returns, advantages, exp_args["config"]["l2-reg"])
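
Example No. 5 is the same update as Example No. 3, except that the hyperparameters come from a nested `exp_args` dictionary instead of an argparse namespace. The dictionary itself is not shown; a hypothetical layout with the keys the call expects (values chosen only for illustration) would be:

exp_args = {
    "config": {
        "gamma": 0.99,    # discount factor
        "tau": 0.95,      # GAE coefficient
        "l2-reg": 1e-3,   # L2 penalty on the value network
    }
}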
Example No. 6
def update_params(batch_mgr, batch_wrk):
    states_mgr = torch.from_numpy(np.stack(
        batch_mgr.state)).to(dtype).to(device)
    directions = torch.from_numpy(np.stack(
        batch_mgr.action)).to(dtype).to(device)
    rewards_mgr = torch.from_numpy(np.stack(
        batch_mgr.reward)).to(dtype).to(device)
    masks_mgr = torch.from_numpy(np.stack(batch_mgr.mask)).to(dtype).to(device)

    states_wrk = torch.from_numpy(np.stack(
        batch_wrk.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch_wrk.action)).to(dtype).to(device)
    rewards_wrk = torch.from_numpy(np.stack(
        batch_wrk.reward)).to(dtype).to(device)
    masks_wrk = torch.from_numpy(np.stack(batch_wrk.mask)).to(dtype).to(device)

    with torch.no_grad():
        values_mgr = value_mgr(states_mgr)
        fixed_logprobs_mgr = policy_mgr.get_log_prob(states_mgr, directions)
        values_wrk = value_wrk(states_wrk)
        fixed_logprobs_wrk = policy_wrk.get_log_prob(states_wrk, actions)
    """get advantage estimation from the trajectories"""
    advantages_mgr, returns_mgr = estimate_advantages(rewards_mgr, masks_mgr,
                                                      values_mgr, args.gamma,
                                                      args.tau, device)
    advantages_wrk, returns_wrk = estimate_advantages(rewards_wrk, masks_wrk,
                                                      values_wrk, args.gamma,
                                                      args.tau, device)

    #print (torch.sum(torch.isnan(advantages_mgr)*1.0), torch.sum(torch.isnan(returns_mgr)*1.0))
    #print (torch.sum(torch.isnan(advantages_wrk)*1.0), torch.sum(torch.isnan(returns_wrk)*1.0))
    """perform TRPO update"""
    #policy_loss_m, value_loss_m = a2c_step(policy_mgr, value_mgr, optim_policy_m, optim_value_m, states_mgr, directions, returns_mgr, advantages_mgr, args.l2_reg)
    policy_loss_w, value_loss_w = a2c_step(policy_wrk, value_wrk,
                                           optim_policy_w, optim_value_w,
                                           states_wrk, actions, returns_wrk,
                                           advantages_wrk, args.l2_reg)
    optim_iter_mgr = int(math.ceil(states_mgr.shape[0] / optim_batch_size))
    #optim_iter_wrk = int(math.ceil(states_wrk.shape[0] / optim_batch_size))
    for _ in range(optim_epochs):
        perm_mgr = np.arange(states_mgr.shape[0])
        np.random.shuffle(perm_mgr)
        perm_mgr = LongTensor(perm_mgr).to(device)

        #perm_wrk = np.arange(states_wrk.shape[0])
        #np.random.shuffle(perm_wrk)
        #perm_wrk = LongTensor(perm_wrk).to(device)

        states_mgr, directions, returns_mgr, advantages_mgr, fixed_logprobs_mgr = \
            states_mgr[perm_mgr].clone(), directions[perm_mgr].clone(), returns_mgr[perm_mgr].clone(), advantages_mgr[perm_mgr].clone(), fixed_logprobs_mgr[perm_mgr].clone()
        #states_wrk, actions, returns_wrk, advantages_wrk, fixed_logprobs_wrk = \
        #    states_wrk[perm_wrk].clone(), actions[perm_wrk].clone(), returns_wrk[perm_wrk].clone(), advantages_wrk[perm_wrk].clone(), fixed_logprobs_wrk[perm_wrk].clone()

        for i in range(optim_iter_mgr):
            ind = slice(i * optim_batch_size,
                        min((i + 1) * optim_batch_size, states_mgr.shape[0]))
            states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                states_mgr[ind], directions[ind], advantages_mgr[ind], returns_mgr[ind], fixed_logprobs_mgr[ind]

            ppo_step(policy_mgr, value_mgr, optim_policy_m, optim_value_m, 1,
                     states_b, actions_b, returns_b, advantages_b,
                     fixed_log_probs_b, args.clip_epsilon, args.l2_reg)
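
In Example No. 6 the worker is still trained with `a2c_step`, while the manager is trained with mini-batch `ppo_step` calls over several epochs. `ppo_step` is not included in the snippet; the following is a minimal sketch of a clipped-surrogate PPO update consistent with the call above, where the literal `1` is taken to be the number of value-function update iterations and `get_log_prob` is again an assumed method.

import torch

def ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, optim_value_iternum,
             states, actions, returns, advantages, fixed_log_probs, clip_epsilon, l2_reg):
    # Critic: one or more regression steps towards the empirical returns.
    for _ in range(optim_value_iternum):
        values_pred = value_net(states)
        value_loss = (values_pred - returns).pow(2).mean()
        for param in value_net.parameters():
            value_loss = value_loss + param.pow(2).sum() * l2_reg
        optimizer_value.zero_grad()
        value_loss.backward()
        optimizer_value.step()

    # Actor: PPO clipped surrogate objective against the log-probabilities
    # recorded before the update (fixed_log_probs).
    log_probs = policy_net.get_log_prob(states, actions)  # assumed method
    ratio = torch.exp(log_probs - fixed_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()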
Example No. 7
def update_params(batch):
    for i in range(len(policy_net)):
        policy_net[i].train()
        value_net[i].train()

    # states = torch.from_numpy(np.stack(batch.state))
    # actions = torch.from_numpy(np.stack(batch.action))
    # rewards = torch.from_numpy(np.stack(batch.reward))
    # masks = torch.from_numpy(np.stack(batch.mask).astype(np.float64))
    states = to_tensor_var(batch.state, True, "double").view(-1, agent.n_agents, agent.obs_shape_n[0]).data
    actions = to_tensor_var(batch.action, True, "long").view(-1, agent.n_agents, 1).data
    rewards = to_tensor_var(batch.reward, True, "double").view(-1, agent.n_agents, 1).data
    masks = to_tensor_var(batch.mask, True, "double").view(-1, agent.n_agents, 1).data

    whole_states_var = states.view(-1, agent.whole_critic_state_dim)
    whole_actions_var = actions.view(-1, agent.whole_critic_action_dim)

    # print(whole_states_var, whole_actions_var)

    if use_gpu:
        states, actions, rewards, masks = states.cuda(), actions.cuda(), rewards.cuda(), masks.cuda()
        whole_states_var, whole_actions_var = whole_states_var.cuda(), whole_actions_var.cuda()
    # values = value_net(Variable(whole_states_var, volatile=True)).data
    values = []
    for i in range(len(value_net)):
        # values.append(value_net[i](th.Tensor(whole_states_var)).data)
        # input = Variable(whole_states_var, volatile=True)
        values.append(value_net[i](Variable(whole_states_var)))

    # print(rewards, masks, values)
    # values = to_tensor_var(values,True,"double").view(-1, agent.n_agents, 1).data

    # Transpose from [n_value_nets][batch] to [batch][n_agents].
    values_tmp = [[r[col] for r in values] for col in range(len(values[0]))]
    values = to_tensor_var(values_tmp, True, "double").view(-1, agent.n_agents, 1).data
    if use_gpu:
        values = values.cuda()

    """get advantage estimation from the trajectories"""
    # advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, use_gpu)
    advantages, returns = [], []
    for i in range(len(value_net)):
        adv, ret = estimate_advantages(rewards[:, i, :], masks[:, i, :], values[:, i, :],
                                       args.gamma, args.tau, use_gpu)
        advantages.append(adv)
        returns.append(ret)
    #print(advantages, returns)

    # Transpose the per-agent advantage lists to [batch][n_agents].
    advantages = [[r[col] for r in advantages] for col in range(len(advantages[0]))]
    advantages = to_tensor_var(advantages, True, "double").view(-1, agent.n_agents, 1).data
    if use_gpu:
        advantages = advantages.cuda()

    # Transpose the per-agent return lists to [batch][n_agents].
    returns = [[r[col] for r in returns] for col in range(len(returns[0]))]
    returns = to_tensor_var(returns, True, "double").view(-1, agent.n_agents, 1).data
    if use_gpu:
        returns = returns.cuda()

    # # combine n agent's related advantages together
    # tmp_ary = np.empty_like(advantages[0])
    # for i in range(len(advantages)):
    #     tmp_ary = np.hstack((tmp_ary, advantages[i]))
    # advantages = tmp_ary[:,1:len(value_net)+1]

    # tmp_ary = np.empty_like(returns[0])
    # for i in range(len(returns)):
    #     tmp_ary = np.hstack((tmp_ary, returns[i]))
    # returns = tmp_ary[:,1:len(value_net)+1]

    # advantages = to_tensor_var(advantages, True, "double").view(-1, agent.n_agents, 1).data.cuda()
    # returns = to_tensor_var(returns, True, "double").view(-1, agent.n_agents, 1).data.cuda()

    """perform TRPO update"""
    for i in range(len(value_net)):
        # a2c_step(policy_net[i], value_net[i], optimizer_policy[i], optimizer_value[i], states[:,i,:], actions[:,i,:], returns[:,i,:], advantages[:,i,:], args.l2_reg)
        a2c_step(policy_net[i], value_net[i], optimizer_policy[i], optimizer_value[i], states,
                 actions, returns[:, i, :], advantages[:, i, :], args.l2_reg, i)
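
Example No. 7 (a multi-agent variant with one policy/value network per agent) relies on a `to_tensor_var` helper that is not shown. Judging from how it is called, it converts a batch of Python lists or numpy arrays into a tensor of the requested dtype, optionally on the GPU; a hypothetical version might look like this.

import numpy as np
import torch

def to_tensor_var(x, use_cuda=True, dtype="float"):
    # Hypothetical helper: convert nested lists / numpy arrays to a tensor
    # of the requested dtype, optionally moving it to the GPU.
    torch_dtypes = {"float": torch.float32, "double": torch.float64, "long": torch.int64}
    t = torch.as_tensor(np.asarray(x), dtype=torch_dtypes[dtype])
    return t.cuda() if use_cuda else t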