Code example #1
def update_params(batch_mgr, batch_wrk):
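    # two-level variant: a manager (mgr) policy and a worker (wrk) policy are each
    # updated from their own trajectory batch; the manager's actions are the
    # `directions` tensor below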
    states_mgr = torch.from_numpy(np.stack(batch_mgr.state)).to(dtype).to(device)
    directions = torch.from_numpy(np.stack(batch_mgr.action)).to(dtype).to(device)
    rewards_mgr = torch.from_numpy(np.stack(batch_mgr.reward)).to(dtype).to(device)
    masks_mgr = torch.from_numpy(np.stack(batch_mgr.mask)).to(dtype).to(device)

    states_wrk = torch.from_numpy(np.stack(batch_wrk.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch_wrk.action)).to(dtype).to(device)
    rewards_wrk = torch.from_numpy(np.stack(batch_wrk.reward)).to(dtype).to(device)
    masks_wrk = torch.from_numpy(np.stack(batch_wrk.mask)).to(dtype).to(device)

    with torch.no_grad():
        values_mgr = value_mgr(states_mgr)
        values_wrk = value_wrk(states_wrk)

    """get advantage estimation from the trajectories"""
    advantages_mgr, returns_mgr = estimate_advantages(rewards_mgr, masks_mgr, values_mgr, args.gamma, args.tau, device)
    advantages_wrk, returns_wrk = estimate_advantages(rewards_wrk, masks_wrk, values_wrk, args.gamma, args.tau, device)

    """perform TRPO update"""
    trpo_step(policy_mgr, value_mgr, states_mgr, directions, returns_mgr, advantages_mgr, args.max_kl, args.damping, args.l2_reg)

    trpo_step(policy_wrk, value_wrk, states_wrk, actions, returns_wrk, advantages_wrk, args.max_kl, args.damping, args.l2_reg)
Code example #2
File: trpo_gym.py  Project: amoliu/PyTorch-RL-1
def update_params(batch):
    states = Tensor(batch.state)
    actions = ActionTensor(batch.action)
    rewards = Tensor(batch.reward)
    masks = Tensor(batch.mask)
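    # pre-0.4 PyTorch idiom: volatile=True runs the value net without building an
    # autograd graph; the modern equivalent is the torch.no_grad() block in example #3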
    values = value_net(Variable(states, volatile=True)).data
    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values,
                                              args.gamma, args.tau, Tensor)
    """perform TRPO update"""
    trpo_step(policy_net, value_net, states, actions, returns, advantages,
              args.max_kl, args.damping, args.l2_reg)
Code example #3
File: trpo_gym.py  Project: wh-forker/PyTorch-RL
def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    with torch.no_grad():
        values = value_net(states)
    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values,
                                              args.gamma, args.tau, device)
    """perform TRPO update"""
    trpo_step(policy_net, value_net, states, actions, returns, advantages,
              args.max_kl, args.damping, args.l2_reg)
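
Note: every example above delegates the advantage computation to an estimate_advantages helper that is not listed here, and its last argument varies between versions (device here, a Tensor constructor in example #2, a use_gpu flag in example #4). The function below is only a minimal sketch of what such a helper typically computes, namely Generalized Advantage Estimation (GAE) with discount gamma and GAE coefficient tau; it assumes values has shape (N, 1) as returned by value_net, and the actual implementations in these projects may differ in details.

import torch

def estimate_advantages(rewards, masks, values, gamma, tau, device):
    # rewards, masks: shape (N,); values: shape (N, 1); masks[i] == 0 marks
    # the last step of an episode inside the concatenated batch
    rewards, masks, values = rewards.cpu(), masks.cpu(), values.cpu()
    deltas = torch.zeros(rewards.size(0), 1, dtype=values.dtype)
    advantages = torch.zeros(rewards.size(0), 1, dtype=values.dtype)

    prev_value = 0.0
    prev_advantage = 0.0
    for i in reversed(range(rewards.size(0))):
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
        deltas[i] = rewards[i] + gamma * prev_value * masks[i] - values[i]
        # GAE: A_t = delta_t + gamma * tau * mask_t * A_{t+1}
        advantages[i] = deltas[i] + gamma * tau * prev_advantage * masks[i]
        prev_value = values[i, 0]
        prev_advantage = advantages[i, 0]

    # value targets for the critic; normalized advantages for the policy step
    returns = values + advantages
    advantages = (advantages - advantages.mean()) / advantages.std()
    return advantages.to(device), returns.to(device)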
Code example #4
def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state))
    actions = torch.from_numpy(np.stack(batch.action))
    rewards = torch.from_numpy(np.stack(batch.reward))
    masks = torch.from_numpy(np.stack(batch.mask).astype(np.float64))
    if use_gpu:
        states, actions, rewards, masks = states.cuda(), actions.cuda(), rewards.cuda(), masks.cuda()
    values = value_net(Variable(states, volatile=True)).data

    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, use_gpu)

    """perform TRPO update"""
    trpo_step(policy_net, value_net, states, actions, returns, advantages, args.max_kl, args.damping, args.l2_reg)
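
Examples #2 and #4 rely on the pre-0.4 Variable(..., volatile=True) API, while examples #1 and #3 use the current torch.no_grad() context for the same purpose. In each project, update_params is typically driven by an outer sampling loop roughly like the hypothetical sketch below; the agent.collect_samples call and the args fields are assumed names used only for illustration, not taken from the files above.

# hypothetical outer loop; agent.collect_samples and the args fields are assumed names
for i_iter in range(args.max_iter_num):
    # roll out the current policy until at least min_batch_size transitions are collected
    batch, log = agent.collect_samples(args.min_batch_size)
    # one TRPO update of the policy and value networks from the collected batch
    update_params(batch)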