def update_params(batch_mgr, batch_wrk):
    # stack the manager trajectories and move them to the training device
    states_mgr = torch.from_numpy(np.stack(batch_mgr.state)).to(dtype).to(device)
    directions = torch.from_numpy(np.stack(batch_mgr.action)).to(dtype).to(device)
    rewards_mgr = torch.from_numpy(np.stack(batch_mgr.reward)).to(dtype).to(device)
    masks_mgr = torch.from_numpy(np.stack(batch_mgr.mask)).to(dtype).to(device)
    # stack the worker trajectories
    states_wrk = torch.from_numpy(np.stack(batch_wrk.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch_wrk.action)).to(dtype).to(device)
    rewards_wrk = torch.from_numpy(np.stack(batch_wrk.reward)).to(dtype).to(device)
    masks_wrk = torch.from_numpy(np.stack(batch_wrk.mask)).to(dtype).to(device)
    with torch.no_grad():
        values_mgr = value_mgr(states_mgr)
        values_wrk = value_wrk(states_wrk)

    """get advantage estimation from the trajectories"""
    advantages_mgr, returns_mgr = estimate_advantages(rewards_mgr, masks_mgr, values_mgr,
                                                      args.gamma, args.tau, device)
    advantages_wrk, returns_wrk = estimate_advantages(rewards_wrk, masks_wrk, values_wrk,
                                                      args.gamma, args.tau, device)

    """perform TRPO update"""
    trpo_step(policy_mgr, value_mgr, states_mgr, directions, returns_mgr, advantages_mgr,
              args.max_kl, args.damping, args.l2_reg)
    trpo_step(policy_wrk, value_wrk, states_wrk, actions, returns_wrk, advantages_wrk,
              args.max_kl, args.damping, args.l2_reg)
def update_params(batch):
    # legacy (pre-0.4) PyTorch API: Tensor/ActionTensor are type aliases defined
    # elsewhere, and a volatile Variable disables gradient tracking
    states = Tensor(batch.state)
    actions = ActionTensor(batch.action)
    rewards = Tensor(batch.reward)
    masks = Tensor(batch.mask)
    values = value_net(Variable(states, volatile=True)).data

    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, Tensor)

    """perform TRPO update"""
    trpo_step(policy_net, value_net, states, actions, returns, advantages, args.max_kl, args.damping, args.l2_reg)
def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    with torch.no_grad():
        values = value_net(states)

    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)

    """perform TRPO update"""
    trpo_step(policy_net, value_net, states, actions, returns, advantages, args.max_kl, args.damping, args.l2_reg)
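# NOTE: estimate_advantages() is defined elsewhere in the repo. The function below
# is only a minimal sketch of a GAE-style (generalized advantage estimation) helper
# matching the call signature used above, under assumed conventions: rewards, masks
# and values are tensors with a shared leading dimension, mask == 0 marks the last
# step of an episode, `gamma` is the discount factor and `tau` is the GAE lambda.
# The name estimate_advantages_sketch and the advantage normalization at the end
# are assumptions, not taken from the snippets in this file.
def estimate_advantages_sketch(rewards, masks, values, gamma, tau, device):
    rewards, masks, values = rewards.to('cpu'), masks.to('cpu'), values.to('cpu')
    advantages = torch.zeros_like(values)
    prev_value, prev_advantage = 0.0, 0.0
    # walk the (possibly multi-episode) trajectory backwards
    for i in reversed(range(rewards.size(0))):
        # one-step TD error; the mask zeroes the bootstrap term at episode ends
        delta = rewards[i] + gamma * prev_value * masks[i] - values[i]
        # GAE recursion: A_t = delta_t + gamma * tau * A_{t+1} (within an episode)
        advantages[i] = delta + gamma * tau * prev_advantage * masks[i]
        prev_value = values[i]
        prev_advantage = advantages[i]
    returns = values + advantages                                   # value-function targets
    advantages = (advantages - advantages.mean()) / advantages.std()  # normalize
    return advantages.to(device), returns.to(device)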
def update_params(batch):
    # older GPU path: tensors are built on CPU and moved with .cuda() when available
    states = torch.from_numpy(np.stack(batch.state))
    actions = torch.from_numpy(np.stack(batch.action))
    rewards = torch.from_numpy(np.stack(batch.reward))
    masks = torch.from_numpy(np.stack(batch.mask).astype(np.float64))
    if use_gpu:
        states, actions, rewards, masks = states.cuda(), actions.cuda(), rewards.cuda(), masks.cuda()
    values = value_net(Variable(states, volatile=True)).data

    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, use_gpu)

    """perform TRPO update"""
    trpo_step(policy_net, value_net, states, actions, returns, advantages, args.max_kl, args.damping, args.l2_reg)
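# NOTE: every update_params() variant above reads batch.state, batch.action,
# batch.reward and batch.mask, i.e. `batch` is a record of per-field sequences.
# The Transition/Memory definitions below are a hypothetical minimal sketch of
# how such a batch could be collected; the actual classes (and any extra fields,
# e.g. next states) live elsewhere in the repo and may differ.
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'mask', 'reward'))

class Memory:
    """Stores transitions and returns them as a single field-wise batch."""
    def __init__(self):
        self.memory = []

    def push(self, *transition_fields):
        # fields must be given in Transition order: state, action, mask, reward
        self.memory.append(Transition(*transition_fields))

    def sample(self):
        # transpose the list of Transitions into one Transition of tuples, so that
        # batch.state is a tuple of states, batch.reward a tuple of rewards, etc.
        return Transition(*zip(*self.memory))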