def update_params(batch_mgr, batch_wrk):
    states_mgr = torch.from_numpy(np.stack(batch_mgr.state)).to(dtype).to(device)
    subgoals = torch.from_numpy(np.stack(batch_mgr.action)).to(dtype).to(device)
    rewards_mgr = torch.from_numpy(np.stack(batch_mgr.reward)).to(dtype).to(device)
    masks_mgr = torch.from_numpy(np.stack(batch_mgr.mask)).to(dtype).to(device)
    states_wrk = torch.from_numpy(np.stack(batch_wrk.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch_wrk.action)).to(dtype).to(device)
    rewards_wrk = torch.from_numpy(np.stack(batch_wrk.reward)).to(dtype).to(device)
    masks_wrk = torch.from_numpy(np.stack(batch_wrk.mask)).to(dtype).to(device)
    with torch.no_grad():
        values_mgr = value_mgr(states_mgr)
        values_wrk = value_wrk(states_wrk)

    """get advantage estimation from the trajectories"""
    advantages_mgr, returns_mgr = estimate_advantages(rewards_mgr, masks_mgr, values_mgr, args.gamma, args.tau, device)
    advantages_wrk, returns_wrk = estimate_advantages(rewards_wrk, masks_wrk, values_wrk, args.gamma, args.tau, device)
    # print(torch.sum(torch.isnan(advantages_mgr)*1.0), torch.sum(torch.isnan(returns_mgr)*1.0))
    # print(torch.sum(torch.isnan(advantages_wrk)*1.0), torch.sum(torch.isnan(returns_wrk)*1.0))

    """perform A2C updates for the manager and worker policies"""
    policy_loss_m, value_loss_m = a2c_step(policy_mgr, value_mgr, optim_policy_m, optim_value_m, states_mgr,
                                           subgoals, returns_mgr, advantages_mgr, args.l2_reg)
    policy_loss_w, value_loss_w = a2c_step(policy_wrk, value_wrk, optim_policy_w, optim_value_w, states_wrk,
                                           actions, returns_wrk, advantages_wrk, args.l2_reg)
    return policy_loss_m, policy_loss_w
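# All of the update_params variants in this file call an estimate_advantages helper that is not
# reproduced here. Below is a minimal GAE(lambda) sketch of what that helper is assumed to compute,
# using the (rewards, masks, values, gamma, tau, device) signature seen at most call sites (some
# variants pass use_gpu or a tensor constructor as the last argument instead); the exact
# normalization and device handling in the original codebase may differ.
import torch


def estimate_advantages(rewards, masks, values, gamma, tau, device):
    # work on flat [T] views so rewards/masks ([T]) and values ([T, 1]) line up
    rewards, masks, values = rewards.view(-1), masks.view(-1), values.view(-1)
    advantages = torch.zeros_like(rewards)
    prev_value, prev_advantage = 0.0, 0.0
    for t in reversed(range(rewards.size(0))):
        # TD residual; masks[t] == 0 at an episode boundary cuts the bootstrap term
        delta = rewards[t] + gamma * prev_value * masks[t] - values[t]
        # GAE recursion: A_t = delta_t + gamma * tau * A_{t+1}
        advantages[t] = delta + gamma * tau * prev_advantage * masks[t]
        prev_value = values[t]
        prev_advantage = advantages[t]
    returns = values + advantages  # regression targets for the critic
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages.unsqueeze(1).to(device), returns.unsqueeze(1).to(device)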
def update_params(batch):
    states = Tensor(batch.state)
    actions = ActionTensor(batch.action)
    rewards = Tensor(batch.reward)
    masks = Tensor(batch.mask)
    # legacy (PyTorch <= 0.3) API: Variable(..., volatile=True) disables gradient tracking
    # for inference; torch.no_grad() is the modern equivalent (see the next variant)
    values = value_net(Variable(states, volatile=True)).data

    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, Tensor)

    """perform A2C update"""
    a2c_step(policy_net, value_net, optimizer_policy, optimizer_value, states, actions, returns, advantages, args.l2_reg)
def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    with torch.no_grad():
        values = value_net(states)

    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)

    """perform A2C update"""
    a2c_step(policy_net, value_net, optimizer_policy, optimizer_value, states, actions, returns, advantages, args.l2_reg)
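# The gradient step itself is delegated to a2c_step, which is also not shown in these snippets.
# The sketch below is an assumption about what such a step does: L2-regularized value regression
# for the critic, then a vanilla policy-gradient step on log_prob * advantage for the actor.
# policy_net.get_log_prob is an assumed interface of the policy class, and the exact signature
# varies slightly across the variants (the multi-agent version at the end passes an extra agent index).
import torch


def a2c_step(policy_net, value_net, optimizer_policy, optimizer_value,
             states, actions, returns, advantages, l2_reg):
    # critic: regress V(s) onto the empirical returns, with L2 weight decay
    values_pred = value_net(states)
    value_loss = (values_pred - returns).pow(2).mean()
    for param in value_net.parameters():
        value_loss = value_loss + param.pow(2).sum() * l2_reg
    optimizer_value.zero_grad()
    value_loss.backward()
    optimizer_value.step()

    # actor: policy gradient weighted by the (already normalized) advantages
    log_probs = policy_net.get_log_prob(states, actions)  # assumed policy interface
    policy_loss = -(log_probs * advantages).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 40)
    optimizer_policy.step()

    return policy_loss.item(), value_loss.item()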
def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state))
    actions = torch.from_numpy(np.stack(batch.action))
    rewards = torch.from_numpy(np.stack(batch.reward))
    masks = torch.from_numpy(np.stack(batch.mask).astype(np.float64))
    if use_gpu:
        states, actions, rewards, masks = states.cuda(), actions.cuda(), rewards.cuda(), masks.cuda()
    values = value_net(Variable(states, volatile=True)).data

    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, use_gpu)

    """perform A2C update"""
    a2c_step(policy_net, value_net, optimizer_policy, optimizer_value, states, actions, returns, advantages, args.l2_reg)
def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    with torch.no_grad():
        values = value_net(states)

    # get advantage estimation from the trajectories
    advantages, returns = estimate_advantages(rewards, masks, values, exp_args["config"]["gamma"],
                                              exp_args["config"]["tau"], device)

    # perform A2C update
    a2c_step(policy_net, value_net, optimizer_policy, optimizer_value, states, actions, returns, advantages,
             exp_args["config"]["l2-reg"])
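# Every variant assumes that batch is a namedtuple of per-field tuples collected during rollouts,
# so that np.stack(batch.state) yields a [T, obs_dim] array. A hypothetical sketch of that rollout
# storage is given below; the names Transition and Memory and the exact field set are assumptions,
# not taken from the original codebase.
from collections import namedtuple

# field names mirror the attributes accessed above: batch.state, batch.action, batch.reward, batch.mask
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'mask'))


class Memory:
    """Accumulates per-step transitions; sample() returns one Transition whose fields are tuples."""

    def __init__(self):
        self.memory = []

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self):
        # zip(*...) turns a list of Transitions into a single Transition of per-field tuples
        return Transition(*zip(*self.memory))

# usage: mask is typically 0 at episode ends; an update is then update_params(memory.sample())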
def update_params(batch_mgr, batch_wrk):
    states_mgr = torch.from_numpy(np.stack(batch_mgr.state)).to(dtype).to(device)
    directions = torch.from_numpy(np.stack(batch_mgr.action)).to(dtype).to(device)
    rewards_mgr = torch.from_numpy(np.stack(batch_mgr.reward)).to(dtype).to(device)
    masks_mgr = torch.from_numpy(np.stack(batch_mgr.mask)).to(dtype).to(device)
    states_wrk = torch.from_numpy(np.stack(batch_wrk.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch_wrk.action)).to(dtype).to(device)
    rewards_wrk = torch.from_numpy(np.stack(batch_wrk.reward)).to(dtype).to(device)
    masks_wrk = torch.from_numpy(np.stack(batch_wrk.mask)).to(dtype).to(device)
    with torch.no_grad():
        values_mgr = value_mgr(states_mgr)
        fixed_logprobs_mgr = policy_mgr.get_log_prob(states_mgr, directions)
        values_wrk = value_wrk(states_wrk)
        fixed_logprobs_wrk = policy_wrk.get_log_prob(states_wrk, actions)

    """get advantage estimation from the trajectories"""
    advantages_mgr, returns_mgr = estimate_advantages(rewards_mgr, masks_mgr, values_mgr, args.gamma, args.tau, device)
    advantages_wrk, returns_wrk = estimate_advantages(rewards_wrk, masks_wrk, values_wrk, args.gamma, args.tau, device)
    # print(torch.sum(torch.isnan(advantages_mgr)*1.0), torch.sum(torch.isnan(returns_mgr)*1.0))
    # print(torch.sum(torch.isnan(advantages_wrk)*1.0), torch.sum(torch.isnan(returns_wrk)*1.0))

    """perform the updates: one A2C step for the worker, minibatch PPO steps for the manager"""
    # policy_loss_m, value_loss_m = a2c_step(policy_mgr, value_mgr, optim_policy_m, optim_value_m, states_mgr, directions, returns_mgr, advantages_mgr, args.l2_reg)
    policy_loss_w, value_loss_w = a2c_step(policy_wrk, value_wrk, optim_policy_w, optim_value_w, states_wrk,
                                           actions, returns_wrk, advantages_wrk, args.l2_reg)

    optim_iter_mgr = int(math.ceil(states_mgr.shape[0] / optim_batch_size))
    # optim_iter_wrk = int(math.ceil(states_wrk.shape[0] / optim_batch_size))
    for _ in range(optim_epochs):
        perm_mgr = np.arange(states_mgr.shape[0])
        np.random.shuffle(perm_mgr)
        perm_mgr = LongTensor(perm_mgr).to(device)
        # perm_wrk = np.arange(states_wrk.shape[0])
        # np.random.shuffle(perm_wrk)
        # perm_wrk = LongTensor(perm_wrk).to(device)
        states_mgr, directions, returns_mgr, advantages_mgr, fixed_logprobs_mgr = \
            states_mgr[perm_mgr].clone(), directions[perm_mgr].clone(), returns_mgr[perm_mgr].clone(), \
            advantages_mgr[perm_mgr].clone(), fixed_logprobs_mgr[perm_mgr].clone()
        # states_wrk, actions, returns_wrk, advantages_wrk, fixed_logprobs_wrk = \
        #     states_wrk[perm_wrk].clone(), actions[perm_wrk].clone(), returns_wrk[perm_wrk].clone(), advantages_wrk[perm_wrk].clone(), fixed_logprobs_wrk[perm_wrk].clone()
        for i in range(optim_iter_mgr):
            ind = slice(i * optim_batch_size, min((i + 1) * optim_batch_size, states_mgr.shape[0]))
            states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                states_mgr[ind], directions[ind], advantages_mgr[ind], returns_mgr[ind], fixed_logprobs_mgr[ind]
            ppo_step(policy_mgr, value_mgr, optim_policy_m, optim_value_m, 1, states_b, actions_b, returns_b,
                     advantages_b, fixed_log_probs_b, args.clip_epsilon, args.l2_reg)
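# The manager update above relies on a ppo_step helper that is not reproduced here. The sketch
# below is a minimal clipped-surrogate PPO step whose parameter list matches the call site
# (the fifth argument is the number of critic regression iterations per minibatch); the original
# implementation may differ in details such as gradient clipping.
import torch


def ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, optim_value_iternum,
             states, actions, returns, advantages, fixed_log_probs, clip_epsilon, l2_reg):
    # critic: a few L2-regularized regression steps on this minibatch
    for _ in range(optim_value_iternum):
        values_pred = value_net(states)
        value_loss = (values_pred - returns).pow(2).mean()
        for param in value_net.parameters():
            value_loss = value_loss + param.pow(2).sum() * l2_reg
        optimizer_value.zero_grad()
        value_loss.backward()
        optimizer_value.step()

    # actor: PPO clipped surrogate against the log-probs recorded at sampling time
    log_probs = policy_net.get_log_prob(states, actions)  # assumed policy interface
    ratio = torch.exp(log_probs - fixed_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()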
def update_params(batch):
    for i in range(len(policy_net)):
        policy_net[i].train()
        value_net[i].train()

    # states = torch.from_numpy(np.stack(batch.state))
    # actions = torch.from_numpy(np.stack(batch.action))
    # rewards = torch.from_numpy(np.stack(batch.reward))
    # masks = torch.from_numpy(np.stack(batch.mask).astype(np.float64))
    states = to_tensor_var(batch.state, True, "double").view(-1, agent.n_agents, agent.obs_shape_n[0]).data
    actions = to_tensor_var(batch.action, True, "long").view(-1, agent.n_agents, 1).data
    rewards = to_tensor_var(batch.reward, True, "double").view(-1, agent.n_agents, 1).data
    masks = to_tensor_var(batch.mask, True, "double").view(-1, agent.n_agents, 1).data

    whole_states_var = states.view(-1, agent.whole_critic_state_dim)
    whole_actions_var = actions.view(-1, agent.whole_critic_action_dim)
    # print(whole_states_var, whole_actions_var)

    if use_gpu:
        states, actions, rewards, masks = states.cuda(), actions.cuda(), rewards.cuda(), masks.cuda()
        whole_states_var, whole_actions_var = whole_states_var.cuda(), whole_actions_var.cuda()

    # values = value_net(Variable(whole_states_var, volatile=True)).data
    values = []
    for i in range(len(value_net)):
        # values.append(value_net[i](th.Tensor(whole_states_var)).data)
        # input = Variable(whole_states_var, volatile=True)
        values.append(value_net[i](Variable(whole_states_var)))
    # print(rewards, masks, values)
    # values = to_tensor_var(values, True, "double").view(-1, agent.n_agents, 1).data
    # transpose: list indexed [agent][sample] -> [sample][agent]
    values_tmp = [[r[col] for r in values] for col in range(len(values[0]))]
    values = to_tensor_var(values_tmp, True, "double").view(-1, agent.n_agents, 1).data.cuda()

    """get advantage estimation from the trajectories"""
    # advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, use_gpu)
    advantages, returns = [], []
    for i in range(len(value_net)):
        adv, ret = estimate_advantages(rewards[:, i, :], masks[:, i, :], values[:, i, :], args.gamma, args.tau, use_gpu)
        advantages.append(adv)
        returns.append(ret)
    # print(advantages, returns)
    # transpose: [agent][sample] -> [sample][agent]
    advantages = [[r[col] for r in advantages] for col in range(len(advantages[0]))]
    advantages = to_tensor_var(advantages, True, "double").view(-1, agent.n_agents, 1).data.cuda()
    # transpose: [agent][sample] -> [sample][agent]
    returns = [[r[col] for r in returns] for col in range(len(returns[0]))]
    returns = to_tensor_var(returns, True, "double").view(-1, agent.n_agents, 1).data.cuda()

    """perform A2C update for each agent"""
    for i in range(len(value_net)):
        # a2c_step(policy_net[i], value_net[i], optimizer_policy[i], optimizer_value[i], states[:, i, :], actions[:, i, :], returns[:, i, :], advantages[:, i, :], args.l2_reg)
        a2c_step(policy_net[i], value_net[i], optimizer_policy[i], optimizer_value[i], states, actions,
                 returns[:, i, :], advantages[:, i, :], args.l2_reg, i)
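# The multi-agent variant above converts batch fields with a to_tensor_var helper that is not
# shown. Below is a hypothetical minimal version, assuming the arguments are (data, use_cuda,
# dtype name) as at the call sites; the helper in the original codebase may differ, e.g. in how
# it handles lists of tensors such as values_tmp.
import numpy as np
import torch


def to_tensor_var(x, use_cuda=True, dtype="float"):
    # map the string dtype names used at the call sites onto torch dtypes
    dtypes = {"float": torch.float32, "double": torch.float64, "long": torch.int64}
    t = torch.as_tensor(np.asarray(x, dtype=np.float64), dtype=dtypes[dtype])
    return t.cuda() if use_cuda else t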