def draw_quantilles(frame_idx, batch, net, cuda=False, dir='.'):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)
    states_v = Variable(torch.from_numpy(states))
    actions_v = Variable(torch.from_numpy(actions))
    if cuda:
        # .cuda(async=True) is a syntax error on Python 3.7+; use non_blocking
        states_v = states_v.cuda(non_blocking=True)
        actions_v = actions_v.cuda(non_blocking=True)
    quant_v = net(states_v)[range(batch_size), actions_v.data]
    quant = quant_v.data.cpu().numpy()
    for batch_idx in range(batch_size):
        if not dones[batch_idx]:
            continue
        q_val = np.mean(quant[batch_idx])
        suffix = "_%03d_%06d_%d_%.1f_%.4f.png" % (
            batch_idx, frame_idx, int(dones[batch_idx]),
            rewards[batch_idx], q_val)
        plt.clf()
        # plt.subplot(2, 1, 1)
        plt.plot(np.arange(0.0, 1.0, 1 / QUANT_N), quant[batch_idx])
        plt.title("Inv CDF, q_val=%.3f, done=%d, reward=%.1f" % (
            q_val, int(dones[batch_idx]), rewards[batch_idx]))
        # plt.subplot(2, 1, 2)
        # plt.plot(1/np.diff(quant[batch_idx])/QUANT_N)
        # plt.title("Density")
        plt.savefig(os.path.join(dir, "quant" + suffix))
def calc_loss_double_dqn(batch, net, tgt_net, gamma,
                         device="cpu", double=True):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)

    actions_v = actions_v.unsqueeze(-1)
    state_action_vals = net(states_v).gather(1, actions_v)
    state_action_vals = state_action_vals.squeeze(-1)
    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(device)
        if double:
            next_state_acts = net(next_states_v).max(1)[1]
            next_state_acts = next_state_acts.unsqueeze(-1)
            next_state_vals = tgt_net(next_states_v).gather(
                1, next_state_acts).squeeze(-1)
        else:
            next_state_vals = tgt_net(next_states_v).max(1)[0]
        next_state_vals[done_mask] = 0.0
        exp_sa_vals = next_state_vals.detach() * gamma + rewards_v
    return nn.MSELoss()(state_action_vals, exp_sa_vals)
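A loss like calc_loss_double_dqn is normally driven by a short training step built around a replay buffer and a periodically synced target network. The sketch below is an assumption about that surrounding loop (the optimizer, buffer, constants, and sync period are illustrative placeholders, not taken from the snippet above); it only shows where the loss plugs in.

# Hypothetical surrounding training step for calc_loss_double_dqn.
# `net`, `device` and a ptan-style replay buffer are assumed to exist;
# the constants below are illustrative placeholders.
import itertools
import torch
import ptan

GAMMA = 0.99
BATCH_SIZE = 32
REPLAY_INITIAL = 10_000
SYNC_TARGET_FRAMES = 1_000

optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)
tgt_net = ptan.agent.TargetNet(net)   # keeps a frozen copy of net

for frame_idx in itertools.count():
    buffer.populate(1)                # play one step and store the transition
    if len(buffer) < REPLAY_INITIAL:
        continue
    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    loss_v = calc_loss_double_dqn(batch, net, tgt_net.target_model,
                                  gamma=GAMMA, device=device)
    loss_v.backward()
    optimizer.step()
    if frame_idx % SYNC_TARGET_FRAMES == 0:
        tgt_net.sync()                # copy online weights into the target net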
def calc_loss(batch, batch_weights, net, tgt_net, gamma,
              device="cpu", double=True):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    if double:
        next_state_actions = net(next_states_v).max(1)[1]
        next_state_values = tgt_net(next_states_v).gather(
            1, next_state_actions.unsqueeze(-1)).squeeze(-1)
    else:
        next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values.detach() * gamma + rewards_v
    losses_v = batch_weights_v * (state_action_values -
                                  expected_state_action_values) ** 2
    return losses_v.mean(), losses_v + 1e-5
def calc_loss(batch, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)

    # next state distribution
    next_distr_v, next_qvals_v = tgt_net.both(next_states_v)
    next_acts = next_qvals_v.max(1)[1].data.cpu().numpy()
    next_distr = tgt_net.apply_softmax(next_distr_v)
    next_distr = next_distr.data.cpu().numpy()

    next_best_distr = next_distr[range(batch_size), next_acts]
    dones = dones.astype(bool)  # np.bool was removed in NumPy >= 1.24
    proj_distr = dqn_extra.distr_projection(
        next_best_distr, rewards, dones, gamma)

    distr_v = net(states_v)
    sa_vals = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(sa_vals, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    loss_v = -state_log_sm_v * proj_distr_v
    return loss_v.sum(dim=1).mean()
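The categorical (C51) losses here rely on network helpers named both and apply_softmax plus a fixed support of N_ATOMS values between Vmin and Vmax. Below is a minimal sketch of what such an output head could look like, inferred only from how those helpers are called; the class name, shapes, and constants are assumptions, not the repository's actual code.

# Hypothetical categorical (C51) output head; only the methods used by the
# losses above are shown, and the Vmin/Vmax/N_ATOMS values are illustrative.
import torch
import torch.nn as nn
import torch.nn.functional as F

Vmin, Vmax, N_ATOMS = -10, 10, 51
DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1)

class CategoricalHead(nn.Module):
    def __init__(self, feature_dim, n_actions):
        super().__init__()
        self.fc = nn.Linear(feature_dim, n_actions * N_ATOMS)
        self.register_buffer("supports",
                             torch.arange(Vmin, Vmax + DELTA_Z, DELTA_Z))

    def forward(self, x):
        # raw logits over atoms for every action: (batch, n_actions, N_ATOMS)
        return self.fc(x).view(x.size(0), -1, N_ATOMS)

    def apply_softmax(self, t):
        # softmax over the atom dimension; works for 2-D and 3-D inputs
        return F.softmax(t.view(-1, N_ATOMS), dim=1).view(t.size())

    def both(self, x):
        # returns (atom logits, Q-values); Q is the expectation over the support
        distr = self(x)
        probs = self.apply_softmax(distr)
        qvals = (probs * self.supports).sum(dim=2)
        return distr, qvals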
def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v)))
    next_qvals_v = qvals_v[batch_size:]
    distr_v = distr_v[:batch_size]

    next_actions_v = next_qvals_v.max(1)[1]
    next_distr_v = tgt_net(next_states_v)
    next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data]
    next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v)
    next_best_distr = next_best_distr_v.data.cpu().numpy()

    dones = dones.astype(bool)  # np.bool was removed in NumPy >= 1.24
    proj_distr = common.distr_projection(next_best_distr, rewards, dones,
                                         Vmin, Vmax, N_ATOMS, gamma)

    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    # keep the projected distribution on the same device as the net output
    proj_distr_v = torch.tensor(proj_distr).to(device)

    loss_v = -state_log_sm_v * proj_distr_v
    loss_v = batch_weights_v * loss_v.sum(dim=1)
    return loss_v.mean(), loss_v + 1e-5
def calc_loss(batch, batch_weights, net, tgt_net, gamma, cuda=False):
    # Written against the pre-0.4 PyTorch Variable/volatile API.
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = Variable(torch.from_numpy(states))
    next_states_v = Variable(torch.from_numpy(next_states), volatile=True)
    actions_v = Variable(torch.from_numpy(actions))
    rewards_v = Variable(torch.from_numpy(rewards))
    done_mask = torch.ByteTensor(dones)
    batch_weights_v = Variable(torch.from_numpy(batch_weights))
    if cuda:
        # .cuda(async=True) is a syntax error on Python 3.7+; use non_blocking
        states_v = states_v.cuda(non_blocking=True)
        next_states_v = next_states_v.cuda(non_blocking=True)
        actions_v = actions_v.cuda(non_blocking=True)
        rewards_v = rewards_v.cuda(non_blocking=True)
        done_mask = done_mask.cuda(non_blocking=True)
        batch_weights_v = batch_weights_v.cuda(non_blocking=True)

    state_all_action_values = net(states_v)
    state_action_values = state_all_action_values.gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_actions = net(next_states_v).max(1)[1]
    next_state_values = tgt_net(next_states_v).gather(
        1, next_state_actions.unsqueeze(-1)).squeeze(-1)
    next_state_values[done_mask] = 0.0
    next_state_values.volatile = False

    # DQN Loss
    expected_state_action_values = next_state_values * gamma + rewards_v
    dq_losses = nn.SmoothL1Loss(reduce=False)(state_action_values,
                                              expected_state_action_values)
    dq_loss = (batch_weights_v * dq_losses).sum()
    last_dq_losses.append(dq_loss.data.cpu().numpy() / len(batch))
    return dq_loss, (dq_losses.data.abs() + e_prio).cpu().numpy()
def data_func(net, device, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]
    agent = ptan.agent.PolicyAgent(
        lambda x: net(x)[0], device=device, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    micro_batch = []
    for exp in exp_source:
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            data = TotalReward(reward=np.mean(new_rewards))
            train_queue.put(data)

        micro_batch.append(exp)
        if len(micro_batch) < MICRO_BATCH_SIZE:
            continue
        data = common.unpack_batch(
            micro_batch, net, device=device,
            last_val_gamma=GAMMA ** REWARD_STEPS)
        train_queue.put(data)
        micro_batch.clear()
def calc_loss(batch, net, tgt_net, gamma, cuda=False, double=True):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = Variable(torch.from_numpy(states))
    next_states_v = Variable(torch.from_numpy(next_states), volatile=True)
    actions_v = Variable(torch.from_numpy(actions))
    rewards_v = Variable(torch.from_numpy(rewards))
    done_mask = torch.ByteTensor(dones)
    if cuda:
        states_v = states_v.cuda()
        next_states_v = next_states_v.cuda()
        actions_v = actions_v.cuda()
        rewards_v = rewards_v.cuda()
        done_mask = done_mask.cuda()

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    if double:
        next_state_actions = net(next_states_v).max(1)[1]
        next_state_values = tgt_net(next_states_v).gather(
            1, next_state_actions.unsqueeze(-1)).squeeze(-1)
    else:
        next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0
    next_state_values.volatile = False

    expected_state_action_values = next_state_values * gamma + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)
def calc_loss(batch, net, tgt_net, gamma, device="cpu", save_prefix=None):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)

    # next state distribution
    next_distr_v, next_qvals_v = tgt_net.both(next_states_v)
    next_actions = next_qvals_v.max(1)[1].data.cpu().numpy()
    next_distr = tgt_net.apply_softmax(next_distr_v).data.cpu().numpy()

    next_best_distr = next_distr[range(batch_size), next_actions]
    dones = dones.astype(bool)  # np.bool was removed in NumPy >= 1.24

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones,
                                         Vmin, Vmax, N_ATOMS, gamma)

    # calculate net output
    distr_v = net(states_v)
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    if save_prefix is not None:
        pred = F.softmax(state_action_values, dim=1).data.cpu().numpy()
        save_transition_images(batch_size, pred, proj_distr,
                               next_best_distr, dones, rewards, save_prefix)

    loss_v = -state_log_sm_v * proj_distr_v
    return loss_v.sum(dim=1).mean()
def calc_loss(batch, batch_weights, net, tgt_net, gamma, cuda=False):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = Variable(torch.from_numpy(states))
    next_states_v = Variable(torch.from_numpy(next_states), volatile=True)
    actions_v = Variable(torch.from_numpy(actions))
    rewards_v = Variable(torch.from_numpy(rewards))
    done_mask = torch.ByteTensor(dones)
    batch_weights_v = Variable(torch.from_numpy(batch_weights))
    if cuda:
        states_v = states_v.cuda()
        next_states_v = next_states_v.cuda()
        actions_v = actions_v.cuda()
        rewards_v = rewards_v.cuda()
        done_mask = done_mask.cuda()
        batch_weights_v = batch_weights_v.cuda()

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0
    next_state_values.volatile = False

    expected_state_action_values = next_state_values * gamma + rewards_v
    losses_v = batch_weights_v * (state_action_values -
                                  expected_state_action_values) ** 2
    return losses_v.mean(), losses_v + 1e-5
def calc_loss_rainbow(batch, batch_weights, net, tgt_net, gamma,
                      device="cpu", double=True):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    actions_v = actions_v.unsqueeze(-1)
    state_action_values = net(states_v).gather(1, actions_v)
    state_action_values = state_action_values.squeeze(-1)
    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(device)
        if double:
            next_state_actions = net(next_states_v).max(1)[1]
            next_state_actions = next_state_actions.unsqueeze(-1)
            next_state_values = tgt_net(next_states_v).gather(
                1, next_state_actions).squeeze(-1)
        else:
            next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0
        expected_state_action_values = \
            next_state_values.detach() * gamma + rewards_v
    losses_v = (state_action_values - expected_state_action_values) ** 2
    losses_v *= batch_weights_v
    return losses_v.mean(), (losses_v + 1e-5).data.cpu().numpy()
def grads_func(proc_name, net, cuda, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]

    agent = ptan.agent.PolicyAgent(
        lambda x: net(x)[0], cuda=cuda, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    batch = []
    frame_idx = 0
    writer = SummaryWriter(comment=proc_name)

    with common.RewardTracker(writer, stop_reward=REWARD_BOUND) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
            for exp in exp_source:
                frame_idx += 1
                new_rewards = exp_source.pop_total_rewards()
                if new_rewards and tracker.reward(new_rewards[0], frame_idx):
                    break

                batch.append(exp)
                if len(batch) < GRAD_BATCH:
                    continue

                states_v, actions_t, vals_ref_v = \
                    common.unpack_batch(batch, net,
                                        last_val_gamma=GAMMA ** REWARD_STEPS,
                                        cuda=cuda)
                batch.clear()

                net.zero_grad()
                logits_v, value_v = net(states_v)
                loss_value_v = F.mse_loss(value_v, vals_ref_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
                adv_v = vals_ref_v - value_v.detach()
                log_prob_actions_v = adv_v * log_prob_v[range(GRAD_BATCH), actions_t]
                loss_policy_v = -log_prob_actions_v.mean()

                prob_v = F.softmax(logits_v, dim=1)
                entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()

                loss_v = entropy_loss_v + loss_value_v + loss_policy_v
                loss_v.backward()

                tb_tracker.track("advantage", adv_v, frame_idx)
                tb_tracker.track("values", value_v, frame_idx)
                tb_tracker.track("batch_rewards", vals_ref_v, frame_idx)
                tb_tracker.track("loss_entropy", entropy_loss_v, frame_idx)
                tb_tracker.track("loss_policy", loss_policy_v, frame_idx)
                tb_tracker.track("loss_value", loss_value_v, frame_idx)
                tb_tracker.track("loss_total", loss_v, frame_idx)

                # gather gradients
                # clip_grad_norm is deprecated; use the in-place clip_grad_norm_
                nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
                grads = [param.grad.data.cpu().numpy() if param.grad is not None else None
                         for param in net.parameters()]
                train_queue.put(grads)

    train_queue.put(None)
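On the other side of train_queue, a parent process typically accumulates these per-worker gradients and applies them to the shared network. The consumer below is a rough sketch of that pattern; the accumulation period TRAIN_BATCH, the optimizer, and the surrounding setup are assumptions rather than code taken from this repository.

# Hypothetical consumer for the gradients produced by grads_func.
# TRAIN_BATCH, optimizer, net, device and train_queue are assumed to be
# created elsewhere; each worker sends None when it is done.
import torch

step_idx = 0
grad_buffer = None

while True:
    train_entry = train_queue.get()
    if train_entry is None:          # a worker finished; stop training
        break
    step_idx += 1

    if grad_buffer is None:
        grad_buffer = train_entry
    else:
        # sum gradients coming from the workers
        for tgt_grad, grad in zip(grad_buffer, train_entry):
            if grad is not None:
                tgt_grad += grad

    if step_idx % TRAIN_BATCH == 0:
        # copy the accumulated gradients into the shared net and step
        for param, grad in zip(net.parameters(), grad_buffer):
            if grad is not None:
                param.grad = torch.FloatTensor(grad).to(device)
        optimizer.step()
        grad_buffer = None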
def calc_loss_qr(batch, net, tgt_net, gamma, cuda=False):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    done_indices = np.where(dones)[0]
    batch_size = len(batch)

    states_v = Variable(torch.from_numpy(states))
    next_states_v = Variable(torch.from_numpy(next_states), volatile=True)
    actions_v = Variable(torch.from_numpy(actions))
    rewards_v = Variable(torch.from_numpy(rewards))
    # torch.range is deprecated (inclusive end); arange gives the same QUANT_N midpoints
    tau_hat_v = Variable(torch.arange(0.0, 1.0, 1.0 / QUANT_N)) + 0.5 / QUANT_N
    if cuda:
        # .cuda(async=True) is a syntax error on Python 3.7+; use non_blocking
        states_v = states_v.cuda(non_blocking=True)
        next_states_v = next_states_v.cuda(non_blocking=True)
        actions_v = actions_v.cuda(non_blocking=True)
        rewards_v = rewards_v.cuda(non_blocking=True)
        tau_hat_v = tau_hat_v.cuda(non_blocking=True)

    next_quant_v = tgt_net(next_states_v)
    best_actions_v = tgt_net.qvals_from_quant(next_quant_v).max(1)[1]
    best_next_quant_v = next_quant_v[range(batch_size), best_actions_v.data]
    if dones.any():
        done_indices_v = torch.from_numpy(done_indices)
        if cuda:
            done_indices_v = done_indices_v.cuda()
        best_next_quant_v[done_indices_v] = 0.0
    best_next_quant_v.volatile = False
    expected_quant_v = best_next_quant_v * gamma + rewards_v.unsqueeze(-1)

    quant_v = net(states_v)[range(batch_size), actions_v.data]
    _, quant_idx = torch.sort(quant_v, dim=1, descending=False)
    tau = []
    for idx in range(batch_size):
        tau.append(tau_hat_v[quant_idx[idx]])
    tau_hat_v = torch.stack(tau)

    u = expected_quant_v - quant_v
    abs_u = u.abs()
    clamp_u = torch.clamp(abs_u, 0.0, HUBER_K)
    huber_loss = HUBER_K * (abs_u - clamp_u) + 0.5 * clamp_u ** 2
    # mask_small_u = (abs_u <= HUBER_K).float()
    # huber_loss = mask_small_u * 0.5 * (u ** 2)
    # huber_loss = huber_loss + (1 - mask_small_u) * HUBER_K * (abs_u - HUBER_K / 2)
    huber_mul = torch.abs(tau_hat_v - (u < 0).float())
    # huber_mul = tau_hat_v
    # huber_mul = 1
    final_loss = huber_mul * huber_loss
    return final_loss.sum() / QUANT_N
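For reference, the expression assembled at the end of calc_loss_qr (huber_mul * huber_loss) is the quantile Huber loss from QR-DQN: for a TD error u and quantile midpoint \hat\tau,

\rho^{\kappa}_{\hat\tau}(u) = \left|\hat\tau - \mathbf{1}\{u < 0\}\right| \, \mathcal{L}_\kappa(u),
\qquad
\mathcal{L}_\kappa(u) =
\begin{cases}
\tfrac{1}{2} u^2, & |u| \le \kappa,\\[2pt]
\kappa \left(|u| - \tfrac{\kappa}{2}\right), & \text{otherwise,}
\end{cases}

with \kappa = HUBER_K. The clamp-based form in the code, HUBER_K * (|u| - clamp(|u|, 0, HUBER_K)) + 0.5 * clamp(|u|, 0, HUBER_K)^2, reduces exactly to these two cases.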
def calc_loss_dqn(batch, net, tgt_net, gamma, device="cpu", cuda_async=False):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device, non_blocking=cuda_async)
    next_states_v = torch.tensor(next_states).to(device, non_blocking=cuda_async)
    actions_v = torch.tensor(actions).to(device, non_blocking=cuda_async)
    rewards_v = torch.tensor(rewards).to(device, non_blocking=cuda_async)
    done_mask = torch.BoolTensor(dones).to(device, non_blocking=cuda_async)

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values.detach() * gamma + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)
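All of these losses share the common.unpack_batch helper. The sketch below is a plausible reconstruction of the one-argument form, inferred from how its return values are used above (the actual repository version may differ in details such as the dtype of dones): it flattens a list of ptan ExperienceFirstLast transitions into numpy arrays, substituting the current state where last_state is None so the done mask can zero it out later.

# Plausible sketch of common.unpack_batch(batch), assuming ptan-style
# ExperienceFirstLast entries with .state, .action, .reward, .last_state.
import numpy as np

def unpack_batch(batch):
    states, actions, rewards, dones, last_states = [], [], [], [], []
    for exp in batch:
        state = np.asarray(exp.state)
        states.append(state)
        actions.append(exp.action)
        rewards.append(exp.reward)
        dones.append(exp.last_state is None)
        if exp.last_state is None:
            last_states.append(state)                 # placeholder, masked later
        else:
            last_states.append(np.asarray(exp.last_state))
    return (np.asarray(states), np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.asarray(last_states))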
def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values.detach() * gamma + rewards_v
    losses_v = batch_weights_v * (state_action_values -
                                  expected_state_action_values) ** 2
    return losses_v.mean(), losses_v + 1e-5
def calc_loss(batch, batch_weights, net, tgt_net, gamma, cuda=False):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = Variable(torch.from_numpy(states))
    actions_v = Variable(torch.from_numpy(actions))
    next_states_v = Variable(torch.from_numpy(next_states))
    batch_weights_v = Variable(torch.from_numpy(batch_weights))
    if cuda:
        states_v = states_v.cuda()
        actions_v = actions_v.cuda()
        next_states_v = next_states_v.cuda()
        batch_weights_v = batch_weights_v.cuda()

    # next state distribution
    # dueling arch -- actions from main net, distr from tgt_net
    # calc at once both next and cur states
    distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v)))
    next_qvals_v = qvals_v[batch_size:]
    distr_v = distr_v[:batch_size]

    next_actions_v = next_qvals_v.max(1)[1]
    next_distr_v = tgt_net(next_states_v)
    next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data]
    next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v)
    next_best_distr = next_best_distr_v.data.cpu().numpy()

    dones = dones.astype(bool)  # np.bool was removed in NumPy >= 1.24

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones,
                                         Vmin, Vmax, N_ATOMS, gamma)

    # calculate net output
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = Variable(torch.from_numpy(proj_distr))
    if cuda:
        proj_distr_v = proj_distr_v.cuda()

    loss_v = -state_log_sm_v * proj_distr_v
    loss_v = batch_weights_v * loss_v.sum(dim=1)
    return loss_v.mean(), loss_v + 1e-5
def calc_loss(batch, net, tgt_net, gamma, device="cpu", double=True):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    if double:
        next_state_actions = net(next_states_v).max(1)[1]
        next_state_values = tgt_net(next_states_v).gather(
            1, next_state_actions.unsqueeze(-1)).squeeze(-1)
    else:
        next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values.detach() * gamma + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)
def calc_loss(batch, batch_weights, net, target_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    dones_mask_v = torch.ByteTensor(dones).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    predicted_Q_v = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_Q_v = target_net(next_states_v).max(1)[0]
    # use the dones_mask to 0 out values where last_state is none
    next_state_Q_v[dones_mask_v] = 0.0

    expected_Q_v = next_state_Q_v.detach() * gamma + rewards_v
    losses_v = batch_weights_v * (predicted_Q_v - expected_Q_v) ** 2
    return losses_v.mean(), losses_v + 1e-5
def calc_loss_prio(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(device)
        next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0
        expected_state_action_values = next_state_values.detach() * gamma + rewards_v
    losses_v = batch_weights_v * (state_action_values -
                                  expected_state_action_values) ** 2
    return losses_v.mean(), (losses_v + 1e-5).data.cpu().numpy()
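The second value returned by calc_loss_prio is meant to refresh the sample priorities. Below is a minimal sketch of how it is typically wired into one training step with a prioritized replay buffer; the buffer, optimizer, tgt_net, beta, and hyperparameters are assumptions, not part of the snippet above.

# Hypothetical training step around calc_loss_prio, assuming a prioritized
# buffer exposing sample(batch_size, beta) and update_priorities(indices, prios).
batch, batch_indices, batch_weights = buffer.sample(BATCH_SIZE, beta)
optimizer.zero_grad()
loss_v, sample_prios = calc_loss_prio(batch, batch_weights, net,
                                      tgt_net.target_model,
                                      gamma=GAMMA, device=device)
loss_v.backward()
optimizer.step()
# feed the fresh per-sample losses back to the buffer as priorities
buffer.update_priorities(batch_indices, sample_prios)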
def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    actions_v = actions_v.unsqueeze(-1)
    state_action_vals = net(states_v).gather(1, actions_v)
    state_action_vals = state_action_vals.squeeze(-1)
    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(device)
        next_s_vals = tgt_net(next_states_v).max(1)[0]
        next_s_vals[done_mask] = 0.0
        exp_sa_vals = next_s_vals.detach() * gamma + rewards_v
    l = (state_action_vals - exp_sa_vals) ** 2
    losses_v = batch_weights_v * l
    return losses_v.mean(), \
        (losses_v + 1e-5).data.cpu().numpy()
def calc_loss(batch, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)

    # next state distribution
    # dueling arch -- actions from main net, distr from tgt_net
    # calc at once both next and cur states
    distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v)))
    next_qvals_v = qvals_v[batch_size:]
    distr_v = distr_v[:batch_size]

    next_actions_v = next_qvals_v.max(1)[1]
    next_distr_v = tgt_net(next_states_v)
    next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data]
    next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v)
    next_best_distr = next_best_distr_v.data.cpu().numpy()

    dones = dones.astype(bool)  # np.bool was removed in NumPy >= 1.24

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones,
                                         Vmin, Vmax, N_ATOMS, gamma)

    # calculate net output
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    loss_v = -state_log_sm_v * proj_distr_v
    loss_v = loss_v.sum(dim=1)
    return loss_v.mean(), loss_v + 1e-5
def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    # next state distribution
    # dueling arch -- actions from main net, distr from tgt_net
    # calc at once both next and cur states
    distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v)))
    next_qvals_v = qvals_v[batch_size:]
    distr_v = distr_v[:batch_size]

    next_actions_v = next_qvals_v.max(1)[1]
    next_distr_v = tgt_net(next_states_v)
    next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data]
    next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v)
    next_best_distr = next_best_distr_v.data.cpu().numpy()

    dones = dones.astype(bool)  # np.bool was removed in NumPy >= 1.24

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones,
                                         Vmin, Vmax, N_ATOMS, gamma)

    # calculate net output
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    loss_v = -state_log_sm_v * proj_distr_v
    loss_v = batch_weights_v * loss_v.sum(dim=1)
    return loss_v.mean(), loss_v + 1e-5
if step_idx > CUT_DEMO_PROB_FRAMES:
    DEMO_PROB = 0.01

if demo_samples and random.random() < DEMO_PROB:
    random.shuffle(demo_samples)
    demo_batch = demo_samples[:BATCH_SIZE]
    model_vnc.train_demo(net, optimizer, demo_batch, writer, step_idx,
                         preprocessor=preprocessor, device=device)

states_v, actions_t, vals_ref_v = \
    common.unpack_batch(batch, net, last_val_gamma=GAMMA ** REWARD_STEPS,
                        device=device, states_preprocessor=preprocessor)
batch.clear()

optimizer.zero_grad()
logits_v, value_v = net(states_v)

loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

log_prob_v = F.log_softmax(logits_v, dim=1)
adv_v = vals_ref_v - value_v.detach()
log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
loss_policy_v = -log_prob_actions_v.mean()

prob_v = F.softmax(logits_v, dim=1)
entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()
eta_current = buffer.get_eta(ETA_INIT, ETA_FINAL, ETA_BASELINE_EPOCH, ETA_AVG_SIZE)
ck_list = buffer.get_cks(ep_len, eta_current)

for k in range(ep_len):
    c_k = ck_list[k]
    if c_k < C_MIN:
        c_k = C_MIN

    if withPrio:
        batch, batch_indices, batch_weights = buffer.sample(c_k, BATCH_SIZE, beta)
        batch_weights_v = torch.from_numpy(batch_weights).to(device)
    else:
        batch = buffer.sample(c_k, BATCH_SIZE)
        batch_weights_v = torch.from_numpy(np.array(1, dtype=np.float32)).to(device)

    states_v, actions_v, ref_q_v = \
        common.unpack_batch(batch, tgt_twinq_net.target_model, agent,
                            GAMMA ** REWARD_STEPS, device,
                            munchausen=MUNCHAUSEN)
    with torch.no_grad():
        ref_q += ref_q_v.mean()

    # TwinQ
    q1_v, q2_v = twinq_net(states_v, actions_v)
    q1_loss_v = batch_weights_v * (q1_v.squeeze() - ref_q_v.detach()).pow(2)
    q2_loss_v = batch_weights_v * (q2_v.squeeze() - ref_q_v.detach()).pow(2)
    if withPrio:
        sample_prios_v = 0.5 * (q1_loss_v + q2_loss_v) + 1e-5
    q1_loss_v = q1_loss_v.mean()
    q2_loss_v = q2_loss_v.mean()
    with torch.no_grad():
        q1_loss += q1_loss_v
        q2_loss += q2_loss_v
        torch.save(net.state_dict(),
                   './checkpoints/' + args.name + "-best.dat")
    if finished:
        break
    continue

step_idx += 1
# keep receiving data until one batch is full
batch.append(train_entry)
if len(batch) < BATCH_SIZE:
    continue

# When a full batch, perform a policy update
states_v, actions_t, q_vals_v = \
    common.unpack_batch(batch, net,
                        last_val_gamma=GAMMA ** BELLMAN_STEPS,
                        device=device)
batch.clear()

optimizer.zero_grad()
logits_v, value_v = net(states_v)
loss_value_v = F.mse_loss(value_v.squeeze(-1), q_vals_v)

log_prob_v = F.log_softmax(logits_v, dim=1)
adv_v = q_vals_v - value_v.detach()  # calculate advantage = Q(s,a) - V(s)
log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
loss_policy_v = -log_prob_actions_v.mean()

# add an entropy bonus to the loss function, it is negative so will reduce loss
if mean_reward is not None:
    if best_reward is None or mean_reward > best_reward:
        if best_reward is not None:
            name = "best_%.3f_%d" % (mean_reward, step_idx)
            fname = os.path.join(saves_path, name)
            torch.save(net.state_dict(), fname + ".dat")
            preprocessor.save(fname + ".pre")
            print("Best reward updated: %.3f -> %.3f" % (best_reward, mean_reward))
        best_reward = mean_reward

batch.append(exp)
if len(batch) < BATCH_SIZE:
    continue

states_v, actions_t, vals_ref_v = \
    common.unpack_batch(batch, net, last_val_gamma=GAMMA ** REWARD_STEPS,
                        cuda=args.cuda, states_preprocessor=preprocessor)
batch.clear()

optimizer.zero_grad()
logits_v, value_v = net(states_v)

loss_value_v = F.mse_loss(value_v, vals_ref_v)

log_prob_v = F.log_softmax(logits_v, dim=1)
adv_v = vals_ref_v - value_v.detach()
log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
loss_policy_v = -log_prob_actions_v.mean()

prob_v = F.softmax(logits_v, dim=1)
entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()
frame_idx += 1
buffer.populate(1)
epsilon_tracker.frame(frame_idx)

new_rewards = exp_source.pop_total_rewards()
if new_rewards:
    reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon)

# Don't train while filling memory
if len(buffer) < rep_init:
    continue

if frame_idx % UPDATE_FREQ == 0:
    # Generate and unpack batch
    batch = buffer.sample(batch_size)
    s, a, r, t, s2 = common.unpack_batch(batch)
    a_1h = np.zeros((batch_size, env.action_space.n), dtype=np.float32)
    a_1h[np.arange(batch_size), a] = 1.0

    # Train
    sess.run(train_step, feed_dict={state: s, action: a_1h, reward: r,
                                    done: t, state2: s2})

# Copy current network to target network
if frame_idx % (UPDATE_FREQ * tgt_net_sync) == 0:
    sync_nets.run()

# Save current network every 250,000 ATARI frames
if frame_idx % 62500 == 0:
        break
    continue

# the queue item is an exp_source entry, not a reward (the episode has ended)
step_idx += 1
batch.append(train_entry)

# wait until the batch fills up
if len(batch) < BATCH_SIZE:
    continue

# once the batch is full
# gamma**4
states_v, actions_t, vals_ref_v = common.unpack_batch(
    batch, net, last_val_gamma=GAMMA ** REWARD_STEPS, device=device)
batch.clear()

"""Optimization"""
optimizer.zero_grad()
logits_v, value_v = net(states_v)
loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

log_prob_v = F.log_softmax(logits_v, dim=1)
adv_v = vals_ref_v - value_v.detach()
batch.append(exp)
if len(batch) < BATCH_SIZE:
    continue

if step_idx > CUT_DEMO_PROB_FRAMES:
    DEMO_PROB = 0.01

if demo_samples and random.random() < DEMO_PROB:
    random.shuffle(demo_samples)
    demo_batch = demo_samples[:BATCH_SIZE]
    model_vnc.train_demo(net, optimizer, demo_batch, writer, step_idx,
                         preprocessor=ptan.agent.default_states_preprocessor,
                         cuda=args.cuda)

states_v, actions_t, vals_ref_v = \
    common.unpack_batch(batch, net, last_val_gamma=GAMMA ** REWARD_STEPS,
                        cuda=args.cuda)
batch.clear()

optimizer.zero_grad()
logits_v, value_v = net(states_v)

loss_value_v = F.mse_loss(value_v, vals_ref_v)

log_prob_v = F.log_softmax(logits_v, dim=1)
adv_v = vals_ref_v - value_v.detach()
log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
loss_policy_v = -log_prob_actions_v.mean()

prob_v = F.softmax(logits_v, dim=1)
entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()
with common.RewardTracker(writer, stop_reward=REWARD_BOUND) as tracker:
    with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
        while True:
            train_entry = train_queue.get()
            if isinstance(train_entry, TotalReward):
                if tracker.reward(train_entry.reward, step_idx):
                    break
                continue

            step_idx += 1
            batch.append(train_entry)
            if len(batch) < BATCH_SIZE:
                continue

            states_v, actions_t, vals_ref_v = \
                common.unpack_batch(batch, net,
                                    last_val_gamma=GAMMA ** REWARD_STEPS,
                                    device=device)
            batch.clear()

            optimizer.zero_grad()
            logits_v, value_v = net(states_v)

            loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

            log_prob_v = F.log_softmax(logits_v, dim=1)
            adv_v = vals_ref_v - value_v.detach()
            log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
            loss_policy_v = -log_prob_actions_v.mean()

            prob_v = F.softmax(logits_v, dim=1)
            entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()
new_rewards = exp_source.pop_total_rewards()
if new_rewards:
    finished, save_checkpoint = tracker.reward(
        new_rewards[0], step_idx)
    if save_checkpoint:
        torch.save(net.state_dict(),
                   './checkpoints/' + args.name + "-best.dat")
    if finished:
        break

if len(batch) < BATCH_SIZE:
    continue

states_v, actions_t, q_vals_v = common.unpack_batch(
    batch, net, last_val_gamma=GAMMA ** BELLMAN_STEPS, device=device)
batch.clear()

optimizer.zero_grad()
logits_v, value_v = net(states_v)
loss_value_v = F.mse_loss(value_v.squeeze(-1), q_vals_v)

log_prob_v = F.log_softmax(logits_v, dim=1)
adv_v = q_vals_v - value_v.detach()
log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
loss_policy_v = -log_prob_actions_v.mean()

prob_v = F.softmax(logits_v, dim=1)
with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
    while True:
        train_entry = train_queue.get()
        if isinstance(train_entry, TotalReward):
            if tracker.reward(train_entry.reward, step_idx):
                break
            continue

        step_idx += 1
        batch.append(train_entry)
        if len(batch) < BATCH_SIZE:
            continue

        states_v, actions_t, vals_ref_v = \
            common.unpack_batch(batch, net,
                                last_val_gamma=GAMMA ** REWARD_STEPS,
                                device=device)
        batch.clear()

        optimizer.zero_grad()
        logits_v, value_v = net(states_v)

        loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

        log_prob_v = F.log_softmax(logits_v, dim=1)
        adv_v = vals_ref_v - value_v.detach()
        log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
        loss_policy_v = -log_prob_actions_v.mean()

        prob_v = F.softmax(logits_v, dim=1)
        entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()