def calc_loss(batch, batch_weights, act_net, crt_net, tgt_act_net, tgt_crt_net, device='cpu'):
    states, actions, rewards, dones_mask, last_states = utils.unpack_batch(batch, device)
    batch_weights = torch.tensor(batch_weights).to(device)

    # critic loss
    crt_distr = crt_net(states, actions)
    last_act = tgt_act_net.target_model(last_states)
    last_distr = F.softmax(tgt_crt_net.target_model(last_states, last_act), dim=1)
    proj_distr = distr_projection(last_distr, rewards, dones_mask,
                                  gamma=GAMMA ** REWARD_STEPS, device=device)
    prob_distr = -F.log_softmax(crt_distr, dim=1) * proj_distr
    critic_loss = prob_distr.sum(dim=1).mean()
    td_errors = prob_distr.sum(dim=1) * batch_weights

    # actor loss
    cur_actions = act_net(states)
    crt_distr = crt_net(states, cur_actions)
    actor_loss = -crt_net.distr_to_q(crt_distr)
    return actor_loss.mean(), critic_loss, td_errors + 1e-5
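# Hedged usage sketch (not from the original source): one D4PG training step that
# consumes the three values returned by calc_loss above. The names `act_opt`, `crt_opt`,
# `buffer` and `batch_indices` are assumptions; `buffer.update_priorities` assumes a
# prioritized replay buffer exposing that method (as ptan's PrioritizedReplayBuffer does).
def d4pg_train_step(batch, batch_indices, batch_weights, act_net, crt_net,
                    tgt_act_net, tgt_crt_net, act_opt, crt_opt, buffer, device='cpu'):
    actor_loss, critic_loss, sample_prios = calc_loss(
        batch, batch_weights, act_net, crt_net, tgt_act_net, tgt_crt_net, device=device)

    # critic update on the cross-entropy between projected and predicted distributions
    crt_opt.zero_grad()
    critic_loss.backward()
    crt_opt.step()

    # actor update: push actions towards higher critic-estimated Q
    act_opt.zero_grad()
    actor_loss.backward()
    act_opt.step()

    # per-sample priorities go back into the replay buffer
    buffer.update_priorities(batch_indices, sample_prios.data.cpu().numpy())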
def calc_loss_prio(batch, batch_weights, _net, _target_net, gamma, _device="cpu"):
    states, actions, rewards, dones, next_states = utils.unpack_batch(batch)

    states_v = torch.tensor(states).to(_device)
    actions_v = torch.tensor(actions).to(_device)
    rewards_v = torch.tensor(rewards).to(_device)
    done_mask = torch.BoolTensor(dones).to(_device)
    batch_weights_v = torch.tensor(batch_weights).to(_device)

    state_action_values = _net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(_device)
        next_state_values = _target_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0
        expected_state_action_values = next_state_values.detach() * gamma + rewards_v
    losses_v = batch_weights_v * (state_action_values - expected_state_action_values) ** 2
    return losses_v.mean(), (losses_v + 1e-5).data.cpu().numpy()
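# Hedged usage sketch (not part of the original source): how calc_loss_prio's second
# return value is typically fed back into a prioritized replay buffer. The names
# `buffer`, `optimizer` and the `sample`/`update_priorities` methods are assumptions
# modelled on ptan.experience.PrioritizedReplayBuffer; adjust to your buffer's API.
def prio_train_step(buffer, batch_size, beta, net, tgt_net, optimizer, gamma, device="cpu"):
    batch, batch_indices, batch_weights = buffer.sample(batch_size, beta)
    optimizer.zero_grad()
    loss_v, sample_prios = calc_loss_prio(batch, batch_weights, net, tgt_net,
                                          gamma, _device=device)
    loss_v.backward()
    optimizer.step()
    # priorities are |TD-error|-like values; the + 1e-5 above keeps them strictly positive
    buffer.update_priorities(batch_indices, sample_prios)
    return loss_v.item()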
def data_func(_net, _device, _train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]
    agent = ptan.agent.PolicyAgent(lambda x: _net(x)[0],
                                   device=_device, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    micro_batch = []
    for exp in exp_source:
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            data = TotalReward(reward=np.mean(new_rewards))
            _train_queue.put(data)

        micro_batch.append(exp)
        if len(micro_batch) < MICRO_BATCH_SIZE:
            continue

        data = utils.unpack_batch(micro_batch, _net, _device=_device,
                                  last_val_gamma=GAMMA ** REWARD_STEPS)
        _train_queue.put(data)
        micro_batch.clear()
def calc_loss(batch, _net, _target_net, gamma, _device="cpu"):
    states, actions, rewards, dones, next_states = utils.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(_device)
    actions_v = torch.tensor(actions).to(_device)
    next_states_v = torch.tensor(next_states).to(_device)

    # next-state distribution: pick the best action by Q-value,
    # then take its probability distribution from the target network
    next_distr_v, next_qvals_v = _target_net.both(next_states_v)
    next_acts = next_qvals_v.max(1)[1].data.cpu().numpy()
    next_distr = _target_net.apply_softmax(next_distr_v)
    next_distr = next_distr.data.cpu().numpy()
    next_best_distr = next_distr[range(batch_size), next_acts]

    dones = dones.astype(bool)
    proj_distr = dqn_extra.distr_projection(next_best_distr, rewards, dones, gamma)

    distr_v = _net(states_v)
    sa_vals = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(sa_vals, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(_device)

    loss_v = -state_log_sm_v * proj_distr_v
    return loss_v.sum(dim=1).mean()
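# Hedged sketch (not from the original source) of the categorical (C51) projection that
# dqn_extra.distr_projection is assumed to perform: every atom of the next-state
# distribution is shifted by the Bellman update r + gamma * z_j, clipped to the support,
# and its probability mass split between the two nearest fixed atoms. The defaults for
# n_atoms, v_min and v_max are the usual Atari C51 values and are assumptions here.
import numpy as np

def distr_projection_sketch(next_distr, rewards, dones, gamma,
                            n_atoms=51, v_min=-10.0, v_max=10.0):
    batch_size = len(rewards)
    proj_distr = np.zeros((batch_size, n_atoms), dtype=np.float32)
    delta_z = (v_max - v_min) / (n_atoms - 1)
    for atom in range(n_atoms):
        # Bellman-updated atom value, clipped to the support
        tz_j = np.clip(rewards + (v_min + atom * delta_z) * gamma, v_min, v_max)
        b_j = (tz_j - v_min) / delta_z
        low = np.floor(b_j).astype(np.int64)
        up = np.ceil(b_j).astype(np.int64)
        eq_mask = up == low
        proj_distr[eq_mask, low[eq_mask]] += next_distr[eq_mask, atom]
        ne_mask = up != low
        proj_distr[ne_mask, low[ne_mask]] += next_distr[ne_mask, atom] * (up - b_j)[ne_mask]
        proj_distr[ne_mask, up[ne_mask]] += next_distr[ne_mask, atom] * (b_j - low)[ne_mask]
    if dones.any():
        # terminal transitions collapse to a (possibly split) delta at the clipped reward
        proj_distr[dones] = 0.0
        tz_j = np.clip(rewards[dones], v_min, v_max)
        b_j = (tz_j - v_min) / delta_z
        low = np.floor(b_j).astype(np.int64)
        up = np.ceil(b_j).astype(np.int64)
        eq_mask = up == low
        eq_dones = dones.copy()
        eq_dones[dones] = eq_mask
        if eq_dones.any():
            proj_distr[eq_dones, low[eq_mask]] = 1.0
        ne_mask = up != low
        ne_dones = dones.copy()
        ne_dones[dones] = ne_mask
        if ne_dones.any():
            proj_distr[ne_dones, low[ne_mask]] = (up - b_j)[ne_mask]
            proj_distr[ne_dones, up[ne_mask]] = (b_j - low)[ne_mask]
    return proj_distr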
def calc_loss_double_dqn(batch, _net, _target_net, gamma, _device="cpu", double=True):
    states, actions, rewards, dones, next_states = utils.unpack_batch(batch)

    states_v = torch.tensor(states).to(_device)
    actions_v = torch.tensor(actions).to(_device)
    rewards_v = torch.tensor(rewards).to(_device)
    done_mask = torch.BoolTensor(dones).to(_device)

    actions_v = actions_v.unsqueeze(-1)
    state_action_vals = _net(states_v).gather(1, actions_v)
    state_action_vals = state_action_vals.squeeze(-1)
    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(_device)
        if double:
            # double DQN: select the action with the online network,
            # evaluate it with the target network
            next_state_acts = _net(next_states_v).max(1)[1]
            next_state_acts = next_state_acts.unsqueeze(-1)
            next_state_vals = _target_net(next_states_v).gather(
                1, next_state_acts).squeeze(-1)
        else:
            next_state_vals = _target_net(next_states_v).max(1)[0]
        next_state_vals[done_mask] = 0.0
        exp_sa_vals = next_state_vals.detach() * gamma + rewards_v
    return nn.MSELoss()(state_action_vals, exp_sa_vals)
def calc_double_dqn_loss(batch, net, tgt_net, gamma, device='cpu'):
    """
    Double DQN loss, following the DeepMind paper
    "Deep Reinforcement Learning with Double Q-Learning"
    (van Hasselt, Guez, and Silver, 2015).
    """
    states, actions, rewards, dones, last_states = utils.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    q_state_action_v = net(states_v)[range(len(actions)), actions]
    with torch.no_grad():
        last_states_v = torch.tensor(last_states).to(device)
        next_actions_v = net(last_states_v).argmax(dim=1)
        next_q_state_action_v = tgt_net.target_model(last_states_v)[
            range(len(actions)), next_actions_v]
        next_q_state_action_v[dones] = 0.0
    exp_state_action_v = rewards_v + gamma * next_q_state_action_v
    return F.mse_loss(q_state_action_v, exp_state_action_v)
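# Hedged usage sketch (not from the original source): a single optimization step built
# around calc_double_dqn_loss above, with periodic target-network sync. `buffer`,
# `optimizer`, `tgt_net.sync()` and `sync_every` are assumptions for illustration
# (ptan's TargetNet wrapper exposes sync(); substitute your own helper if it differs).
def dqn_train_step(frame_idx, buffer, batch_size, net, tgt_net, optimizer,
                   gamma, sync_every=1000, device='cpu'):
    batch = buffer.sample(batch_size)
    optimizer.zero_grad()
    loss_v = calc_double_dqn_loss(batch, net, tgt_net, gamma, device=device)
    loss_v.backward()
    optimizer.step()
    if frame_idx % sync_every == 0:
        tgt_net.sync()  # copy the online network's weights into the target network
    return loss_v.item()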
def main():
    # some setup
    mp.set_start_method('spawn')
    # gym.logger.set_level(40)

    # writer
    timestr = time.strftime("%Y%m%d-%H%M%S")
    if LOAD_MODEL:
        name = f'runs/{NAME}_a3c_continued_{timestr}'
    else:
        name = f'runs/{NAME}_a3c_{timestr}'
    writer = SummaryWriter(name)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Using:', device)

    env = make_env()
    obs_shape = env.observation_space.shape
    print('Observation shape:', obs_shape)
    act_space = env.action_space.n
    print('Action space:', act_space)

    if LOAD_MODEL:
        net = torch.load(LOAD_MODEL)
        print('Model loaded from:', LOAD_MODEL)
    else:
        net = ModelA3C(obs_shape, act_space)
    net = net.to(device)
    env.close()  # our env creates new actors that we don't need, so close it here
    net.share_memory()  # enabled by default for CUDA, but needs to be enabled explicitly for CPU

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)

    train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
    data_proc_list = []
    for _ in range(PROCESSES_COUNT):
        data_proc = mp.Process(target=data_func, args=(net, device, train_queue))
        data_proc.start()
        data_proc_list.append(data_proc)

    batch = []
    time_step = 0

    # add current hyperparameters to TensorBoard
    hparams = {
        'gamma': GAMMA,
        'lr': LEARNING_RATE,
        'entropy_beta': ENTROPY_BETA,
        'batch_size': BATCH_SIZE,
        'steps_count': STEPS_COUNT
    }
    if DO_CLIP_GRAD:
        hparams['clip_grad_threshold'] = CLIP_GRAD
    writer.add_hparams(hparams, {})

    try:
        start_time = time.time()
        print(f'Training Started - {datetime.datetime.now()}')
        with tracking.RewardTracker(writer, stop_reward=REWARD_BOUNDRY) as tracker:
            with tracking.TBMeanTracker(writer, batch_size=100) as tb_tracker:
                while True:
                    train_entry = train_queue.get()
                    if isinstance(train_entry, TotalReward):
                        if tracker.reward(train_entry.reward, time_step):
                            break
                        continue

                    time_step += 1
                    batch.append(train_entry)
                    if len(batch) < BATCH_SIZE:
                        continue

                    states_v, actions_t, vals_ref_v = unpack_batch(
                        batch, net, last_val_gamma=GAMMA ** STEPS_COUNT, device=device)
                    batch.clear()

                    optimizer.zero_grad()
                    logits_v, value_v = net(states_v)
                    loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                    log_prob_v = F.log_softmax(logits_v, dim=1)
                    adv_v = vals_ref_v - value_v.detach()
                    log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
                    loss_policy_v = -log_prob_actions_v.mean()

                    prob_v = F.softmax(logits_v, dim=1)
                    entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()

                    loss_v = entropy_loss_v + loss_value_v + loss_policy_v
                    loss_v.backward()
                    nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
                    optimizer.step()

                    tb_tracker.track("advantage", adv_v, time_step)
                    tb_tracker.track("values", value_v, time_step)
                    tb_tracker.track("batch_rewards", vals_ref_v, time_step)
                    tb_tracker.track("loss_entropy", entropy_loss_v, time_step)
                    tb_tracker.track("loss_policy", loss_policy_v, time_step)
                    tb_tracker.track("loss_value", loss_value_v, time_step)
                    tb_tracker.track("loss_total", loss_v, time_step)

        # save model when training ends
        print(f'\nConvergence reached! Solved in {round(time.time() - start_time, 3)} seconds')
        save_path = f'models/model_a3c_{timestr}.pt'
        torch.save(net.cpu(), save_path)
        print('Saved model to:', save_path)
    except KeyboardInterrupt:
        print('Stopped by the user')
        save_path = f'models/model_a3c_stopped_{timestr}.pt'
        torch.save(net.cpu(), save_path)
        print('Saved model to:', save_path)
    except Exception:
        print('Training crashed:')
        traceback.print_exc()
        save_path = f'models/model_a3c_error_{timestr}.pt'
        torch.save(net.cpu(), save_path)
        print('Saved model to:', save_path)
    finally:
        # writer.flush()
        for p in data_proc_list:
            p.terminate()
            p.join()
        torch.cuda.empty_cache()
def grads_func(_proc_name, _net, _device, _train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]
    agent = ptan.agent.PolicyAgent(lambda x: _net(x)[0],
                                   device=_device, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    _batch = []
    frame_idx = 0
    writer = SummaryWriter(comment=_proc_name)

    with utils.RewardTracker(writer, REWARD_BOUND) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, 100) as tb_tracker:
            for exp in exp_source:
                frame_idx += 1
                new_rewards = exp_source.pop_total_rewards()
                if new_rewards and tracker.reward(new_rewards[0], frame_idx):
                    break

                _batch.append(exp)
                if len(_batch) < GRAD_BATCH:
                    continue

                data = utils.unpack_batch(_batch, _net, device=_device,
                                          last_val_gamma=GAMMA ** REWARD_STEPS)
                states_v, actions_t, vals_ref_v = data
                _batch.clear()

                _net.zero_grad()
                logits_v, value_v = _net(states_v)
                loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
                adv_v = vals_ref_v - value_v.detach()
                log_p_a = log_prob_v[range(GRAD_BATCH), actions_t]
                log_prob_actions_v = adv_v * log_p_a
                loss_policy_v = -log_prob_actions_v.mean()

                prob_v = F.softmax(logits_v, dim=1)
                ent = (prob_v * log_prob_v).sum(dim=1).mean()
                entropy_loss_v = ENTROPY_BETA * ent

                loss_v = entropy_loss_v + loss_value_v + loss_policy_v
                loss_v.backward()

                tb_tracker.track("advantage", adv_v, frame_idx)
                tb_tracker.track("values", value_v, frame_idx)
                tb_tracker.track("batch_rewards", vals_ref_v, frame_idx)
                tb_tracker.track("loss_entropy", entropy_loss_v, frame_idx)
                tb_tracker.track("loss_policy", loss_policy_v, frame_idx)
                tb_tracker.track("loss_value", loss_value_v, frame_idx)
                tb_tracker.track("loss_total", loss_v, frame_idx)

                # clip gradients and send them to the central process as numpy arrays
                nn_utils.clip_grad_norm_(_net.parameters(), CLIP_GRAD)
                grads = [
                    param.grad.data.cpu().numpy() if param.grad is not None else None
                    for param in _net.parameters()
                ]
                _train_queue.put(grads)

    _train_queue.put(None)
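# Hedged sketch (not part of the original source) of the consumer side of grads_func:
# the central process pulls per-worker gradient lists from the queue, accumulates them
# over `grads_to_accumulate` batches, writes them into the shared network's .grad fields
# and steps the optimizer. `grads_to_accumulate` is an assumed parameter; a None entry
# signals that a worker hit the reward bound.
import torch

def apply_worker_grads(train_queue, net, optimizer, grads_to_accumulate=2):
    step_idx = 0
    grad_buffer = None
    while True:
        train_entry = train_queue.get()
        if train_entry is None:
            break  # a worker solved the environment, stop training
        step_idx += 1
        if grad_buffer is None:
            grad_buffer = train_entry
        else:
            # element-wise accumulation of the numpy gradient arrays
            for tgt_grad, grad in zip(grad_buffer, train_entry):
                if grad is not None:
                    tgt_grad += grad
        if step_idx % grads_to_accumulate == 0:
            for param, grad in zip(net.parameters(), grad_buffer):
                if grad is not None:
                    param.grad = torch.FloatTensor(grad).to(param.device)
            optimizer.step()
            grad_buffer = None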
def main():
    # some setup
    mp.set_start_method('spawn')
    gym.logger.set_level(40)

    # writer
    timestr = time.strftime("%Y%m%d-%H%M%S")
    if LOAD_MODEL:
        name = f'runs/{NAME}_a3c_continued_{timestr}'
    else:
        name = f'runs/{NAME}_a3c_{timestr}'
    writer = SummaryWriter(name)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Using:', device)

    env = make_env()
    obs_shape = env.observation_space.shape
    print('Observation shape:', obs_shape)
    act_space = env.action_space.shape
    print('Action space:', act_space)

    if LOAD_MODEL:
        net = torch.load(LOAD_MODEL)
        print('Model loaded from:', LOAD_MODEL)
    else:
        net = ModelA3C(obs_shape[0], act_space[0])
    net = net.to(device)
    env.close()  # our env creates new actors that we don't need, so close it here
    test_env = make_env()  # env to pass to the testing function
    net.share_memory()  # enabled by default for CUDA, but needs to be enabled explicitly for CPU

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
    data_proc_list = []
    for _ in range(PROCESSES_COUNT):
        data_proc = mp.Process(target=data_func, args=(net, device, train_queue))
        data_proc.start()
        data_proc_list.append(data_proc)

    batch = []
    best_reward = None
    time_step = 0

    # add current hyperparameters to TensorBoard
    hparams = {
        'gamma': GAMMA,
        'lr': LEARNING_RATE,
        'entropy_beta': ENTROPY_BETA,
        'batch_size': BATCH_SIZE,
        'steps_count': STEPS_COUNT
    }
    writer.add_hparams(hparams, {})

    try:
        start_time = time.time()
        print(f'Training Started - {datetime.datetime.now()}')
        with tracking.RewardTracker(writer) as tracker:
            with tracking.TBMeanTracker(writer, batch_size=10) as tb_tracker:
                while True:
                    # Tracking
                    train_entry = train_queue.get()
                    if isinstance(train_entry, RewardSteps):
                        rewards_steps = train_entry.reward
                        rewards, steps = zip(*rewards_steps)
                        tb_tracker.track('episode_steps', steps[0], time_step)
                        tracker.reward(rewards[0], time_step)
                        continue  # wrong type, we don't want total rewards in our batch

                    time_step += 1

                    # Testing and updating the best model
                    if time_step % TEST_ITERS == 0:
                        ts = time.time()
                        rewards, steps = test_net(net, test_env, device=device)
                        msg_str = "Test done in %.2f sec, reward %.3f, steps %d" % (
                            time.time() - ts, rewards, steps)
                        if best_reward is not None:
                            msg_str += f' Current Best {round(best_reward, 3)}'
                        print(msg_str)
                        writer.add_scalar("test_reward", rewards, time_step)
                        writer.add_scalar("test_steps", steps, time_step)
                        if best_reward is None or best_reward < rewards:
                            if best_reward is not None:
                                print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards))
                            save_path = f'models/best_model_a3c_{timestr}.pt'
                            torch.save(net, save_path)
                            best_reward = rewards

                    batch.append(train_entry)
                    if len(batch) < BATCH_SIZE:
                        continue

                    # Training
                    states_v, actions_v, vals_ref_v = unpack_batch(
                        batch, net, last_val_gamma=GAMMA ** STEPS_COUNT, device=device)
                    batch.clear()

                    optimizer.zero_grad()
                    mu_v, var_v, value_v = net(states_v)
                    loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                    adv_v = vals_ref_v.unsqueeze(dim=-1) - value_v.detach()
                    log_prob_v = adv_v * calc_logprob(mu_v, var_v, actions_v)
                    loss_policy_v = -log_prob_v.mean()
                    entropy_loss_v = ENTROPY_BETA * (-(torch.log(2 * math.pi * var_v) + 1) / 2).mean()

                    loss_v = loss_policy_v + entropy_loss_v + loss_value_v
                    loss_v.backward()
                    optimizer.step()

                    tb_tracker.track("advantage", adv_v, time_step)
                    tb_tracker.track("values", value_v, time_step)
                    tb_tracker.track("batch_rewards", vals_ref_v, time_step)
                    tb_tracker.track("loss_entropy", entropy_loss_v, time_step)
                    tb_tracker.track("loss_policy", loss_policy_v, time_step)
                    tb_tracker.track("loss_value", loss_value_v, time_step)
                    tb_tracker.track("loss_total", loss_v, time_step)

        # save model when training ends
        print(f'\nConvergence reached! Solved in {round(time.time() - start_time, 3)} seconds')
        save_path = f'models/model_a3c_{timestr}.pt'
        torch.save(net.cpu(), save_path)
        print('Saved model to:', save_path)
    except KeyboardInterrupt:
        print('Stopped by the user')
        save_path = f'models/model_a3c_stopped_{timestr}.pt'
        torch.save(net.cpu(), save_path)
        print('Saved model to:', save_path)
    except Exception:
        print('Training crashed:')
        traceback.print_exc()
        save_path = f'models/model_a3c_error_{timestr}.pt'
        torch.save(net.cpu(), save_path)
        print('Saved model to:', save_path)
    finally:
        # writer.flush()
        for p in data_proc_list:
            p.terminate()
            p.join()
        torch.cuda.empty_cache()