while True:
    frame_idx += 1
    buffer.populate(1)
    epsilon_tracker.frame(frame_idx)
    new_rewards = exp_source.pop_total_rewards()
    if new_rewards:
        if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
            if save_for_analysis:
                temp_model_name = model_name + "_" + str(frame_idx)
                utils.save_agent_state(
                    net, optimizer, frame_idx, len(reward_tracker.total_rewards),
                    selector.epsilon, save_replay=True,
                    replay_buffer=buffer.buffer, name=temp_model_name)
            else:
                utils.save_agent_state(
                    net, optimizer, frame_idx, len(reward_tracker.total_rewards),
                    selector.epsilon, name='-boxing')
            break
    if len(buffer) < params['replay_initial']:
        continue
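The loop above delegates exploration decay to epsilon_tracker.frame(), whose implementation is not shown. Below is a minimal sketch of what such a tracker typically does, assuming an epsilon-greedy action selector with a writable epsilon attribute; the class name and schedule hyperparameters are illustrative, not taken from the source.

class LinearEpsilonTracker:
    """Linearly anneals the selector's epsilon from eps_start to eps_final
    over the first eps_frames frames, then keeps it at eps_final."""

    def __init__(self, selector, eps_start=1.0, eps_final=0.02, eps_frames=10**5):
        self.selector = selector
        self.eps_start = eps_start
        self.eps_final = eps_final
        self.eps_frames = eps_frames

    def frame(self, frame_idx):
        # called once per environment step by the training loop
        self.selector.epsilon = max(
            self.eps_final,
            self.eps_start - frame_idx / self.eps_frames)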
drl_updates = 0
best_reward = None
with utils.RewardTracker(writer) as tracker:
    with utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)
            rewards_steps = exp_source.pop_rewards_steps()
            if rewards_steps:
                rewards, steps = zip(*rewards_steps)
                tb_tracker.track("episode_steps", steps[0], frame_idx)
                mean_reward = tracker.reward(rewards[0], frame_idx)
                if mean_reward is not None and mean_reward > REWARD_TO_SOLVE:
                    print("environment solved in %d steps" % frame_idx,
                          " (%d episodes)" % len(tracker.total_rewards))
                    utils.save_agent_state(act_net, crt_net, [act_opt, crt_opt],
                                           frame_idx, len(tracker.total_rewards),
                                           path=ckpt_save_path)
                    break
            if len(buffer) < steps_to_start_learn:
                continue
            batch = buffer.sample(batch_size)
            states_v, actions_v, rewards_v, dones_mask, last_states_v = \
                utils.unpack_batch(batch, device)

            # train critic
            crt_opt.zero_grad()
            q_v = crt_net(states_v, actions_v)
            last_act_v = tgt_act_net.target_model(last_states_v)
            q_last_v = tgt_crt_net.target_model(last_states_v, last_act_v)
            q_last_v[dones_mask] = 0.0
            q_ref_v = rewards_v.unsqueeze(dim=-1) + q_last_v * gamma
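The listing stops right after forming the TD target q_ref_v = r + gamma * Q'(s', mu'(s')). A minimal sketch of how a DDPG update usually continues from that point, reusing the variables defined above; the alpha_sync soft-update call assumes ptan-style target-network wrappers and is not confirmed by the source.

            import torch.nn.functional as F

            # critic: regress Q(s, a) toward the detached TD target
            critic_loss_v = F.mse_loss(q_v, q_ref_v.detach())
            critic_loss_v.backward()
            crt_opt.step()

            # actor: ascend the critic's estimate of Q(s, mu(s))
            act_opt.zero_grad()
            cur_actions_v = act_net(states_v)
            actor_loss_v = -crt_net(states_v, cur_actions_v).mean()
            actor_loss_v.backward()
            act_opt.step()

            # Polyak-average the online weights into the target networks
            tgt_act_net.alpha_sync(alpha=1 - 1e-3)
            tgt_crt_net.alpha_sync(alpha=1 - 1e-3)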
if new_rewards:
    if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
        utils.save_agent_state(
            net, optimizer, frame_idx, len(reward_tracker.total_rewards),
            selector.epsilon, path=model_saving_path)
        break
if len(buffer) < params['replay_initial']:
    continue

optimizer.zero_grad()
batch = buffer.sample(params['batch_size'])
loss_v = utils.calc_loss_dqn(batch, net, tgt_net.target_model,
                             gamma=params['gamma'], device=device,
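The loss itself is hidden inside utils.calc_loss_dqn. Below is a sketch of the standard one-step DQN loss such a helper usually computes, written over already-unpacked batch tensors; the function name and argument layout are illustrative assumptions, not the source's actual signature.

import torch
import torch.nn as nn

def dqn_loss_sketch(states_v, actions_v, rewards_v, dones_mask,
                    next_states_v, net, tgt_net, gamma):
    # Q(s, a) for the actions that were actually taken
    q_sa = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    # bootstrapped target from the frozen target network
    with torch.no_grad():
        q_next = tgt_net(next_states_v).max(1)[0]
        q_next[dones_mask] = 0.0
        target_q = rewards_v + gamma * q_next
    return nn.MSELoss()(q_sa, target_q)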