agent = agent_model.AgentDDPG(act_net, device=device)
exp_source = Experience.ExperienceSourceFirstLast(env, agent, gamma=gamma, steps_count=1)
buffer = Experience.ExperienceReplayBuffer(exp_source, buffer_size=replay_size)
# choose the optimizer for both actor and critic (Adam by default)
if args.optimizer and args.optimizer == "RMSprop":
    act_opt = optim.RMSprop(act_net.parameters(), lr=lr_actor)
    crt_opt = optim.RMSprop(crt_net.parameters(), lr=lr_critic)
else:
    act_opt = optim.Adam(act_net.parameters(), lr=lr_actor)
    crt_opt = optim.Adam(crt_net.parameters(), lr=lr_critic)
utils.load_agent_state(act_net, crt_net, [act_opt, crt_opt], path=ckpt_save_path)

frame_idx = 0
drl_updates = 0
best_reward = None
with utils.RewardTracker(writer) as tracker:
    with utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)  # play one environment step and store the transition
            rewards_steps = exp_source.pop_rewards_steps()
            if rewards_steps:
                rewards, steps = zip(*rewards_steps)
                tb_tracker.track("episode_steps", steps[0], frame_idx)
                mean_reward = tracker.reward(rewards[0], frame_idx)
                if mean_reward is not None and mean_reward > REWARD_TO_SOLVE:
                    print("environment solved in %d steps (%d episodes)"
                          % (frame_idx, len(tracker.total_rewards)))
                    utils.save_agent_state(act_net, crt_net, [act_opt, crt_opt],
                                           frame_idx, len(tracker.total_rewards),
                                           path=ckpt_save_path)
                    break
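# Illustrative sketch (not from the original script): the critic/actor update that a
# DDPG loop like the one above performs once the replay buffer holds enough samples.
# The target networks tgt_act_net / tgt_crt_net, the pre-unpacked batch tensors and the
# (states, actions) critic signature are assumptions; all names here are placeholders.
import torch
import torch.nn.functional as F

def ddpg_update(batch_tensors, act_net, crt_net, tgt_act_net, tgt_crt_net,
                act_opt, crt_opt, gamma):
    states, actions, rewards, dones, last_states = batch_tensors

    # critic: regress Q(s, a) toward the TD target r + gamma * Q'(s', mu'(s'))
    crt_opt.zero_grad()
    q_v = crt_net(states, actions)
    with torch.no_grad():
        last_act_v = tgt_act_net(last_states)
        q_last_v = tgt_crt_net(last_states, last_act_v)
        q_last_v[dones] = 0.0
        q_ref_v = rewards.unsqueeze(-1) + gamma * q_last_v
    critic_loss_v = F.mse_loss(q_v, q_ref_v)
    critic_loss_v.backward()
    crt_opt.step()

    # actor: improve the policy by ascending Q(s, mu(s)), i.e. minimizing its negative mean
    act_opt.zero_grad()
    actor_loss_v = -crt_net(states, act_net(states)).mean()
    actor_loss_v.backward()
    act_opt.step()
    return critic_loss_v.item(), actor_loss_v.item()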
buffer = Experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size'])
optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])  # TODO: change to RMSprop
utils.load_agent_state(net, optimizer, selector, load_optimizer=False, env_name='boxing',
                       path='./agent_ckpt/agent_ls_dqn_-boxing.pth')

frame_idx = 0
drl_updates = 0
with utils.RewardTracker(writer, params['stop_reward']) as reward_tracker:
    while True:
        frame_idx += 1
        buffer.populate(1)
        epsilon_tracker.frame(frame_idx)  # anneal the epsilon-greedy exploration rate
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
                if save_for_analysis:
                    temp_model_name = model_name + "_" + str(frame_idx)
                    utils.save_agent_state(net, optimizer, frame_idx,