Example 1
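        # Wrap the actor network in a DDPG agent, stream one-step transitions
        # from the environment, and store them in a replay buffer.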
        agent = agent_model.AgentDDPG(act_net, device=device)
        exp_source = Experience.ExperienceSourceFirstLast(env, agent, gamma=gamma, steps_count=1)
        buffer = Experience.ExperienceReplayBuffer(exp_source, buffer_size=replay_size)
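        # Pick the optimizer for both networks: Adam by default, or RMSprop
        # when args.optimizer requests it.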
        if args.optimizer == "RMSprop":
            act_opt = optim.RMSprop(act_net.parameters(), lr=lr_actor)
            crt_opt = optim.RMSprop(crt_net.parameters(), lr=lr_critic)
        else:
            act_opt = optim.Adam(act_net.parameters(), lr=lr_actor)
            crt_opt = optim.Adam(crt_net.parameters(), lr=lr_critic)

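        # Restore actor/critic weights and both optimizer states from the
        # checkpoint at ckpt_save_path.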
        utils.load_agent_state(act_net, crt_net, [act_opt, crt_opt], path=ckpt_save_path)

        frame_idx = 0
        drl_updates = 0
        best_reward = None
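        # Training loop: collect one transition per frame and report episode
        # statistics as episodes finish.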
        with utils.RewardTracker(writer) as tracker:
            with utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
                while True:
                    frame_idx += 1
                    buffer.populate(1)
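                    # pop_rewards_steps() yields (total_reward, steps) for
                    # every episode finished since the previous call.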
                    rewards_steps = exp_source.pop_rewards_steps()
                    if rewards_steps:
                        rewards, steps = zip(*rewards_steps)
                        tb_tracker.track("episode_steps", steps[0], frame_idx)
                        mean_reward = tracker.reward(rewards[0], frame_idx)
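                        # Consider the task solved once the tracked mean reward
                        # exceeds the threshold; checkpoint and stop.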
                        if mean_reward is not None and mean_reward > REWARD_TO_SOLVE:
                            print("environment solved in % steps" % frame_idx,
                                  " (% episodes)" % len(tracker.total_rewards))
                            utils.save_agent_state(act_net, crt_net, [act_opt, crt_opt], frame_idx,
                                                   len(tracker.total_rewards), path=ckpt_save_path)
                            break
Example 2
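    # NOTE: the excerpt begins mid-statement; the constructor below is
    # reconstructed by analogy with Example 1 (assumed, not from the source).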
    buffer = Experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params['replay_size'])
    optimizer = optim.Adam(
        net.parameters(),
        lr=params['learning_rate'])  # TODO: change to RMSprop

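    # Restore pretrained weights for the Boxing agent; the optimizer state is
    # deliberately not restored (load_optimizer=False).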
    utils.load_agent_state(net,
                           optimizer,
                           selector,
                           load_optimizer=False,
                           env_name='boxing',
                           path='./agent_ckpt/agent_ls_dqn_-boxing.pth')

    frame_idx = 0
    drl_updates = 0

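    # Interaction loop: populate the buffer one transition per frame and
    # anneal the exploration epsilon with the frame counter.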
    with utils.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)
            epsilon_tracker.frame(frame_idx)

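            # pop_total_rewards() returns the totals of episodes finished since
            # the last call; reward_tracker.reward(...) signals when the stop
            # reward has been reached.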
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
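                    # Optionally snapshot the network under a frame-indexed
                    # model name for offline analysis.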
                    if save_for_analysis:
                        temp_model_name = model_name + "_" + str(frame_idx)
                        utils.save_agent_state(
                            net,
                            optimizer,
                            frame_idx,