Example #1
        while True:
            frame_idx += 1
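            # play one environment step, store the transition in the replay
            # buffer, and advance the epsilon decay schedule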
            buffer.populate(1)
            epsilon_tracker.frame(frame_idx)

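            # total rewards of any episodes that finished since the last check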
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
                    if save_for_analysis:
                        temp_model_name = model_name + "_" + str(frame_idx)
                        utils.save_agent_state(
                            net,
                            optimizer,
                            frame_idx,
                            len(reward_tracker.total_rewards),
                            selector.epsilon,
                            save_replay=True,
                            replay_buffer=buffer.buffer,
                            name=temp_model_name)
                    else:
                        utils.save_agent_state(
                            net,
                            optimizer,
                            frame_idx,
                            len(reward_tracker.total_rewards),
                            selector.epsilon,
                            name='-boxing')
                    break

            if len(buffer) < params['replay_initial']:
                continue
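The snippet begins mid-loop, so the objects it relies on (net, selector, epsilon_tracker, exp_source, buffer, optimizer) are created earlier. A minimal sketch of that wiring, assuming the ptan library these helpers come from; the network class, the params keys used here, and the EpsilonTracker constructor are illustrative assumptions, not taken from the original:

import gym
import ptan
import torch.optim as optim

env = gym.make(params['env_name'])                       # assumed params key
net = DQN(env.observation_space.shape, env.action_space.n).to(device)  # hypothetical network class
tgt_net = ptan.agent.TargetNet(net)
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
agent = ptan.agent.DQNAgent(net, selector, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, gamma=params['gamma'], steps_count=1)
buffer = ptan.experience.ExperienceReplayBuffer(
    exp_source, buffer_size=params['replay_size'])       # assumed params key
optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
epsilon_tracker = utils.EpsilonTracker(selector, params)  # assumed helper signature
frame_idx = 0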
Example #2
        drl_updates = 0
        best_reward = None
        with utils.RewardTracker(writer) as tracker:
            with utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
                while True:
                    frame_idx += 1
                    buffer.populate(1)
                    rewards_steps = exp_source.pop_rewards_steps()
                    if rewards_steps:
                        rewards, steps = zip(*rewards_steps)
                        tb_tracker.track("episode_steps", steps[0], frame_idx)
                        mean_reward = tracker.reward(rewards[0], frame_idx)
                        if mean_reward is not None and mean_reward > REWARD_TO_SOLVE:
                            print("environment solved in %d steps" % frame_idx,
                                  "(%d episodes)" % len(tracker.total_rewards))
                            utils.save_agent_state(act_net, crt_net, [act_opt, crt_opt], frame_idx,
                                                   len(tracker.total_rewards), path=ckpt_save_path)
                            break

                    if len(buffer) < steps_to_start_learn:
                        continue

                    batch = buffer.sample(batch_size)
                    states_v, actions_v, rewards_v, dones_mask, last_states_v = utils.unpack_batch(batch, device)

                    # train critic
                    crt_opt.zero_grad()
                    q_v = crt_net(states_v, actions_v)
                    last_act_v = tgt_act_net.target_model(last_states_v)
                    q_last_v = tgt_crt_net.target_model(last_states_v, last_act_v)
                    q_last_v[dones_mask] = 0.0
                    q_ref_v = rewards_v.unsqueeze(dim=-1) + q_last_v * gamma
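The example is cut off partway through the critic update. A sketch of how such a DDPG-style step typically continues, under the assumptions visible above (F is torch.nn.functional, the target nets are ptan-style TargetNet wrappers); the exact loss bookkeeping in the original is not shown:

# finish the critic update: MSE against the bootstrapped target
critic_loss_v = F.mse_loss(q_v, q_ref_v.detach())
critic_loss_v.backward()
crt_opt.step()
tb_tracker.track("loss_critic", critic_loss_v, frame_idx)

# train actor: push its actions toward higher critic values
act_opt.zero_grad()
cur_actions_v = act_net(states_v)
actor_loss_v = -crt_net(states_v, cur_actions_v).mean()
actor_loss_v.backward()
act_opt.step()
tb_tracker.track("loss_actor", actor_loss_v, frame_idx)
drl_updates += 1

# soft-update the target networks toward the online networks
tgt_act_net.alpha_sync(alpha=1 - 1e-3)
tgt_crt_net.alpha_sync(alpha=1 - 1e-3)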
Example #3
                if new_rewards:
                    if reward_tracker.reward(new_rewards[0], frame_idx,
                                             selector.epsilon):
                        # if save_for_analysis:
                        #     temp_model_name = model_name + "_" + str(frame_idx)
                        #     utils.save_agent_state(net, optimizer, frame_idx, len(reward_tracker.total_rewards),
                        #                            selector.epsilon, save_replay=True,
                        #                            replay_buffer=buffer.buffer,
                        #                            name=temp_model_name)
                        # else:
                        #     utils.save_agent_state(net, optimizer, frame_idx, len(reward_tracker.total_rewards),
                        #                            selector.epsilon, name='-boxing')
                        utils.save_agent_state(
                            net,
                            optimizer,
                            frame_idx,
                            len(reward_tracker.total_rewards),
                            selector.epsilon,
                            path=model_saving_path)
                        break

                if len(buffer) < params['replay_initial']:
                    continue

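                # sample a batch and compute the DQN loss against the target network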
                optimizer.zero_grad()
                batch = buffer.sample(params['batch_size'])
                loss_v = utils.calc_loss_dqn(batch,
                                             net,
                                             tgt_net.target_model,
                                             gamma=params['gamma'],
                                             device=device)
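The call is truncated at this point in the listing. A sketch of how the update step usually finishes in this kind of DQN loop; the target_net_sync key and the sync interval are assumptions, not taken from the original:

loss_v.backward()
optimizer.step()

# periodically copy the online network's weights into the frozen target network
if frame_idx % params['target_net_sync'] == 0:   # assumed params key
    tgt_net.sync()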