Example #1
                torch.tensor(np.array([done])).to(device).float()))

            state = next_state
            episode_rewards += reward
            if done:
                break
        rewards.append(episode_rewards)
        
        # Train once the replay buffer plus this episode's transitions reach min_buffer
        if len(memory) + len(episode_transitions) >= min_buffer:
            for i in range(train_steps):
                loss = optimize(model, target, memory, episode_transitions, optimizer)
                losses.append(loss.item())
       
        memory.extend(episode_transitions)

        # Sync the target network with the online model every target_update episodes
        if episode % target_update == 0:
            target.load_state_dict(model.state_dict())
            target.eval()

        if episode % print_interval == 0 and episode > 0:
            print("[Episode {}] | avg rewards : {:.3f} | s.d. rewards: {:.3f} | avg loss : {:.10f} | buffer size : {} | epsilon : {:.1f}%".format(
                            episode, np.mean(rewards), np.std(rewards), np.mean(losses), len(memory), epsilon*100))
            rewards = []
            losses = []

        if episode % SAVE_INTERVAL == 0 and episode > 0:
            torch.save(model.state_dict(), SAVE_PATH)
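
# For reference, `optimize` is called above but not defined in this excerpt.
# The sketch below is one plausible implementation of that DQN update step,
# assuming transitions are stored as (state, action, reward, next_state, done)
# tensor tuples as above; BATCH_SIZE and GAMMA are assumed hyperparameters,
# not values taken from the original code.
import random

import torch
import torch.nn.functional as F

BATCH_SIZE = 32
GAMMA = 0.99

def optimize(model, target, memory, episode_transitions, optimizer):
    # Draw a batch from the replay buffer plus the current episode's transitions.
    pool = list(memory) + list(episode_transitions)
    batch = random.sample(pool, min(BATCH_SIZE, len(pool)))
    states, actions, rewards, next_states, dones = map(torch.cat, zip(*batch))

    # Q(s, a) of the actions that were actually taken.
    q_values = model(states).gather(1, actions.long().view(-1, 1)).squeeze(1)

    # TD target r + gamma * max_a' Q_target(s', a'), zeroed at terminal states.
    with torch.no_grad():
        next_q = target(next_states).max(dim=1).values
        targets = rewards.view(-1) + GAMMA * next_q * (1.0 - dones.view(-1))

    # Huber loss between predicted and target Q-values, then one gradient step.
    loss = F.smooth_l1_loss(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss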

Example #2

            episode_rewards += reward
            if done:
                break
        rewards.append(episode_rewards)

        # Train once both the success and failure buffers hold at least min_buffer transitions
        if len(memory_success) >= min_buffer and len(
                memory_failure) >= min_buffer:
            for i in range(train_steps):
                loss = optimize(model, target, memory_success, memory_failure,
                                episode_transitions, optimizer)
                losses.append(loss.item())

            # Sync the target network with the online model every target_update episodes
            if episode % target_update == 0:
                target.load_state_dict(model.state_dict())
                target.eval()

        if episode_rewards > 0:
            memory_success.extend(episode_transitions)
        else:
            memory_failure.extend(episode_transitions)

        if episode % print_interval == 0 and episode > 0:
            print(
                "[Episode {}] | avg rewards : {:.3f} | s.d. rewards: {:.3f} | avg loss : {:.10f} | succ. buffer : {} | fail. buffer : {}"
                .format(episode, np.mean(rewards), np.std(rewards),
                        np.mean(losses), len(memory_success),
                        len(memory_failure)))
            rewards = []
            losses = []
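
# Sketch of the two-buffer `optimize` used in this variant (again an assumption,
# not the original implementation): the batch is drawn half from successful and
# half from failed episodes, plus the current episode's transitions, so the
# learner always sees both kinds of outcomes. batch_size and gamma are assumed
# defaults, not values from the original code.
import random

import torch
import torch.nn.functional as F

def optimize(model, target, memory_success, memory_failure, episode_transitions,
             optimizer, batch_size=32, gamma=0.99):
    half = batch_size // 2
    batch = (random.sample(list(memory_success), min(half, len(memory_success)))
             + random.sample(list(memory_failure), min(half, len(memory_failure)))
             + list(episode_transitions))
    states, actions, rewards, next_states, dones = map(torch.cat, zip(*batch))

    # Same TD(0) update as the single-buffer version.
    q_values = model(states).gather(1, actions.long().view(-1, 1)).squeeze(1)
    with torch.no_grad():
        next_q = target(next_states).max(dim=1).values
        targets = rewards.view(-1) + gamma * next_q * (1.0 - dones.view(-1))

    loss = F.smooth_l1_loss(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss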