Esempio n. 1
0
        # Main training loop: one pass of the outer loop per epoch, `n_epochs`
        # in total. Each pass resets the environment and steps it until
        # `env.step` reports a terminal state.
        # NOTE(review): this is a fragment -- `n_epochs`, `env`, `agent`,
        # `steps` and `warmup` are defined outside the visible code, and the
        # loop body may continue past the last line shown here.
        for e in range(n_epochs):
            # reset the per-epoch bookkeeping values
            # (`loss` and `Q_max` are only initialised in the visible code;
            # presumably they are accumulated further down -- TODO confirm)
            frame = 0
            loss = 0.0
            Q_max = 0.0
            total_reward = 0
            env.reset()
            # Take one random action right after the reset to obtain the first
            # (state, reward, terminal) triple. In the visible code the reward
            # of this initial step is not added to `total_reward` and the
            # transition is not stored as an experience.
            state_t_1, reward_t, terminal = env.step(
                np.random.choice(env.enable_actions))

            # If the initial step already returned terminal=True the body
            # below is skipped entirely for this epoch.
            while not terminal:
                # the previous "next state" becomes the current state
                state_t = state_t_1

                # execute action in environment:
                # once `steps` exceeds `warmup` the agent chooses the action,
                # before that an action is drawn at random from
                # `env.enable_actions`.
                # NOTE(review): `steps` is not updated anywhere in the visible
                # code -- presumably it is incremented later in the loop or
                # elsewhere; confirm.
                if steps > warmup:
                    action_t = agent.select_action(state_t)
                else:
                    action_t = np.random.choice(env.enable_actions)

                # observe environment: apply the action and accumulate reward
                state_t_1, reward_t, terminal = env.step(action_t)
                total_reward += reward_t

                # store experience as
                # (state, action, reward, next state, terminal flag)
                agent.store_experience(state_t, action_t, reward_t, state_t_1,
                                       terminal)
                # Debug output on every step: the agent's `tmp_q_values`, the
                # index of their maximum, and the position of the chosen
                # action in `agent.enable_actions`.
                # NOTE(review): `tmp_q_values` is presumably refreshed by
                # `agent.select_action`; on warm-up steps (random action) it
                # may be stale or not yet set -- confirm.
                # NOTE(review): actions are sampled from `env.enable_actions`
                # but looked up in `agent.enable_actions`; this assumes both
                # hold the same actions -- verify.
                print(agent.tmp_q_values, np.argmax(agent.tmp_q_values),
                      agent.enable_actions.index(action_t))

                # for log: count the steps taken in this epoch
                frame += 1