Example 1
            while not terminal:
                state_t = state_t_1

                # execute action in environment
                if steps > warmup:
                    action_t = agent.select_action(state_t)
                else:
                    action_t = np.random.choice(env.enable_actions)

                # observe environment
                state_t_1, reward_t, terminal = env.step(action_t)
                total_reward += reward_t

                # store experience
                agent.store_experience(state_t, action_t, reward_t, state_t_1,
                                       terminal)
                # debug output: latest Q-values, index of the greedy action,
                # and index of the action actually taken
                print(agent.tmp_q_values, np.argmax(agent.tmp_q_values),
                      agent.enable_actions.index(action_t))

                # for log
                frame += 1
                steps += 1

                if steps > warmup:
                    loss += agent.current_loss
                    Q_max += np.max(agent.Q_values([state_t]))

            # experience replay
            # do not train during the warmup period
            if steps > warmup:
                agent.backword()
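
The loop above relies on the agent keeping a replay memory that store_experience appends to and that the post-episode update samples from. The agent class is not shown in this excerpt; the following is only a minimal sketch of that pattern, where the ReplayBuffer name, the capacity, and the batch size are assumptions rather than part of the original code.

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Fixed-size FIFO memory of (state, action, reward, next_state, terminal) tuples."""

    def __init__(self, capacity=10000):
        # oldest experiences are dropped automatically once capacity is reached
        self.memory = deque(maxlen=capacity)

    def store_experience(self, state, action, reward, next_state, terminal):
        self.memory.append((state, action, reward, next_state, terminal))

    def sample(self, batch_size=32):
        # draw a random minibatch for one replay update
        batch = random.sample(self.memory, min(batch_size, len(self.memory)))
        states, actions, rewards, next_states, terminals = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, terminals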