        # Bellman target: r + gamma * max_a' Q(s', a'), with the bootstrap term masked out at terminal states
        Q_target = reward + not_done * self.gamma * Q_target
        self.policy_optimizer.zero_grad()
        loss = F.mse_loss(Q_current.float(), Q_target.float())
        loss.backward()
        self.policy_optimizer.step()

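# Training entry point: interact with CartPole, store transitions in the replay
# buffer, and update the Q-network once enough experience has been collected.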
if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    state_space = env.observation_space.shape[0]
    action_space = env.action_space.n  # number of discrete actions
    max_episode_steps = env._max_episode_steps
    policy = DQN(state_space, action_space)
    episode_reward = 0
    episode_timesteps = 0
    capacity = 100000
    learning_starts = 1000  # transitions to collect before training begins
    batch_size = 32
    replay_buffer = ReplayBuffer(state_space, action_space, capacity)
    state, done = env.reset(), False
    for t in range(10000):
        episode_timesteps += 1
        action = policy.select_action(state, t)
        next_state, reward, done, _ = env.step(action)
        # Bootstrap mask: 1 if the next state should be bootstrapped from (episode continues,
        # or ended only because of the time limit), 0 on a true terminal state
        not_done = 1.0 - float(done) if episode_timesteps < max_episode_steps else 1.0
        episode_reward += reward
        replay_buffer.store(state, action, next_state, reward, not_done)
        state = next_state
        if replay_buffer.buffered(learning_starts):
            policy.train(replay_buffer)

        # Reset the environment at the end of each episode
        if done:
            print(f"Episode reward: {episode_reward:.1f} over {episode_timesteps} steps")
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0