        # Bellman target: immediate reward plus discounted next-state value.
        # The not_done mask (assumed to be sampled from the replay buffer
        # earlier in this method) zeroes the bootstrap term at terminal states.
        Q_target = reward + not_done * self.gamma * Q_target
        self.policy_optimizer.zero_grad()
        loss = F.mse_loss(Q_current.float(), Q_target.float())
        loss.backward()
        self.policy_optimizer.step()


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    state_space = env.observation_space.shape[0]
    action_space = env.action_space.n  # Discrete spaces expose .n, not .shape[0]
    max_episode_steps = env._max_episode_steps

    policy = DQN(state_space, action_space)

    episode_reward = 0
    episode_timesteps = 0
    capacity = 100000       # replay buffer capacity
    learning_starts = 1000  # warm-up transitions before training (was `use_replay_buffer`)
    batch_size = 32

    replay_buffer = ReplayBuffer(state_space, action_space, capacity)

    state, done = env.reset(), False
    for t in range(10000):
        episode_timesteps += 1

        # Select an action (t is passed so the policy can anneal exploration)
        action = policy.select_action(state, t)
        next_state, reward, done, _ = env.step(action)

        # Invert the done flag for the Bellman mask; a time-limit truncation
        # is treated as non-terminal so the target still bootstraps.
        not_done = 1.0 - float(done) if episode_timesteps < max_episode_steps else 1.0
        episode_reward += reward

        replay_buffer.store(state, action, next_state, reward, not_done)
        state = next_state

        # Train once the warm-up period is over and a full batch is available
        if t >= learning_starts and replay_buffer.buffered(batch_size):
            policy.train(replay_buffer)

        # Reset the environment between episodes
        if done:
            print(f"t={t + 1} episode reward: {episode_reward:.1f}")
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
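

# ---------------------------------------------------------------------------
# Hypothetical sketch (not from the original code): one way to implement the
# ReplayBuffer interface used above -- store(), buffered(), and a sample()
# method that the train() step presumably calls to get batched tensors. The
# circular-array layout and sample()'s return format are assumptions, not the
# post's actual implementation.
# ---------------------------------------------------------------------------
import numpy as np
import torch


class ReplayBufferSketch:
    def __init__(self, state_dim, action_dim, capacity):
        self.capacity = capacity
        self.ptr = 0   # next write position (wraps around when full)
        self.size = 0  # number of transitions stored so far
        self.state = np.zeros((capacity, state_dim), dtype=np.float32)
        self.action = np.zeros((capacity, 1), dtype=np.int64)
        self.next_state = np.zeros((capacity, state_dim), dtype=np.float32)
        self.reward = np.zeros((capacity, 1), dtype=np.float32)
        self.not_done = np.zeros((capacity, 1), dtype=np.float32)

    def store(self, state, action, next_state, reward, not_done):
        # Overwrite the oldest transition once the buffer is full
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = not_done
        self.ptr = (self.ptr + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def buffered(self, n):
        # True once at least n transitions are available to sample
        return self.size >= n

    def sample(self, batch_size):
        # Uniform random minibatch, returned as torch tensors
        idx = np.random.randint(0, self.size, size=batch_size)
        return (torch.from_numpy(self.state[idx]),
                torch.from_numpy(self.action[idx]),
                torch.from_numpy(self.next_state[idx]),
                torch.from_numpy(self.reward[idx]),
                torch.from_numpy(self.not_done[idx]))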