import torch.optim as optim
import torch.nn.functional as F
from itertools import count

optimizer = optim.Adam(params=policy_net.parameters(), lr=lr)

# Training loop
episode_durations = []
for episode in range(num_episodes):
    em.reset()
    state = em.get_state()

    for timestep in count():
        # Select and execute an action, then store the transition in replay memory
        action = agent.select_action(state, policy_net)
        reward = em.take_action(action)
        next_state = em.get_state()
        memory.push(Experience(state, action, next_state, reward))
        state = next_state

        if memory.can_provide_sample(batch_size):
            # Sample a batch of experiences and unpack it into separate tensors
            experiences = memory.sample(batch_size)
            states, actions, rewards, next_states = extract_tensors(experiences)

            # Q-values of the chosen actions from the policy network, and the
            # maximum next-state Q-values from the target network
            current_q_values = QValues.get_current(policy_net, states, actions)
            next_q_values = QValues.get_next(target_net, next_states)
            target_q_values = (next_q_values * gamma) + rewards

            # Optimize the policy network on the TD error
            loss = F.mse_loss(current_q_values, target_q_values.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if em.done:
            episode_durations.append(timestep)
            plotter.plot(episode_durations, 100)
            break
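Note that the loop above reads from target_net but never refreshes it. A minimal sketch of the usual fix, assuming a target_update hyperparameter (not defined in this section) set alongside batch_size and gamma, is to copy the policy network's weights into the target network every few episodes, at the bottom of the outer loop:

    # Sketch: sync the target network periodically so next_q_values track the
    # learned policy. target_update is an assumed hyperparameter, e.g. 10 episodes.
    if episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())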