next_state, reward, done, _ = env.step(action)  # get the next state
replay_buffer.push(state, action, reward, next_state, done)  # push the action's results to the buffer
state = next_state
episode_reward += reward

if done:  # game over: record the reward and reset the environment
    state = env.reset()
    all_rewards.append((frame_idx, episode_reward))
    episode_reward = 0

if len(replay_buffer) > replay_initial:  # once the buffer is past its initial fill, compute the loss and optimize the model
    loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append((frame_idx, loss.data.cpu().numpy()))

# the next two conditionals are just for printing progress
if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
    print('#Frame: %d, preparing replay buffer' % frame_idx)

if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
    print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
    print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])

if frame_idx % 50000 == 0:
    target_model.copy_from(model)  # sync the target model with the online model
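# compute_td_loss() is called in these loops but not defined here. Below is a minimal
# sketch of a standard one-step DQN TD loss with the same signature used above
# (model, target_model, batch_size, gamma, replay_buffer). It assumes the buffer's
# sample() returns numpy arrays and that both models map a batch of states to Q-values;
# it is an illustrative assumption, not necessarily the exact helper these loops use.
import numpy as np
import torch
import torch.nn.functional as F


def compute_td_loss(model, target_model, batch_size, gamma, replay_buffer):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state = torch.as_tensor(np.array(state), dtype=torch.float32)
    next_state = torch.as_tensor(np.array(next_state), dtype=torch.float32)
    action = torch.as_tensor(action, dtype=torch.int64)
    reward = torch.as_tensor(reward, dtype=torch.float32)
    done = torch.as_tensor(done, dtype=torch.float32)

    # Q(s, a) for the actions that were actually taken
    q_value = model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # Bootstrapped one-step target from the frozen target network
    with torch.no_grad():
        next_q_value = target_model(next_state).max(1)[0]
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    return F.mse_loss(q_value, expected_q_value)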
replay_buffer.push(state, action, reward, next_state, done)
state = next_state
episode_reward += reward

# if the game is over
if done:
    state = env.reset()
    all_rewards.append((frame_idx, episode_reward))  # record the reward for that game
    episode_reward = 0  # reset

# once the replay buffer has filled up enough
if len(replay_buffer) > replay_initial:
    loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)  # calculate the loss for the sampled batch
    optimizer.zero_grad()  # reset gradient values
    loss.backward()  # backpropagate the loss
    optimizer.step()  # update weight values
    losses.append((frame_idx, loss.data.cpu().numpy()))  # hold loss in an array

if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
    print('#Frame: %d, preparing replay buffer' % frame_idx)
    torch.save(model.state_dict(), "run11_start.pth")

if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
    print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
    print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])
    filename = "run11_model" + str(frame_idx) + ".pth"
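# The loops above assume a replay buffer object with push(), sample() and __len__().
# A minimal sketch of such a buffer follows (a bounded deque of transitions, sampled
# uniformly); the actual buffer used with these loops may differ, e.g. it may store
# stacked frames or pre-allocated arrays.
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # oldest transitions are dropped automatically

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*batch)
        return (np.array(state), np.array(action), np.array(reward),
                np.array(next_state), np.array(done, dtype=np.float32))

    def __len__(self):
        return len(self.buffer)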
agent.epsilon = linear_decay(init_epsilon, final_epsilon, step, decay_steps)

# play
_, state = play_and_record(state, agent, env, exp_replay, timesteps_per_epoch)

# train
obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(batch_size)

loss = compute_td_loss(obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch,
                       agent, target_network, device=device)
loss.backward()
# grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
opt.step()
opt.zero_grad()

# if step % loss_freq == 0:
#     td_loss_history.append(loss.data.cpu().item())
#     grad_norm_history.append(grad_norm)

if step % refresh_target_network_freq == 0:
    # Load agent weights into target_network
    target_network.load_state_dict(agent.state_dict())
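# linear_decay() is referenced above but not defined. A plausible sketch is a schedule
# that interpolates epsilon linearly from init_epsilon to final_epsilon over decay_steps
# steps and then holds it constant; this is an assumption about its behaviour, not the
# original helper.
def linear_decay(init_epsilon, final_epsilon, step, decay_steps):
    if step >= decay_steps:
        return final_epsilon
    fraction = step / decay_steps
    return init_epsilon + fraction * (final_epsilon - init_epsilon)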
def play_to_train(num_frames, policy_model, target_model, buffer):
    losses = []
    all_rewards = []
    mean_losses = []
    mean_rewards = []
    episode_reward = 0

    state = env.reset()
    start_training = time.time()

    for frame_idx in range(1, num_frames + 1):
        epsilon = epsilon_by_frame(frame_idx)
        action = policy_model.act(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        buffer.push(state, action, reward, next_state, done)

        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0

        if len(buffer) > replay_initial:
            loss = compute_td_loss(policy_model, target_model, batch_size, gamma, buffer, device)
            optimizer.zero_grad()
            loss.backward()
            # Clip gradients to [-1, 1] for stability
            for param in policy_model.parameters():
                param.grad.data.clamp_(-1, 1)
            optimizer.step()
            losses.append(loss.data.cpu().numpy())

        if frame_idx % 10000 == 0 and len(buffer) <= replay_initial:
            print('#Frame: %d, preparing replay buffer' % frame_idx)

        if frame_idx % 10000 == 0 and len(buffer) > replay_initial:
            mean_losses.append(np.mean(losses))
            print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses)))
            mean_rewards.append(np.mean(all_rewards[-10:]))
            print('Last-10 average reward: %f' % np.mean(all_rewards[-10:]))

        # Update the target network, copying all weights and biases in the DQN
        if frame_idx % target_update == 0:
            target_model.load_state_dict(policy_model.state_dict())

        # Save a checkpoint after every million frames
        if frame_idx % 1000000 == 0:
            model_filename = "dqn_pong_model_%s" % (frame_idx)
            torch.save(policy_model.state_dict(), model_filename)

    end_training = time.time()
    print(f'Total training time - {(end_training - start_training) / 3600} hours')

    # Save all mean losses
    with open('mean_losses.npy', 'wb') as losses_file:
        np.save(losses_file, np.array(mean_losses))

    # Save all mean rewards
    with open('mean_rewards.npy', 'wb') as rewards_file:
        np.save(rewards_file, np.array(mean_rewards))

    # Save the final policy model
    torch.save(policy_model.state_dict(), "dqn_pong_model_final")
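# epsilon_by_frame() is called inside play_to_train() but not shown. A common choice in
# DQN examples is an exponential decay from a start value to a final value; the schedule
# and constants below are illustrative assumptions, not necessarily those used above.
import math

epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 30000  # frames over which epsilon decays (assumed value)


def epsilon_by_frame(frame_idx):
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)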
action = model.act(state, epsilon)
next_state, reward, done, _ = env.step(action)
replay_buffer.push(state, action, reward, next_state, done)
state = next_state
episode_reward += reward

if done:  # game over: save the episode reward and reset
    state = env.reset()
    all_rewards.append((frame_idx, episode_reward))
    episode_reward = 0

if len(replay_buffer) > replay_initial:  # start training once the buffer is past its initial fill
    # the target model is used inside the loss to stabilize training
    loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append((frame_idx, loss.data.cpu().numpy()))

if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
    print('#Frame: %d, preparing replay buffer' % frame_idx)

if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
    print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
    print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])

if frame_idx % 50000 == 0:
    target_model.copy_from(model)  # periodically sync the target model with the online model
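# target_model.copy_from(model) is used above to refresh the target network. In most DQN
# implementations this simply copies the online network's parameters into the target
# network; a minimal sketch of such a method, assuming both models share the same
# architecture, is shown below (the class name and layers are placeholders).
import torch.nn as nn


class DQN(nn.Module):  # name assumed for illustration
    # ... convolutional and fully connected layers would be defined here ...

    def copy_from(self, source_model):
        # Overwrite this network's weights with the online network's weights
        self.load_state_dict(source_model.state_dict())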