import os

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F

# --- inside the per-iteration evaluation loop (epoch `e`, iteration `i`) ---

# Computing actions by using FDDR
delta = fddr(fragments, running_mean=mean, running_var=var).double().squeeze(-1)

# Computing reward: accumulated return of the positions minus transaction costs
pad_delta = F.pad(delta, [1, 0])                   # prepend delta_0 = 0 (start flat)
delta_diff = pad_delta[:, 1:] - pad_delta[:, :-1]  # position changes delta_t - delta_{t-1}
reward = torch.sum(delta * returns - c * torch.abs(delta_diff))
test_reward_meter.append(reward.item())

progress_bar.set_description(
    '[Epoch %d][Iteration %d][Reward: train = %.4f, test = %.4f]'
    % (e, i, train_reward_meter.get_average(-1), test_reward_meter.get_average(-1)))

# --- once per epoch ---
if e % save_per_epoch == 0:
    torch.save(fddr.state_dict(), os.path.join(log_src, 'fddrl.pkl'))
train_reward_meter.step()
test_reward_meter.step()

# --- after training ---
# Save the model and reward history
torch.save(fddr.state_dict(), os.path.join(log_src, 'fddrl.pkl'))
np.save(os.path.join(log_src, 'fddrl_reward.npy'), train_reward_meter.get_average())

# Plot the train/test reward curves
plt.plot(train_reward_meter.get_average())
plt.plot(test_reward_meter.get_average())
plt.show()
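# The reward above implements R = sum_t (delta_t * r_t - c * |delta_t - delta_{t-1}|):
# each position delta_t earns the period return r_t, and every change of position
# pays a proportional transaction cost c; the F.pad trick supplies delta_{-1} = 0.
# A minimal self-contained sketch of the same computation on toy tensors (all
# `toy_*` names are illustrative, not part of the model code):
import torch
import torch.nn.functional as F

toy_delta = torch.tensor([[0.0, 1.0, 1.0, -1.0]])        # positions, shape (batch, T)
toy_returns = torch.tensor([[0.01, -0.02, 0.03, 0.01]])  # per-step returns r_t
toy_c = 0.001                                            # transaction-cost coefficient

toy_pad = F.pad(toy_delta, [1, 0])                       # prepend delta_0 = 0 (start flat)
toy_turnover = torch.abs(toy_pad[:, 1:] - toy_pad[:, :-1])
toy_reward = torch.sum(toy_delta * toy_returns - toy_c * toy_turnover)
print(toy_reward.item())                                 # approx. -0.003 for this toy data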
# --- inside the per-iteration training loop (epoch `e`, iteration `i`) ---

# Computing actions by using DRL
delta = drl(fragments).double().squeeze(-1)

# Computing reward: accumulated return of the positions minus transaction costs
pad_delta = F.pad(delta, [1, 0])                   # prepend delta_0 = 0 (start flat)
delta_diff = pad_delta[:, 1:] - pad_delta[:, :-1]  # position changes delta_t - delta_{t-1}
reward = torch.sum(delta * returns - c * torch.abs(delta_diff))

# Updating DRL: gradient ascent on the reward (descent on its negation)
optimizer.zero_grad()
(-reward).backward()
optimizer.step()

# Recording and showing the information
reward_meter.append(reward.item())
progress_bar.set_description(
    '[Epoch %d][Iteration %d][Reward: %.4f]'
    % (e, i, reward_meter.get_average(-1)))
progress_bar.update()

# --- once per epoch ---
if e % save_per_epoch == 0:
    torch.save(drl.state_dict(), os.path.join(log_src, 'drl.pkl'))
reward_meter.step()

# --- after training ---
# Save the model and reward history
torch.save(drl.state_dict(), os.path.join(log_src, 'drl.pkl'))
np.save(os.path.join(log_src, 'drl_reward.npy'), reward_meter.get_average())

# Plot the reward curve
plt.plot(reward_meter.get_average())
plt.show()
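# The reward meters used above (append / step / get_average) are defined elsewhere
# and their implementation is not shown in this section. A minimal sketch of a
# compatible helper, based purely on the assumed interface: append() records one
# iteration's reward, step() closes an epoch, get_average() returns the per-epoch
# means, and get_average(-1) the running mean of the epoch in progress:
class AverageMeterSketch:
    def __init__(self):
        self._current = []    # rewards appended during the current epoch
        self._averages = []   # one mean reward per finished epoch

    def append(self, value):
        self._current.append(value)

    def step(self):
        # close the current epoch and reset the buffer
        if self._current:
            self._averages.append(sum(self._current) / len(self._current))
            self._current = []

    def get_average(self, index=None):
        if index is None:
            return self._averages                           # full per-epoch history
        if index == -1 and self._current:
            return sum(self._current) / len(self._current)  # epoch in progress
        return self._averages[index]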