# One environment step inside the episode loop.
state, reward, is_done, info = env.step(action)

# Bootstrapped target: the next state's best Q-value, with gradients disabled.
with torch.no_grad():
    qs2 = model(torch.FloatTensor([state.flatten()]))[0]
target = reward + 0.9 * qs2.amax()

# Squared TD error on the Q-value of the action just taken.
loss = (target - qs[action]) ** 2
optimizer.zero_grad()
loss.backward()
optimizer.step()

log = DictConfig({"episode": episode})
log.ep_loss = loss.item()
cumulative_reward += reward
log.cumulative_reward = cumulative_reward
rewards.append(reward)

if must_record:
    video_buffer.append(deepcopy(env.render("rgb_array")))

if is_done:
    log.ep_mean_reward = float(np.mean(rewards))
    log.ep_length = len(rewards)
    if must_record:
        # Convert to a plain dict: OmegaConf cannot hold a wandb.Video object.
        log = dict(log)
        log[f"video_ep{episode}_reward{reward}"] = wandb.Video(
            _format_video(video_buffer), fps=4, format="gif")

wandb.log(log)
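The listing picks up mid-loop: `qs`, `action`, `must_record`, `video_buffer`, and `cumulative_reward` are all set earlier in the episode loop, which isn't shown. A minimal sketch of one plausible surrounding loop, assuming an epsilon-greedy policy; `NUM_EPISODES`, `RECORD_EVERY`, and `EPSILON` are hypothetical names, while `env`, `model`, and `optimizer` come from the earlier setup:

import numpy as np
import torch
import wandb
from copy import deepcopy
from omegaconf import DictConfig

NUM_EPISODES = 500   # hypothetical
RECORD_EVERY = 50    # hypothetical
EPSILON = 0.1        # hypothetical

for episode in range(NUM_EPISODES):
    state = env.reset()
    rewards, cumulative_reward = [], 0.0
    video_buffer, must_record = [], episode % RECORD_EVERY == 0
    is_done = False
    while not is_done:
        # Forward pass for the current state; qs[action] is reused in the TD loss.
        qs = model(torch.FloatTensor([state.flatten()]))[0]
        # Epsilon-greedy exploration (an assumption, not confirmed by the excerpt).
        if np.random.rand() < EPSILON:
            action = env.action_space.sample()
        else:
            action = int(qs.argmax())
        # ... the step/update/logging body shown above runs here ...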
# Policy-gradient variant: per step, sample the action from the policy's
# output probabilities and remember the probability of the action taken.
action = int(torch.multinomial(prob[0], num_samples=1)[0])
state, reward, is_done, info = env.step(action)
rewards.append(reward)
probabilities.append(prob[0][action])
if must_record:
    video_buffer.append(deepcopy(env.render("rgb_array")))

# Once the episode ends, update on the whole trajectory at once.
rewards = torch.FloatTensor(rewards)
probabilities = torch.stack(probabilities)
# Negative mean of each step's reward weighted by the probability the
# policy assigned to the action actually taken.
loss_terms = -1 * rewards * probabilities
loss = torch.mean(loss_terms)
optimizer.zero_grad()
loss.backward()
optimizer.step()

log = DictConfig({"episode": episode})
log.ep_loss = loss.item()
log.ep_mean_reward = rewards.mean().item()
log.ep_length = len(rewards)
if must_record:
    # Convert to a plain dict: OmegaConf cannot hold a wandb.Video object.
    log = dict(log)
    log[f"video_ep{episode}_reward{reward}"] = wandb.Video(
        _format_video(video_buffer), fps=4, format="gif")
wandb.log(log)
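Both listings pass the frame buffer through a `_format_video` helper that isn't shown. `wandb.Video` accepts a numpy array shaped `(time, channels, height, width)`, so a minimal sketch, assuming `env.render("rgb_array")` returns H x W x 3 uint8 frames:

import numpy as np

def _format_video(frames):
    # Stack the H x W x C frames into (T, H, W, C), then move the channel
    # axis forward to the (T, C, H, W) layout wandb.Video expects.
    return np.transpose(np.stack(frames), (0, 3, 1, 2))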