Example #1
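        # Apply the chosen action; the classic (pre-0.26) Gym step API returns
        # the next observation (overwriting state), reward, done flag, and info dict.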
        state, reward, is_done, info = env.step(action)

        # Q-values of the successor state, evaluated without tracking gradients;
        # only the current-state prediction qs[action] is trained.
        with torch.no_grad():
            qs2 = model(torch.FloatTensor([state.flatten()]))[0]

        # TD target with discount factor 0.9; bootstrap from the best
        # next-state Q-value only while the episode is still running.
        target = reward + 0.9 * qs2.amax() * (1 - is_done)
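        # Squared TD error on the chosen action only.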
        loss = (target - qs[action])**2

        # Standard PyTorch update: clear old gradients, backpropagate the
        # TD error, and apply one optimizer step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Collect per-step metrics in an OmegaConf DictConfig for wandb.
        log = DictConfig({"episode": episode})
        log.ep_loss = loss.item()

        cumulative_reward += reward
        log.cumulative_reward = cumulative_reward

        rewards.append(reward)
        if must_record:
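            # Store a defensive copy of the rendered RGB frame.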
            video_buffer.append(deepcopy(env.render("rgb_array")))
        if is_done:
            log.ep_mean_reward = float(np.mean(rewards))
            log.ep_length = len(rewards)
            if must_record:
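                # DictConfig cannot hold arbitrary objects, so switch to a
                # plain dict before attaching the wandb.Video.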
                log = dict(log)
                log[f"video_ep{episode}_reward{reward}"] = wandb.Video(
                    _format_video(video_buffer), fps=4, format="gif")

        wandb.log(log)
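For context, the fragment above is the inner step of a Q-learning training loop and relies on names defined earlier: env, model, optimizer, the current-state Q-values qs, the chosen action, and the bookkeeping variables. Below is a minimal sketch of that surrounding setup, assuming the pre-0.26 Gym API implied by the four-value env.step, a small fully connected Q-network, and epsilon-greedy action selection; the environment name, architecture, hyperparameters, and wandb project name are illustrative guesses, not part of the original example.

from copy import deepcopy

import gym
import numpy as np
import torch
import wandb
from omegaconf import DictConfig

env = gym.make("CartPole-v1")  # assumed environment
n_obs = int(np.prod(env.observation_space.shape))
model = torch.nn.Sequential(   # assumed Q-network architecture
    torch.nn.Linear(n_obs, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, env.action_space.n),
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

wandb.init(project="q-learning-demo")  # assumed project name
cumulative_reward = 0.0

for episode in range(500):  # assumed episode budget
    state = env.reset()
    rewards, video_buffer = [], []
    must_record = episode % 50 == 0  # assumed recording schedule
    is_done = False
    while not is_done:
        # Q-values of the current state; qs[action] feeds the loss above.
        qs = model(torch.FloatTensor([state.flatten()]))[0]
        # Epsilon-greedy selection (the fragment does not show this step).
        if np.random.rand() < 0.1:
            action = env.action_space.sample()
        else:
            action = int(qs.argmax())
        # ... the update and logging step shown above goes here ...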