Example #1
        # Apply the chosen action (classic Gym step API with four return values).
        state, reward, is_done, info = env.step(action)

        # Q-values of the next state, evaluated without tracking gradients.
        with torch.no_grad():
            qs2 = model(torch.FloatTensor([state.flatten()]))[0]

        # One-step TD target with discount factor 0.9; do not bootstrap past a terminal state.
        target = reward + 0.9 * qs2.amax() * (0.0 if is_done else 1.0)
        loss = (target - qs[action]) ** 2

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Per-step metrics collected in an omegaconf DictConfig for dot-style access.
        log = DictConfig({"episode": episode})
        log.ep_loss = loss.item()

        cumulative_reward += reward
        log.cumulative_reward = cumulative_reward

        rewards.append(reward)
        if must_record:
            # Keep a copy of the rendered frame for the end-of-episode video.
            video_buffer.append(deepcopy(env.render("rgb_array")))
        if is_done:
            log.ep_mean_reward = float(np.mean(rewards))
            log.ep_length = len(rewards)
            if must_record:
                # Convert to a plain dict so the wandb.Video object can be attached
                # (OmegaConf containers only accept primitive values).
                log = dict(log)
                log[f"video_ep{episode}_reward{reward}"] = wandb.Video(
                    _format_video(video_buffer), fps=4, format="gif")

        wandb.log(log)
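
The excerpt above is the per-step body of a deep Q-learning loop; the model, optimizer, environment and episode bookkeeping it references are defined earlier in the original file. Below is a minimal sketch of what that setup could look like, assuming a small feed-forward Q-network and the classic Gym API (env.step returning four values, env.render("rgb_array")); the environment name, layer sizes, learning rate and episode/recording counts are illustrative guesses, not taken from the source.

import gym
import numpy as np
import torch
import wandb
from copy import deepcopy
from omegaconf import DictConfig

# wandb.init(...) is assumed to have been called before the training loop.
env = gym.make("CartPole-v1")                        # illustrative environment
obs_dim = int(np.prod(env.observation_space.shape))
n_actions = env.action_space.n

# Feed-forward Q-network: flattened state -> one Q-value per action.
model = torch.nn.Sequential(
    torch.nn.Linear(obs_dim, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, n_actions),
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_episodes, record_every = 500, 50                 # illustrative values
cumulative_reward = 0.0

for episode in range(num_episodes):
    state = env.reset()
    rewards, video_buffer = [], []
    must_record = episode % record_every == 0
    is_done = False
    while not is_done:
        qs = model(torch.FloatTensor([state.flatten()]))[0]
        action = int(qs.argmax())                    # greedy; an epsilon-greedy rule is more typical
        # ... the logged excerpt above continues from here ...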
Example #2
        # Sample an action from the policy's output distribution.
        action = int(torch.multinomial(prob[0], num_samples=1)[0])

        state, reward, is_done, info = env.step(action)

        rewards.append(reward)
        probabilities.append(prob[0][action])
        if must_record:
            video_buffer.append(deepcopy(env.render("rgb_array")))

    # Episode finished: build the policy-gradient loss from the collected per-step
    # rewards and action probabilities (raw probabilities here, not log-probabilities).
    rewards = torch.FloatTensor(rewards)
    probabilities = torch.stack(probabilities)
    loss_terms = -1 * rewards * probabilities

    loss = torch.mean(loss_terms)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    log = DictConfig({"episode": episode})
    log.ep_loss = loss.item()
    log.ep_mean_reward = rewards.mean().item()
    log.ep_length = len(rewards)

    if must_record:
        log = dict(log)  # plain dict so the wandb.Video object can be attached
        log[f"video_ep{episode}_reward{reward}"] = wandb.Video(
            _format_video(video_buffer), fps=4, format="gif")

    wandb.log(log)
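
This second excerpt covers the sampling step and the end-of-episode update of a REINFORCE-style policy-gradient loop; the policy network that produces prob is not shown in the source. A minimal sketch of the assumed surrounding code, where the network name (model), its architecture, and the episode/recording counts are illustrative guesses:

import gym
import numpy as np
import torch
import wandb
from copy import deepcopy
from omegaconf import DictConfig

env = gym.make("CartPole-v1")                        # illustrative environment
obs_dim = int(np.prod(env.observation_space.shape))
n_actions = env.action_space.n

# Hypothetical policy network: flattened state -> action probabilities.
model = torch.nn.Sequential(
    torch.nn.Linear(obs_dim, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, n_actions),
    torch.nn.Softmax(dim=-1),
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for episode in range(500):                           # episode count is illustrative
    state = env.reset()
    rewards, probabilities, video_buffer = [], [], []
    must_record = episode % 50 == 0                  # recording cadence is illustrative
    is_done = False
    while not is_done:
        prob = model(torch.FloatTensor([state.flatten()]))
        # ... the excerpt above continues from here: sample an action, step the
        # environment, collect rewards and probabilities, then update the policy ...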