Example #1
    next_state, reward, done, _ = env.step(action)  # get the next state and reward
    replay_buffer.push(state, action, reward, next_state, done)  # push the transition results to the buffer

    state = next_state
    episode_reward += reward

    if done:  # episode over: record the reward and reset the game
        state = env.reset()
        all_rewards.append((frame_idx, episode_reward))
        episode_reward = 0

    # once the buffer holds enough transitions, compute the loss and update the model
    if len(replay_buffer) > replay_initial:
        loss = compute_td_loss(model, target_model, batch_size, gamma,
                               replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append((frame_idx, loss.data.cpu().numpy()))

    # the next two if-blocks only report progress
    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: %d, preparing replay buffer' % frame_idx)

    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])

    if frame_idx % 50000 == 0:
        target_model.copy_from(model)  # sync the target model with the online model
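
None of these examples show the replay buffer class they push into and sample from. The sketch below is one minimal way to provide the push / sample / __len__ interface used above; the deque-of-tuples internals and the NumPy return format of sample() are assumptions, not the original implementation.

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Fixed-size FIFO store of (state, action, reward, next_state, done) transitions.

    A sketch only; the buffer implementation used by the examples is not shown.
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        # Append one transition; the oldest is dropped once capacity is reached.
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a minibatch and stack each field into an array.
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (np.stack(states), np.array(actions), np.array(rewards, dtype=np.float32),
                np.stack(next_states), np.array(dones, dtype=np.float32))

    def __len__(self):
        return len(self.buffer)
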
Example #2
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    # if the game is over
    if done:
        state = env.reset()
        all_rewards.append((frame_idx, episode_reward))  # record the reward for that game
        episode_reward = 0  # reset

    # Once the replay buffer has filled up enough
    if len(replay_buffer) > replay_initial:
        loss = compute_td_loss(model, target_model, batch_size, gamma,
                               replay_buffer)  # calculate the TD loss for a sampled batch
        optimizer.zero_grad()  # reset gradient values
        loss.backward()  # backpropagate the loss
        optimizer.step()  # update the weight values
        losses.append((frame_idx, loss.data.cpu().numpy()))  # record the loss

    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: %d, preparing replay buffer' % frame_idx)
        torch.save(model.state_dict(), "run11_start.pth")

    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])
        filename = "run11_model" + str(frame_idx) + ".pth"
        torch.save(model.state_dict(), filename)  # checkpoint the model
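
Examples #1 and #2 (and #4 and #5 below) call compute_td_loss(model, target_model, batch_size, gamma, replay_buffer) without defining it. The sketch below is one standard double-network DQN TD loss; it assumes the buffer's sample() returns NumPy arrays as in the ReplayBuffer sketch above, so the tensor conversions and the mse_loss choice are assumptions, not the original code.

import torch
import torch.nn.functional as F


def compute_td_loss(model, target_model, batch_size, gamma, replay_buffer, device="cpu"):
    # A sketch of the TD loss used above; dtypes and loss choice are assumptions.
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state = torch.as_tensor(state, dtype=torch.float32, device=device)
    next_state = torch.as_tensor(next_state, dtype=torch.float32, device=device)
    action = torch.as_tensor(action, dtype=torch.int64, device=device)
    reward = torch.as_tensor(reward, dtype=torch.float32, device=device)
    done = torch.as_tensor(done, dtype=torch.float32, device=device)

    # Q(s, a) from the online network for the actions actually taken.
    q_value = model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # Bootstrapped target from the frozen target network; no gradient flows through it.
    with torch.no_grad():
        next_q_value = target_model(next_state).max(1)[0]
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    return F.mse_loss(q_value, expected_q_value)
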
Example #3
        agent.epsilon = linear_decay(init_epsilon, final_epsilon, step,
                                     decay_steps)

        # play
        _, state = play_and_record(state, agent, env, exp_replay,
                                   timesteps_per_epoch)

        # train
        obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(
            batch_size)

        loss = compute_td_loss(obs_batch,
                               act_batch,
                               reward_batch,
                               next_obs_batch,
                               is_done_batch,
                               agent,
                               target_network,
                               device=device)

        loss.backward()
        #grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
        opt.step()
        opt.zero_grad()

        #if step % loss_freq == 0:
        #td_loss_history.append(loss.data.cpu().item())
        #grad_norm_history.append(grad_norm)

        if step % refresh_target_network_freq == 0:
            # Load agent weights into target_network
            target_network.load_state_dict(agent.state_dict())
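
Example #3 anneals epsilon with linear_decay(init_epsilon, final_epsilon, step, decay_steps), which the snippet does not define. A plausible minimal version is sketched below; the linear-then-constant schedule is an assumption based on the name.

def linear_decay(init_val, final_val, step, total_steps):
    # Assumed schedule: interpolate linearly from init_val to final_val,
    # then hold at final_val once total_steps is reached.
    if step >= total_steps:
        return final_val
    return init_val + (final_val - init_val) * step / total_steps
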
Example #4
def play_to_train(num_frames, policy_model, target_model, buffer):
    losses = []
    all_rewards = []
    mean_losses = []
    mean_rewards = []
    episode_reward = 0
    state = env.reset()

    start_training = time.time()
    for frame_idx in range(1, num_frames + 1):
        epsilon = epsilon_by_frame(frame_idx)
        action = policy_model.act(state, epsilon)

        next_state, reward, done, _ = env.step(action)
        buffer.push(state, action, reward, next_state, done)

        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0

        if len(buffer) > replay_initial:
            loss = compute_td_loss(policy_model, target_model, batch_size,
                                   gamma, buffer, device)
            optimizer.zero_grad()
            loss.backward()
            for param in policy_model.parameters():
                param.grad.data.clamp_(-1, 1)
            optimizer.step()
            losses.append(loss.data.cpu().numpy())

        if frame_idx % 10000 == 0 and len(buffer) <= replay_initial:
            print('#Frame: %d, preparing replay buffer' % frame_idx)

        if frame_idx % 10000 == 0 and len(buffer) > replay_initial:
            mean_losses.append(np.mean(losses))
            print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses)))
            mean_rewards.append(np.mean(all_rewards[-10:]))
            print('Last-10 average reward: %f' % np.mean(all_rewards[-10:]))

        # Update the target network, copying all weights and biases in DQN
        if frame_idx % target_update == 0:
            target_model.load_state_dict(policy_model.state_dict())

        # Saving checkpoints after every million frames
        if frame_idx % 1000000 == 0:
            model_filename = "dqn_pong_model_%s" % (frame_idx)
            torch.save(policy_model.state_dict(), model_filename)

    end_training = time.time()

    print(
        f'Total training time - {(end_training - start_training) / 3600} hours'
    )

    # Save all mean losses
    with open('mean_losses.npy', 'wb') as losses_file:
        np.save(losses_file, np.array(mean_losses))

    # Save all mean rewards
    with open('mean_rewards.npy', 'wb') as rewards_file:
        np.save(rewards_file, np.array(mean_rewards))

    # Save the final policy model
    torch.save(policy_model.state_dict(), "dqn_pong_model_final")
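
Example #4 reads its exploration rate from epsilon_by_frame(frame_idx), which is not shown. A common choice is an exponential decay toward a small floor, sketched below; the constants are placeholders, not values taken from the original run.

import math

# Placeholder constants; the original schedule's values are not shown in the snippet.
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 30000


def epsilon_by_frame(frame_idx):
    # Decay epsilon exponentially from epsilon_start toward epsilon_final.
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)
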
Example #5
    action = model.act(state, epsilon)  # epsilon-greedy action selection

    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:  # game over: record the episode reward and reset
        state = env.reset()
        all_rewards.append((frame_idx, episode_reward))
        episode_reward = 0

    if len(replay_buffer) > replay_initial:  # train once the buffer holds enough transitions
        loss = compute_td_loss(model, target_model, batch_size, gamma,
                               replay_buffer)  # the target model stabilizes the TD targets used by the loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append((frame_idx, loss.data.cpu().numpy()))

    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: %d, preparing replay buffer' % frame_idx)

    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])

    if frame_idx % 50000 == 0:
        target_model.copy_from(model)
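
Examples #1 and #5 sync the target network with target_model.copy_from(model), which is not a built-in torch.nn.Module method. One way to provide it is a thin wrapper around load_state_dict, matching what Example #4 does directly; the sketch below assumes that equivalence.

import torch.nn as nn


class DQN(nn.Module):
    # Network layers and forward() omitted; only the assumed sync helper is sketched.

    def copy_from(self, other):
        # Copy every weight and bias from the online network into this target network.
        self.load_state_dict(other.state_dict())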