import math

import torch
import torch.optim as optim

# make_atari / wrap_deepmind / wrap_pytorch and the QLearner / ReplayBuffer
# classes are assumed to come from the project's wrappers and dqn modules
# (the module names here are an assumption; adjust to the project layout).
from wrappers import make_atari, wrap_deepmind, wrap_pytorch
from dqn import QLearner, ReplayBuffer

USE_CUDA = torch.cuda.is_available()

env_id = "PongNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env)   # DeepMind preprocessing: grayscale, 84x84 frames, clipped rewards
env = wrap_pytorch(env)    # channels-first observations for PyTorch

num_frames = 1000000   # environment frames for this run
batch_size = 32
gamma = 0.99           # discount factor

replay_initial = 10000                # warm-up frames before learning starts
replay_buffer = ReplayBuffer(100000)  # replay capacity, in transitions
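# The replay buffer stores (state, action, reward, next_state, done)
# transitions; sampling minibatches uniformly from it decorrelates
# consecutive updates, which stabilizes Q-learning.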

model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
model.load_state_dict(torch.load('trained_model.pth', map_location='cpu'))
model.eval()   # inference only: no dropout/batch-norm updates
if USE_CUDA:
    model = model.cuda()
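
# A minimal evaluation rollout sketch (QLearner.act and its (state, epsilon)
# signature are assumptions based on common DQN tutorial code, not the source):
#
#     state = env.reset()
#     done, episode_reward = False, 0.0
#     while not done:
#         action = model.act(state, epsilon_final)
#         state, reward, done, _ = env.step(action)
#         episode_reward += reward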

epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 30000
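
# Under the exponential schedule
# epsilon_final + (epsilon_start - epsilon_final) * exp(-frame / epsilon_decay),
# these values give epsilon ~ 0.37 at frame 30000 and ~ 0.017 by frame 150000.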

losses = []        # per-update training losses
all_rewards = []   # per-episode returns
episode_reward = 0

state = env.reset()

# Training hyperparameters. Note the much slower epsilon decay than the
# schedule above, plus a target-network sync interval.
gamma = 0.99
target_update = 50000        # frames between target-network syncs
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 1000000
replay_initial = 10000       # warm-up frames before gradient updates begin
learning_rate = 1e-5
train_num_frames = 1000000   # the original never defined this; assumed equal
                             # to num_frames above
train_replay_buffer = ReplayBuffer(100000)     # feeds gradient updates
analysis_replay_buffer = ReplayBuffer(100000)  # kept separate from training
                                               # data (for analysis, per its name)

policy_model = QLearner(env, train_num_frames, batch_size, gamma,
                        train_replay_buffer)
target_model = QLearner(env, train_num_frames, batch_size, gamma,
                        train_replay_buffer)
target_model.load_state_dict(policy_model.state_dict())  # start the two nets in sync
target_model.eval()   # the target network is never trained directly
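
# The frozen target network supplies the bootstrap value in the TD target
# r + gamma * max_a Q_target(s', a); holding it fixed between syncs (every
# target_update frames) keeps the regression target from chasing itself.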

optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

# .to(device) is already a no-op on CPU, so no separate USE_CUDA guard is needed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy_model = policy_model.to(device)
target_model = target_model.to(device)

def epsilon_by_frame(frame_idx):
    # Exponential decay from epsilon_start toward epsilon_final.
    return epsilon_final + (epsilon_start - epsilon_final) * \
        math.exp(-1.0 * frame_idx / epsilon_decay)
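
# For example, epsilon_by_frame(0) == 1.0 and, with epsilon_decay = 1000000,
# epsilon_by_frame(1000000) is about 0.37.

# A minimal sketch of the single TD update the training loop below is built
# around. compute_td_loss, the buffer's sample() return layout, and the use
# of a Huber loss are assumptions, not taken from the source.
import numpy as np
import torch.nn.functional as F

def compute_td_loss(batch_size, buffer):
    state, action, reward, next_state, done = buffer.sample(batch_size)

    state = torch.as_tensor(np.float32(state), device=device)
    next_state = torch.as_tensor(np.float32(next_state), device=device)
    action = torch.as_tensor(action, dtype=torch.long, device=device)
    reward = torch.as_tensor(reward, dtype=torch.float32, device=device)
    done = torch.as_tensor(done, dtype=torch.float32, device=device)

    # Q(s, a) from the online network, for the actions actually taken.
    q_value = policy_model(state).gather(1, action.unsqueeze(1)).squeeze(1)
    # max_a Q_target(s', a), detached so gradients never reach the target net.
    next_q_value = target_model(next_state).max(1)[0].detach()
    expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = F.smooth_l1_loss(q_value, expected_q_value)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss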


def play_to_train(num_frames, policy_model, target_model, buffer):
    # Collect experience with an epsilon-greedy policy for num_frames steps,
    # filling the replay buffer and updating the policy network against the
    # target network.
    losses = []
    all_rewards = []
    mean_losses = []