losses = []
all_rewards = []
episode_reward = 0

state = env.reset()
for frame_idx in range(1, num_frames + 1):
    # num_frames can exceed the replay buffer capacity, but the buffer does not
    # grow without bound: assuming the usual deque-based implementation, each
    # push past capacity evicts the oldest transition. (np.expand_dims in push
    # only adds a batch dimension to the state; it does not resize the buffer.
    # See the sketch after this cell.)
    #print("Frame: " + str(frame_idx))
    epsilon = epsilon_by_frame(frame_idx)  # get the epsilon value for this frame
    action = model.act(state, epsilon)  # this is where the act function is used
    next_state, reward, done, _ = env.step(action)  # step the env to see if the action yields a reward
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:  # the game is over
        state = env.reset()
        all_rewards.append((frame_idx, episode_reward))  # record the reward for that game
        episode_reward = 0  # reset

    # Once the replay buffer has filled up enough
    if len(replay_buffer) > replay_initial:
        loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append((frame_idx, loss.data.cpu().numpy()))
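# To make the capacity question above concrete, here is a minimal sketch of the
# deque-based ReplayBuffer these cells appear to assume (the actual class is
# defined elsewhere; the names ReplayBuffer/push/sample match the calls above,
# but the body is illustrative, not the canonical implementation).
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, capacity):
        # Fixed capacity: once full, appending evicts the oldest transition
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        # expand_dims adds a leading batch dimension so states concatenate
        # cleanly later; it has nothing to do with growing the buffer itself
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniform random mini-batch, unzipped into per-field tuples
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(next_state), done

    def __len__(self):
        return len(self.buffer)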
losses = []
all_rewards = []
episode_reward = 0

state = env.reset()  # initial state
for frame_idx in range(1, num_frames + 1):  # play until the player or the model reaches a score of 21
    #print("Frame: " + str(frame_idx))  # uncomment to look at frames
    epsilon = epsilon_by_frame(frame_idx)
    action = model.act(state, epsilon)  # act() is written in a later cell
    next_state, reward, done, _ = env.step(action)  # get the next state
    replay_buffer.push(state, action, reward, next_state, done)  # push the action's results to the buffer

    state = next_state
    episode_reward += reward

    if done:  # reset the game and record the episode reward
        state = env.reset()
        all_rewards.append((frame_idx, episode_reward))
        episode_reward = 0

    if len(replay_buffer) > replay_initial:  # once enough plays are buffered, compute the loss and update the model
        loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append((frame_idx, loss.data.cpu().numpy()))
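# The loop above calls compute_td_loss but its definition is not in this cell.
# Below is a sketch of the standard one-step DQN loss with a frozen target
# network, which is what the (model, target_model, batch_size, gamma,
# replay_buffer) signature suggests; treat it as an assumption, not the
# project's exact implementation.
import numpy as np
import torch
import torch.nn.functional as F


def compute_td_loss(model, target_model, batch_size, gamma, replay_buffer):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state = torch.FloatTensor(np.float32(state))
    next_state = torch.FloatTensor(np.float32(next_state))
    action = torch.LongTensor(action)
    reward = torch.FloatTensor(reward)
    done = torch.FloatTensor(done)

    # Q(s, a) for the actions actually taken in the sampled transitions
    q_value = model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # Bootstrapped target r + gamma * max_a' Q_target(s', a'); zero the
    # bootstrap term at terminal states so the target is just the reward
    next_q_value = target_model(next_state).max(1)[0]
    expected_q_value = reward + gamma * next_q_value * (1 - done)

    # Detach the target so gradients only flow through the online network
    return F.mse_loss(q_value, expected_q_value.detach())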
# Negative-exponential schedule: start out exploring, then shift to exploiting as frame_idx grows
epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

losses = []
all_rewards = []
episode_reward = 0
state = env.reset()  # initial state
best_mean_reward = float('-inf')

for frame_idx in range(starting_frame, num_frames + 1):  # one iteration per frame played
    epsilon = epsilon_by_frame(frame_idx)  # epsilon decreases as more frames are played
    action = model.act(state, epsilon)  # if rand < epsilon, explore; else take the action with max Q-value. action: int
    next_state, reward, done, _ = env.step(action)  # env info after the action. next_state: 2D int array, reward: float, done: bool
    replay_buffer.push(state, action, reward, next_state, done)  # save the transition to the buffer (note: every frame)
    state = next_state  # move to the next state
    episode_reward += reward  # keep accumulating rewards until the episode ends

    if done:  # episode over
        state = env.reset()  # restart the game
        all_rewards.append((frame_idx, episode_reward))  # store episode_reward with the frame it ended on
        episode_reward = 0

    if len(replay_buffer) > replay_initial:  # if there are enough frames in the replay buffer (10000)
        loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
        optimizer.zero_grad()  # reset the gradients for every mini-batch
        loss.backward()
        optimizer.step()
        losses.append((frame_idx, loss.data.cpu().numpy()))
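# model.act is called with (state, epsilon) in every variant of the loop but
# defined elsewhere. A minimal epsilon-greedy sketch consistent with that call
# is below; self.num_actions is a hypothetical attribute standing in for
# however the real model exposes its action count.
import random

import numpy as np
import torch


def act(self, state, epsilon):
    # With probability epsilon explore (uniform random action);
    # otherwise exploit the current Q estimates
    if random.random() > epsilon:
        with torch.no_grad():
            state_t = torch.FloatTensor(np.float32(state)).unsqueeze(0)  # add batch dim
            q_values = self.forward(state_t)
            action = q_values.max(1)[1].item()  # argmax over actions
    else:
        action = random.randrange(self.num_actions)
    return action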