def test(opt, model_dir):
    """Play one episode with a saved model and return its statistics.

    The checkpoint ``<model_dir>/flappy_bird_2000000`` is loaded, and on every
    frame the action with the highest predicted value is taken. The episode
    stops as soon as the game reports a terminal frame or the score reaches 50.

    Args:
        opt: Options object; only ``opt.image_size`` is read here.
        model_dir: Directory that contains the ``flappy_bird_2000000``
            checkpoint.

    Returns:
        A tuple ``(score, frame_number, action_sum)`` where ``score`` is the
        last score reported by the game, ``frame_number`` is the number of
        actions that were played and ``action_sum`` is the sum of the chosen
        action indices.
    """
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)

    # Both devices load the same checkpoint from ``model_dir``; previously the
    # CPU branch ignored ``model_dir`` and read a different file.
    # NOTE: torch.load unpickles the file, so only load trusted checkpoints.
    checkpoint_path = "{}/flappy_bird_2000000".format(model_dir)
    if use_cuda:
        model = torch.load(checkpoint_path)
    else:
        # Remap storages that were saved from a GPU onto the CPU.
        model = torch.load(checkpoint_path,
                           map_location=lambda storage, loc: storage)
    model.eval()
    if use_cuda:
        model.cuda()

    game_state = FlappyBird()

    def to_frame(raw_image):
        """Slice a raw game image, pre-process it and return it as a tensor."""
        frame = pre_processing(
            raw_image[:game_state.screen_width, :int(game_state.base_y)],
            opt.image_size, opt.image_size)
        frame = torch.from_numpy(frame)
        return frame.cuda() if use_cuda else frame

    image, reward, terminal, score = game_state.next_frame(0)
    image = to_frame(image)
    # The initial state repeats the first frame four times.
    # assumes pre_processing returns a (1, H, W) array, giving a
    # (1, 4, H, W) state -- TODO confirm
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]

    frame_number = 0
    action_sum = 0
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        while True:
            frame_number += 1
            prediction = model(state)[0]
            action = torch.argmax(prediction).item()
            action_sum += action

            next_image, reward, terminal, score = game_state.next_frame(action)
            if terminal or score == 50:
                break

            next_image = to_frame(next_image)
            # Drop the oldest frame and append the newest one.
            state = torch.cat(
                (state[0, 1:, :, :], next_image))[None, :, :, :]

    return score, frame_number, action_sum
def test(opt):
    """Run a saved model in the game indefinitely, always acting greedily.

    The checkpoint ``<opt.saved_path>/flappy_bird`` is loaded and, on every
    frame, the action with the highest predicted value is sent to the game.
    The loop has no exit condition, so this function does not return on its
    own; it is meant to be interrupted by the user.

    Args:
        opt: Options object providing ``saved_path`` and ``image_size``.
    """
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)

    # NOTE: torch.load unpickles the file, so only load trusted checkpoints.
    checkpoint_path = "{}/flappy_bird".format(opt.saved_path)
    if use_cuda:
        model = torch.load(checkpoint_path)
    else:
        # Remap storages that were saved from a GPU onto the CPU.
        model = torch.load(checkpoint_path,
                           map_location=lambda storage, loc: storage)
    model.eval()
    if use_cuda:
        model.cuda()

    game_state = FlappyBird()

    def to_frame(raw_image):
        """Slice a raw game image, pre-process it and return it as a tensor."""
        frame = pre_processing(
            raw_image[:game_state.screen_width, :int(game_state.base_y)],
            opt.image_size, opt.image_size)
        frame = torch.from_numpy(frame)
        return frame.cuda() if use_cuda else frame

    image, reward, terminal = game_state.next_frame(0)
    image = to_frame(image)
    # The initial state repeats the first frame four times.
    # assumes pre_processing returns a (1, H, W) array, giving a
    # (1, 4, H, W) state -- TODO confirm
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        while True:
            prediction = model(state)[0]
            # argmax without ``dim`` yields a 0-dim tensor, which cannot be
            # indexed with [0]; .item() extracts the plain Python int.
            action = torch.argmax(prediction).item()

            next_image, reward, terminal = game_state.next_frame(action)
            next_image = to_frame(next_image)
            # Drop the oldest frame and append the newest one.
            state = torch.cat(
                (state[0, 1:, :, :], next_image))[None, :, :, :]
def train(opt):
    """Train a Double-DQN agent on the game and log progress to TensorBoard.

    An online network picks actions (epsilon-greedy, with epsilon decaying
    linearly from ``opt.initial_epsilon`` to ``opt.final_epsilon``) and is
    optimised with Adam on an MSE loss. A second, frozen target network
    evaluates the next-state action chosen by the online network and is
    refreshed every ``opt.target_update_freq`` iterations.

    Side effects: ``opt.log_path`` is deleted and recreated, scalars are
    written there, and the whole model is saved to
    ``<opt.saved_path>/flappy_bird_<n>`` periodically and to
    ``<opt.saved_path>/flappy_bird`` when training ends.

    Args:
        opt: Options object providing ``log_path``, ``saved_path``,
            ``image_size``, ``num_iters``, ``initial_epsilon``,
            ``final_epsilon``, ``replay_memory_size``, ``batch_size``,
            ``gamma`` and ``target_update_freq``.
    """
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)

    model = DeepQNetwork()
    model_target = DeepQNetwork()

    # Start from an empty log directory.
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
    criterion = nn.MSELoss()
    game_state = FlappyBird()

    if use_cuda:
        model.cuda()
        model_target.cuda()
    # Make the very first targets come from the same weights as the online
    # network instead of from an unrelated random initialisation.
    model_target.load_state_dict(model.state_dict())
    model_target.eval()

    def to_frame(raw_image):
        """Slice a raw game image, pre-process it and return it as a tensor."""
        frame = pre_processing(
            raw_image[:game_state.screen_width, :int(game_state.base_y)],
            opt.image_size, opt.image_size)
        frame = torch.from_numpy(frame)
        return frame.cuda() if use_cuda else frame

    image, reward, terminal, score = game_state.next_frame(0)
    image = to_frame(image)
    # The initial state repeats the first frame four times.
    # assumes pre_processing returns a (1, H, W) array, giving a
    # (1, 4, H, W) state -- TODO confirm
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]

    replay_memory = []
    iteration = 0
    while iteration < opt.num_iters:
        prediction = model(state)[0]

        # Exploration or exploitation: epsilon decays linearly with progress.
        epsilon = opt.final_epsilon + (
            (opt.num_iters - iteration)
            * (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters)
        if random() <= epsilon:
            action = randint(0, 1)
        else:
            action = torch.argmax(prediction).item()

        next_image, reward, terminal, score = game_state.next_frame(action)
        next_image = to_frame(next_image)
        # Drop the oldest frame and append the newest one.
        next_state = torch.cat(
            (state[0, 1:, :, :], next_image))[None, :, :, :]

        replay_memory.append([state, action, reward, next_state, terminal])
        if len(replay_memory) > opt.replay_memory_size:
            del replay_memory[0]

        batch = sample(replay_memory,
                       min(len(replay_memory), opt.batch_size))
        (state_batch, action_batch, reward_batch,
         next_state_batch, terminal_batch) = zip(*batch)

        state_batch = torch.cat(tuple(state_batch))
        # One-hot encode the two possible actions.
        action_batch = torch.from_numpy(
            np.array([[1, 0] if taken == 0 else [0, 1]
                      for taken in action_batch], dtype=np.float32))
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.cat(tuple(next_state_batch))

        if use_cuda:
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        current_prediction_batch = model(state_batch)

        # Targets are constants for the optimisation step. They depend only on
        # the target network and on a non-differentiable argmax, so skipping
        # autograd here leaves the online network's gradients unchanged.
        with torch.no_grad():
            next_prediction_batch = model_target(next_state_batch)
            # Double DQN: the online network selects the next action, the
            # target network evaluates it.
            best_next_actions = torch.argmax(model(next_state_batch), dim=1)
            y_batch = torch.cat(tuple(
                batch_reward if is_terminal
                else batch_reward + opt.gamma * next_q[best_action]
                for batch_reward, is_terminal, next_q, best_action in zip(
                    reward_batch, terminal_batch,
                    next_prediction_batch, best_next_actions)))

        # Q-value of the action that was actually taken in each sample.
        q_value = torch.sum(current_prediction_batch * action_batch, dim=1)

        optimizer.zero_grad()
        loss = criterion(q_value, y_batch)
        loss.backward()
        optimizer.step()

        state = next_state
        if iteration % opt.target_update_freq == 0:
            model_target.load_state_dict(model.state_dict())
        iteration += 1

        if iteration % 100 == 0:
            print(
                "Test::Double Q: Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}"
                .format(iteration + 1, opt.num_iters, action, loss, epsilon,
                        reward, torch.max(prediction)))
        # NOTE(review): the source layout did not show whether these scalars
        # are written every iteration or only together with the print above;
        # they are written every iteration here -- confirm.
        writer.add_scalar('Train/Loss', loss, iteration)
        writer.add_scalar('Train/Epsilon', epsilon, iteration)
        writer.add_scalar('Train/Reward', reward, iteration)
        writer.add_scalar('Train/Q-value', torch.max(prediction), iteration)
        writer.add_scalar('Train/score', score, iteration)

        if (iteration + 1) % 1000000 == 0:
            torch.save(model, "{}/flappy_bird_{}".format(opt.saved_path,
                                                         iteration + 1))

    torch.save(model, "{}/flappy_bird".format(opt.saved_path))
    # Flush pending events to disk.
    writer.close()
def train(opt):
    """Train a DQN agent on the game, optionally logging to Comet ML.

    Actions are chosen epsilon-greedily, with epsilon decaying linearly from
    ``opt.initial_epsilon`` to ``opt.final_epsilon`` over ``opt.num_iters``
    iterations. Every iteration one transition is stored in the replay memory
    and the network is updated on a random batch drawn from it (Adam, MSE
    loss).

    Side effects: progress is printed every iteration; the whole model is
    saved under ``opt.saved_path`` every ``opt.iters_to_save`` iterations and
    once more at the end; when ``opt.log_comet_ml`` is set, hyperparameters,
    metrics and (if ``opt.comet_ml_save_model``) the saved models are sent to
    a Comet ML experiment.

    Args:
        opt: Options object providing ``random_seed``, ``image_size``,
            ``conv_dim``, ``conv_kernel_sizes``, ``conv_strides``, ``fc_dim``,
            ``batch_size``, ``optimizer``, ``gamma``, ``initial_epsilon``,
            ``final_epsilon``, ``num_iters``, ``replay_memory_size``,
            ``iters_to_save``, ``saved_path``, ``log_comet_ml`` and the
            ``comet_ml_*`` settings.
    """
    # Imported locally so the timestamp code does not depend on how (or
    # whether) the enclosing module imports ``datetime``.
    from datetime import datetime

    # Set random seed
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.manual_seed(opt.random_seed)
    else:
        torch.manual_seed(opt.random_seed)

    # Instantiate the model, with a custom architecture only when every
    # architecture option has been given
    architecture_options = (opt.conv_dim, opt.conv_kernel_sizes,
                            opt.conv_strides, opt.fc_dim)
    if all(option is not None for option in architecture_options):
        model = DeepQNetwork(opt.image_size,
                             opt.image_size,
                             conv_dim=opt.conv_dim,
                             conv_kernel_sizes=opt.conv_kernel_sizes,
                             conv_strides=opt.conv_strides,
                             fc_dim=opt.fc_dim)
    else:
        model = DeepQNetwork(opt.image_size, opt.image_size)

    # Learning rate actually handed to the optimizer below
    learning_rate = 1e-6

    if opt.log_comet_ml:
        # Create a Comet ML experiment
        experiment = Experiment(api_key=opt.comet_ml_api_key,
                                project_name=opt.comet_ml_project_name,
                                workspace=opt.comet_ml_workspace)
        experiment.log_other("iters_to_save", opt.iters_to_save)
        experiment.log_other("completed", False)
        experiment.log_other("random_seed", opt.random_seed)
        # Report hyperparameters to Comet ML
        hyper_params = {
            "image_size": opt.image_size,
            "batch_size": opt.batch_size,
            "optimizer": opt.optimizer,
            # NOTE(review): the original value expression was lost from the
            # source text; the rate really used by the optimizer is reported.
            "learning_rate": learning_rate,
            "gamma": opt.gamma,
            "initial_epsilon": opt.initial_epsilon,
            "final_epsilon": opt.final_epsilon,
            "num_iters": opt.num_iters,
            "replay_memory_size": opt.replay_memory_size,
            "random_seed": opt.random_seed,
            "conv_dim": opt.conv_dim,
            "conv_kernel_sizes": opt.conv_kernel_sizes,
            "conv_strides": opt.conv_strides,
            "fc_dim": opt.fc_dim
        }
        experiment.log_parameters(hyper_params)

    # Optimization algorithm
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()  # Loss function
    game_state = FlappyBird()  # Instantiate the Flappy Compass game

    # Move the model to the GPU, if available
    if use_cuda:
        model.cuda()

    def to_frame(raw_image):
        """Slice a raw game image, pre-process it and return it as a tensor."""
        frame = pre_processing(
            raw_image[:game_state.screen_width, :int(game_state.base_y)],
            opt.image_size, opt.image_size)
        frame = torch.from_numpy(frame)
        # Move the image data to the GPU, if available
        return frame.cuda() if use_cuda else frame

    def save_model(step):
        """Save the whole model with a timestamped name; return the path."""
        current_datetime = datetime.now().strftime('%d_%m_%Y_%H_%M')
        model_filename = (
            f'{opt.saved_path}/flappy_compass_{current_datetime}_{step}.pth')
        torch.save(model, model_filename)
        return model_filename

    # Get the first image, along with its reward and an indication of whether
    # it is a terminal state
    image, reward, terminal = game_state.next_frame(0)
    image = to_frame(image)
    # Prepare the state variable, which hosts the last 4 frames (initially
    # four copies of the first one)
    # assumes pre_processing returns a (1, H, W) array, giving a
    # (1, 4, H, W) state -- TODO confirm
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]

    # Replay memory: past transitions (state, action, reward, next state,
    # terminal flag) from which training batches are sampled
    replay_memory = []
    iteration = 0  # Iteration counter

    # Main training loop performing the number of iterations in num_iters
    while iteration < opt.num_iters:
        # Get a prediction from the current state
        prediction = model(state)[0]

        # Probability of a random action, decaying linearly with progress
        epsilon = opt.final_epsilon + (
            (opt.num_iters - iteration)
            * (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters)
        if random() <= epsilon:
            print("Perform a random action")
            action = randint(0, 1)
        else:
            # Use the model's prediction to decide the next action
            action = torch.argmax(prediction).item()

        # Get a new frame and process it
        next_image, reward, terminal = game_state.next_frame(action)
        next_image = to_frame(next_image)
        # Next state: drop the oldest frame and append the newest one
        next_state = torch.cat(
            (state[0, 1:, :, :], next_image))[None, :, :, :]

        # Store the transition in the replay memory
        replay_memory.append([state, action, reward, next_state, terminal])
        if len(replay_memory) > opt.replay_memory_size:
            # Delete the oldest transition once full capacity is exceeded
            del replay_memory[0]

        # Retrieve past transitions from the replay memory
        batch = sample(replay_memory,
                       min(len(replay_memory), opt.batch_size))
        (state_batch, action_batch, reward_batch,
         next_state_batch, terminal_batch) = zip(*batch)

        # States of the current batch
        state_batch = torch.cat(tuple(state_batch))
        # Actions taken in the current batch, one-hot encoded
        action_batch = torch.from_numpy(
            np.array([[1, 0] if taken == 0 else [0, 1]
                      for taken in action_batch], dtype=np.float32))
        # Rewards in the current batch
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])
        # Next states of the current batch
        next_state_batch = torch.cat(tuple(next_state_batch))

        # Move batch data to the GPU, if available
        if use_cuda:
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        # Predictions of the model for the sampled states and next states
        current_prediction_batch = model(state_batch)
        next_prediction_batch = model(next_state_batch)

        # Training targets: the reward alone for terminal transitions,
        # otherwise the reward plus the discounted best next-state value.
        # NOTE(review): the targets are not detached, so loss.backward() also
        # propagates through the next-state predictions; standard DQN treats
        # the target as a constant -- confirm whether this is intended.
        y_batch = torch.cat(tuple(
            batch_reward if is_terminal
            else batch_reward + opt.gamma * torch.max(next_q)
            for batch_reward, is_terminal, next_q in zip(
                reward_batch, terminal_batch, next_prediction_batch)))

        # Predicted Q value of the action actually taken in each transition
        q_value = torch.sum(current_prediction_batch * action_batch, dim=1)

        optimizer.zero_grad()  # Reset the gradients before a new step
        loss = criterion(q_value, y_batch)  # Calculate the loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Weights optimization step

        state = next_state  # Move to the next frame
        iteration += 1

        print(
            "Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}"
            .format(iteration + 1, opt.num_iters, action, loss, epsilon,
                    reward, torch.max(prediction)))

        if opt.log_comet_ml:
            # Log metrics to Comet ML
            experiment.log_metric("train_loss", loss, step=iteration)
            experiment.log_metric("train_epsilon", epsilon, step=iteration)
            experiment.log_metric("train_reward", reward, step=iteration)
            experiment.log_metric("train_Q_value", torch.max(prediction),
                                  step=iteration)

        if (iteration + 1) % opt.iters_to_save == 0:
            # Save model every iters_to_save iterations
            model_filename = save_model(iteration + 1)
            if opt.log_comet_ml and opt.comet_ml_save_model:
                # Upload model to Comet ML
                experiment.log_asset(file_path=model_filename, overwrite=True)

    # Save the model after reaching the final iteration
    model_filename = save_model(iteration + 1)

    if opt.log_comet_ml:
        # Only report that the experiment completed successfully if it
        # finished the training without errors
        experiment.log_other("completed", True)
        if opt.comet_ml_save_model:
            # Upload model to Comet ML
            experiment.log_asset(file_path=model_filename, overwrite=True)