# Flappy Bird: Double DQN training script.
import os
import shutil
from random import random, randint, sample

import numpy as np
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

# Project modules (the import paths below are assumed; adjust them to the repo layout).
from src.deep_q_network import DeepQNetwork
from src.flappy_bird import FlappyBird
from src.utils import pre_processing


def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    model = DeepQNetwork()
    model_target = DeepQNetwork()
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
    criterion = nn.MSELoss()

    game_state = FlappyBird()
    image, reward, terminal, score = game_state.next_frame(0)
    image = pre_processing(
        image[:game_state.screen_width, :int(game_state.base_y)],
        opt.image_size, opt.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        model_target.cuda()
        image = image.cuda()
    # The state is a stack of the four most recent pre-processed frames.
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]
    model_target.eval()

    replay_memory = []
    iter = 0
    while iter < opt.num_iters:
        prediction = model(state)[0]
        # Exploration or exploitation: epsilon is annealed linearly from
        # initial_epsilon to final_epsilon over num_iters iterations.
        epsilon = opt.final_epsilon + (
            (opt.num_iters - iter) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters)
        u = random()
        random_action = u <= epsilon
        if random_action:
            action = randint(0, 1)
        else:
            action = torch.argmax(prediction).item()

        next_image, reward, terminal, score = game_state.next_frame(action)
        next_image = pre_processing(
            next_image[:game_state.screen_width, :int(game_state.base_y)],
            opt.image_size, opt.image_size)
        next_image = torch.from_numpy(next_image)
        if torch.cuda.is_available():
            next_image = next_image.cuda()
        next_state = torch.cat((state[0, 1:, :, :], next_image))[None, :, :, :]

        # Keep the replay buffer bounded.
        replay_memory.append([state, action, reward, next_state, terminal])
        if len(replay_memory) > opt.replay_memory_size:
            del replay_memory[0]

        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(*batch)
        state_batch = torch.cat(tuple(state for state in state_batch))
        action_batch = torch.from_numpy(
            np.array([[1, 0] if action == 0 else [0, 1] for action in action_batch],
                     dtype=np.float32))
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.cat(tuple(state for state in next_state_batch))
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        current_prediction_batch = model(state_batch)
        next_prediction_batch = model_target(next_state_batch)
        # Double DQN target: the online network selects the greedy action,
        # the target network evaluates it.
        y_batch = torch.cat(
            tuple(reward if terminal else reward + opt.gamma * prediction[max_action]
                  for reward, terminal, prediction, max_action in zip(
                      reward_batch, terminal_batch, next_prediction_batch,
                      torch.argmax(model(next_state_batch), axis=1))))
        q_value = torch.sum(current_prediction_batch * action_batch, dim=1)

        optimizer.zero_grad()
        # Targets must not propagate gradients back into the networks.
        y_batch = y_batch.detach()
        loss = criterion(q_value, y_batch)
        loss.backward()
        optimizer.step()

        state = next_state
        if iter % opt.target_update_freq == 0:
            model_target.load_state_dict(model.state_dict())
        iter += 1
        if iter % 100 == 0:
            print(
                "Double Q: Iteration: {}/{}, Action: {}, Loss: {}, Epsilon: {}, Reward: {}, Q-value: {}".format(
                    iter, opt.num_iters, action, loss, epsilon, reward,
                    torch.max(prediction)))
        writer.add_scalar('Train/Loss', loss, iter)
        writer.add_scalar('Train/Epsilon', epsilon, iter)
        writer.add_scalar('Train/Reward', reward, iter)
        writer.add_scalar('Train/Q-value', torch.max(prediction), iter)
        writer.add_scalar('Train/score', score, iter)
        if (iter + 1) % 1000000 == 0:
            torch.save(model, "{}/flappy_bird_{}".format(opt.saved_path, iter + 1))

    torch.save(model, "{}/flappy_bird".format(opt.saved_path))
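# A minimal sketch of the option namespace train() expects, assuming an argparse
# entry point. The flag names mirror the opt.* attributes read above; the default
# values shown here are illustrative assumptions, not the repo's actual settings.
import argparse


def get_args():
    parser = argparse.ArgumentParser("Double DQN for Flappy Bird")
    parser.add_argument("--image_size", type=int, default=84)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--initial_epsilon", type=float, default=0.1)
    parser.add_argument("--final_epsilon", type=float, default=1e-4)
    parser.add_argument("--num_iters", type=int, default=2000000)
    parser.add_argument("--replay_memory_size", type=int, default=50000)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--target_update_freq", type=int, default=10000)
    parser.add_argument("--log_path", type=str, default="tensorboard")
    parser.add_argument("--saved_path", type=str, default="trained_models")
    return parser.parse_args()


if __name__ == "__main__":
    train(get_args())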
# Tetris: Double DQN training script with optional prioritized experience replay (PER).
import os
import shutil
from collections import deque
from random import random, randint, sample

import numpy as np
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

# Project modules (the import paths below are assumed; adjust them to the repo layout).
from src.deep_q_network import DeepQNetwork
from src.tetris import Tetris
from src.memory import Memory


def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)

    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    model = DeepQNetwork()
    model_target = DeepQNetwork()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()
    state = env.reset()
    if torch.cuda.is_available():
        model.cuda()
        model_target.cuda()
        state = state.cuda()

    if opt.PER:
        replay_memory = Memory(capacity=opt.replay_memory_size)
    else:
        replay_memory = deque(maxlen=opt.replay_memory_size)

    epoch = 0
    warmup_epoch = 0
    while epoch < opt.num_epochs:
        next_steps = env.get_next_states()
        # Exploration or exploitation: epsilon decays linearly over num_decay_epochs.
        epsilon = opt.final_epsilon + (
            max(opt.num_decay_epochs - epoch, 0) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        u = random()
        random_action = u <= epsilon
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        model.eval()
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        model.train()
        if random_action:
            index = randint(0, len(next_steps) - 1)
        else:
            index = torch.argmax(predictions).item()

        next_state = next_states[index, :]
        action = next_actions[index]
        reward, done = env.step(action, render=True)
        if torch.cuda.is_available():
            next_state = next_state.cuda()

        # Store (state, action, reward, next_state, done). The action itself is not
        # used in the update, but both memory formats keep the same tuple layout.
        if opt.PER:
            experience = state, action, reward, next_state, done
            replay_memory.store(experience)
        else:
            replay_memory.append([state, action, reward, next_state, done])

        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue

        # Collect experience for a few episodes before learning starts.
        warmup_epoch += 1
        if warmup_epoch < opt.learning_starts:
            continue
        epoch += 1

        if opt.PER:
            tree_idx, batch = replay_memory.sample(opt.batch_size)
        else:
            batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, _, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(tuple(state for state in state_batch))
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.stack(tuple(state for state in next_state_batch))
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        q_values = model(state_batch)
        model_target.eval()
        with torch.no_grad():
            next_prediction_batch = model_target(next_state_batch)
        model_target.train()

        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(
                      reward_batch, done_batch, next_prediction_batch)))[:, None]

        optimizer.zero_grad()
        loss = criterion(q_values, y_batch)
        loss.backward()
        optimizer.step()
        model.eval()
        model_target.eval()

        if opt.PER:
            # Update the stored priorities with the new absolute TD errors.
            with torch.no_grad():
                if torch.cuda.is_available():
                    replay_memory.batch_update(
                        tree_idx,
                        np.abs(q_values.detach().cpu().numpy() - y_batch.cpu().numpy()))
                else:
                    replay_memory.batch_update(
                        tree_idx,
                        np.abs(q_values.detach().numpy() - y_batch.numpy()))

        # Update target model <- model
        if epoch % opt.target_update_freq == 0:
            with torch.no_grad():
                model_target.load_state_dict(model.state_dict())
        model_target.train()
        model.eval()

        print(
            "Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes: {}, Cleared lines: {}".format(
                epoch, opt.num_epochs, action, final_score,
                final_tetrominoes, final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines, epoch - 1)

        if (epoch > 0 and epoch % opt.save_interval == 0) or final_score >= 10000.0:
            torch.save(model, "{}/tetris_{}".format(opt.saved_path, epoch))

    torch.save(model, "{}/tetris".format(opt.saved_path))
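# A minimal sketch of the option namespace the Tetris train() expects, again assuming
# an argparse entry point. The flag names follow the opt.* attributes used above;
# the defaults are illustrative assumptions only.
import argparse


def get_args():
    parser = argparse.ArgumentParser("Double DQN (optionally with PER) for Tetris")
    parser.add_argument("--width", type=int, default=10, help="Board width in blocks")
    parser.add_argument("--height", type=int, default=20, help="Board height in blocks")
    parser.add_argument("--block_size", type=int, default=30, help="Pixel size of a block")
    parser.add_argument("--lr", type=float, default=1e-3)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--initial_epsilon", type=float, default=1.0)
    parser.add_argument("--final_epsilon", type=float, default=1e-3)
    parser.add_argument("--num_decay_epochs", type=int, default=2000)
    parser.add_argument("--num_epochs", type=int, default=3000)
    parser.add_argument("--batch_size", type=int, default=512)
    parser.add_argument("--replay_memory_size", type=int, default=30000)
    parser.add_argument("--learning_starts", type=int, default=2000,
                        help="Warm-up episodes before updates begin")
    parser.add_argument("--target_update_freq", type=int, default=10)
    parser.add_argument("--save_interval", type=int, default=1000)
    parser.add_argument("--PER", action="store_true",
                        help="Use prioritized experience replay")
    parser.add_argument("--log_path", type=str, default="tensorboard")
    parser.add_argument("--saved_path", type=str, default="trained_models")
    return parser.parse_args()


if __name__ == "__main__":
    train(get_args())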