def play_against_other_players(checkpoint_folder, model_class, other_player_classes, runs, summary_writer):
    # load the newest policy checkpoint from the folder
    generations = [int(f[:8]) for f in listdir(checkpoint_folder) if f.endswith(".pt")]
    max_gen = max(generations)
    policy = model_class()
    policy.to(device=Settings.device)
    policy.load_state_dict(torch.load(checkpoint_folder + "/" + str(max_gen).zfill(8) + ".pt"))

    for other_player_class in other_player_classes:

        # first seating: RL players at seats 1 and 3
        players = [other_player_class(), RlPlayer(policy), other_player_class(), RlPlayer(policy)]
        schafkopf_env = SchafkopfEnv(1)
        all_rewards = np.array([0., 0., 0., 0.])
        for j in range(runs):
            state, reward, terminal = schafkopf_env.reset()
            while not terminal:
                action, prob = players[state["game_state"].current_player].act(state)
                state, reward, terminal = schafkopf_env.step(action, prob)
            all_rewards += reward

        # reorder so that the RL players' totals end up at indices 0 and 2
        all_rewards = all_rewards[[1, 0, 3, 2]]

        # second seating: RL players at seats 0 and 2
        players = [RlPlayer(policy), other_player_class(), RlPlayer(policy), other_player_class()]
        schafkopf_env = SchafkopfEnv(1)
        for j in range(runs):
            state, reward, terminal = schafkopf_env.reset()
            while not terminal:
                action, prob = players[state["game_state"].current_player].act(state)
                state, reward, terminal = schafkopf_env.step(action, prob)
            all_rewards += reward

        # average per-game reward of the RL players against this opponent class
        summary_writer.add_scalar('Evaluation/' + str(other_player_class.__name__),
                                  (all_rewards[0] + all_rewards[2]) / (4 * runs), max_gen)
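# Minimal sketch of the seat reindexing used in play_against_other_players, with made-up
# reward numbers: in the first seating the RL players sit at seats 1 and 3, so
# all_rewards[[1, 0, 3, 2]] moves their totals to indices 0 and 2 before the second seating
# (RL players at seats 0 and 2) is added on top.
import numpy as np

rewards_seating_1 = np.array([5., -5., 5., -5.])      # seats: [other, RL, other, RL]
rewards_seating_1 = rewards_seating_1[[1, 0, 3, 2]]   # RL totals now at indices 0 and 2
rewards_seating_2 = np.array([-3., 3., -3., 3.])      # seats: [RL, other, RL, other]
total = rewards_seating_1 + rewards_seating_2
runs = 1
average_rl_reward = (total[0] + total[2]) / (4 * runs)  # 2 RL players x 2 seatings per run
print(average_rl_reward)  # -4.0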
def main():
    # set up the tournament participants
    pimc_player = PIMCPlayer(10, 40, RandomPlayer())

    policy = ActorCriticNetworkLSTM().to(Settings.device)
    policy.load_state_dict(torch.load("../policies/pretrained/lstm-policy.pt"))
    rl_player = RlPlayer(policy, action_shaping=False, eval=True)

    hp = HandPredictor().to(Settings.device)
    hp.load_state_dict(torch.load("../policies/pretrained/hand-predictor.pt"))
    smart_pimc_player = HPPIMCPlayer(10, 40, RandomPlayer(), hp)  # PIMC guided by the pretrained hand predictor

    ip = ImmitationPolicy().to(Settings.device)
    ip.load_state_dict(torch.load("../policies/00010340.pt"))
    immitation_player = RlPlayer(ip, action_shaping=False, eval=True)

    participants = [
        rl_player,
        immitation_player,
        smart_pimc_player,
        pimc_player,
        RuleBasedPlayer(),
        RandomCowardPlayer(),
        RandomPlayer(),
    ]

    number_of_games = 1000

    # round-robin: every participant plays every other participant
    for i in range(len(participants)):
        for j in range(i + 1, len(participants)):
            p1 = participants[i]
            p2 = participants[j]

            cummulative_reward = [0, 0, 0, 0]
            for k in range(2):  # run the same tournament twice with different seatings of the players
                print(' ')
                schafkopf_env = SchafkopfEnv(seed=1)
                if k == 0:
                    players = [p1, p1, p2, p2]
                else:
                    players = [p2, p2, p1, p1]
                    cummulative_reward.reverse()

                # tournament loop
                for game_nr in range(1, number_of_games + 1):
                    state, reward, terminal = schafkopf_env.reset()
                    while not terminal:
                        action, prob = players[state["game_state"].current_player].act(state)
                        state, reward, terminal = schafkopf_env.step(action, prob)
                    cummulative_reward = [cummulative_reward[m] + reward[m] for m in range(4)]
                    if game_nr % 100 == 0:
                        print('.', end='')
                    # schafkopf_env.print_game()

            print("player " + str(i) + " vs. player " + str(j) + " = "
                  + str((cummulative_reward[2] + cummulative_reward[3]) / (2 * 2 * number_of_games))
                  + " to "
                  + str((cummulative_reward[0] + cummulative_reward[1]) / (2 * 2 * number_of_games)))
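# Minimal sketch of the final averaging in the tournament loop above, with hypothetical
# reward values: each pairing is played twice with swapped seats and each participant
# occupies two of the four seats, so a participant's average per-game reward is its summed
# reward divided by 2 seats * 2 seatings * number_of_games.
number_of_games = 1000
cummulative_reward = [120.0, 80.0, -90.0, -110.0]  # final seating: [p2, p2, p1, p1]
p1_avg = (cummulative_reward[2] + cummulative_reward[3]) / (2 * 2 * number_of_games)
p2_avg = (cummulative_reward[0] + cummulative_reward[1]) / (2 * 2 * number_of_games)
print(p1_avg, p2_avg)  # -0.05 0.05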
def main(): print("Cuda available: "+str(torch.cuda.is_available())) #start tensorboard tb = program.TensorBoard() tb.configure(argv=[None, '--logdir', Settings.runs_folder]) tb.launch() # set seed for debugging if Settings.random_seed: torch.manual_seed(Settings.random_seed) #loading initial policy policy = Settings.model().to(Settings.device) # take the newest generation available i_episode = max_gen = 0 generations = [int(f[:8]) for f in listdir(Settings.checkpoint_folder) if f.endswith(".pt")] if len(generations) > 0: max_gen = max(generations) policy.load_state_dict(torch.load(Settings.checkpoint_folder+"/" + str(max_gen).zfill(8) + ".pt")) i_episode = max_gen #create ppo ppo = PPO(policy, [Settings.lr, Settings.lr_stepsize, Settings.lr_gamma], Settings.betas, Settings.gamma, Settings.K_epochs, Settings.eps_clip, Settings.batch_size,Settings.mini_batch_size, c1=Settings.c1, c2=Settings.c2, start_episode=max_gen-1 ) #create four players players = [RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old)] #create a game simulation schafkopf_env = SchafkopfEnv(Settings.random_seed) game_statistics = GameStatistics() # training loop for _ in range(0, 90000000): Settings.logger.info("playing " +str(Settings.update_games)+ " games") # play a bunch of games t0 = time.time() for _ in range(Settings.update_games): state, reward, terminal = schafkopf_env.reset() while not terminal: action, prob = players[state["game_state"].current_player].act(state) state, reward, terminal = schafkopf_env.step(action, prob) for p in range(4): players[p].retrieve_reward(reward[p]) i_episode += 1 game_statistics.update_statistics(state["game_state"], reward) t1 = time.time() #update the policy Settings.logger.info("updating policy") player_memories = Memory() for p in players: player_memories.append_memory(p.memory) ppo.update(player_memories, i_episode) t2 = time.time() ppo.lr_scheduler.step(i_episode) # writing game statistics for tensorboard Settings.logger.info("Episode: "+str(i_episode) + " game simulation (s) = "+str(t1-t0) + " update (s) = "+str(t2-t1)) schafkopf_env.print_game() game_statistics.write_and_reset (i_episode) # reset memories and replace policy players = [RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old)] # save and evaluate the policy Settings.logger.info("Saving Checkpoint") torch.save(ppo.policy_old.state_dict(), Settings.checkpoint_folder + "/" + str(i_episode).zfill(8) + ".pt") Settings.logger.info("Evaluation") play_against_other_players(Settings.checkpoint_folder, Settings.model, [RandomPlayer, RandomCowardPlayer, RuleBasedPlayer], Settings.eval_games, Settings.summary_writer)
def main(): print("Cuda available: "+str(torch.cuda.is_available())) #start tensorboard tb = program.TensorBoard() tb.configure(argv=[None, '--logdir', Settings.runs_folder]) tb.launch() # set seed for debugging if Settings.random_seed: torch.manual_seed(Settings.random_seed) #loading initial policy hand_predictor = HandPredictor().to(Settings.device) # take the newest generation available i_episode = max_gen = 0 generations = [int(f[:8]) for f in listdir(Settings.checkpoint_folder) if f.endswith(".pt")] if len(generations) > 0: max_gen = max(generations) hand_predictor.load_state_dict(torch.load(Settings.checkpoint_folder+"/" + str(max_gen).zfill(8) + ".pt")) i_episode = max_gen optimizer = torch.optim.Adam(hand_predictor.parameters(),lr=Settings.lr, betas=Settings.betas, weight_decay=Settings.optimizer_weight_decay) # training loop for _ in range(0, 90000000): Settings.logger.info("playing " +str(Settings.update_games)+ " games") smart_mcts_player = HPPIMCPlayer(30, 120, RandomPlayer(), hand_predictor) # create four players players = [smart_mcts_player, smart_mcts_player, smart_mcts_player, smart_mcts_player] # create a game simulation schafkopf_env = SchafkopfEnv(Settings.random_seed) game_statistics = GameStatistics() memory_states = [] memory_player_hands = [] # play a bunch of games t0 = time.time() for _ in range(Settings.update_games): state, reward, terminal = schafkopf_env.reset() while not terminal: memory_states.append(hand_predictor.preprocess(state)) #TODO: happens twice now and could be optimized memory_player_hands.append(hand_predictor.encode_player_hands(schafkopf_env.player_cards, state["game_state"].current_player)) action, prob = players[state["game_state"].current_player].act(state) state, reward, terminal = schafkopf_env.step(action, prob) if state["game_state"].game_type[1] == 2: schafkopf_env.print_game() print("game "+str(i_episode)) i_episode += 1 game_statistics.update_statistics(state["game_state"], reward) t1 = time.time() #update the policy Settings.logger.info("updating policy") # Create dataset from collected experiences dataset = PredictionDatasetLSTM(memory_states, memory_player_hands) training_generator = data.DataLoader(dataset, collate_fn=dataset.custom_collate,batch_size=Settings.mini_batch_size, shuffle=True) #logging avg_loss = 0 count = 0 hand_predictor.train() for epoch in range(Settings.K_epochs): # epoch mini_batches_in_batch = int(Settings.batch_size / Settings.mini_batch_size) optimizer.zero_grad() for i, (states, hands) in enumerate(training_generator): # mini batch # Transfer to GPU states = [state.to(Settings.device) for state in states] hands = hands.to(Settings.device) pred = hand_predictor(states) #loss = nn.MSELoss()(pred, hands) #TODO: replace by cross entropy loss = nn.BCELoss()(pred, hands) avg_loss += loss.mean().item() count +=1 loss.mean().backward() if (i + 1) % mini_batches_in_batch == 0: optimizer.step() optimizer.zero_grad() t2 = time.time() hand_predictor.eval() # writing game statistics for tensorboard Settings.logger.info("Episode: "+str(i_episode) + " game simulation (s) = "+str(t1-t0) + " update (s) = "+str(t2-t1)) schafkopf_env.print_game() game_statistics.write_and_reset (i_episode) Settings.summary_writer.add_scalar('Loss/MSE_Loss', avg_loss / count, i_episode) # save and evaluate the policy Settings.logger.info("Saving Checkpoint") torch.save(hand_predictor.state_dict(), Settings.checkpoint_folder + "/" + str(i_episode).zfill(8) + ".pt") Settings.logger.info("Evaluation")