def simulation(self, selected_node):
    """Roll the game out to the end from the given node and return the final reward."""
    schafkopf_env = SchafkopfEnv()
    # deep-copy the game state, but only shallow-copy the four hand lists:
    # the outer lists are rebuilt per player, so copying them is sufficient
    state, reward, terminal = schafkopf_env.set_state(
        deepcopy(selected_node.game_state),
        [copy(selected_node.player_hands[i]) for i in range(4)])
    while not terminal:
        action, _ = self.player.act(state)
        state, reward, terminal = schafkopf_env.step(action)
    return reward
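
# Illustrative sketch (not part of the original code): the per-player reward
# vector returned by simulation() is typically backpropagated from the
# simulated leaf up to the root. The Node attributes `visits` (int) and
# `cumulative_rewards` (per-player list) are assumptions for illustration;
# the repository's actual Node class may track statistics differently.
def backpropagation_sketch(node, reward):
    while node is not None:
        node.visits += 1
        node.cumulative_rewards = [r + nr for r, nr
                                   in zip(node.cumulative_rewards, reward)]
        node = node.parent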
def expand(self, node):
    """Add one previously unvisited child of the given node and return it."""
    not_visited_actions = copy(node.allowed_actions)
    for child in node.children:
        not_visited_actions.remove(child.previous_action)
    # TODO: check if this should be random or chosen by player policy
    chosen_action = random.choice(tuple(not_visited_actions))

    schafkopf_env = SchafkopfEnv()
    schafkopf_env.set_state(deepcopy(node.game_state),
                            [copy(node.player_hands[i]) for i in range(4)])
    state, _, terminal = schafkopf_env.step(chosen_action)

    new_node = Node(parent=node,
                    game_state=state["game_state"],
                    previous_action=chosen_action,
                    player_hands=schafkopf_env.player_cards,
                    allowed_actions=state["allowed_actions"])
    node.add_child(child_node=new_node)
    return new_node
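
# Illustrative sketch (not part of the original code): a UCT-style selection
# step that would pair with expand() and simulation() above. The child
# statistics (`visits`, `cumulative_rewards`) and the exploration constant
# are assumptions, not the repository's actual API; the acting player is
# assumed to be readable from `node.game_state.current_player`.
import math

def uct_select_sketch(node, c=1.41):
    player = node.game_state.current_player
    return max(node.children,
               key=lambda ch: ch.cumulative_rewards[player] / ch.visits
                              + c * math.sqrt(math.log(node.visits) / ch.visits))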
def play_against_other_players(checkpoint_folder, model_class, other_player_classes, runs, summary_writer):
    """Evaluate the newest checkpoint against each baseline player class."""
    generations = [int(f[:8]) for f in listdir(checkpoint_folder) if f.endswith(".pt")]
    max_gen = max(generations)
    policy = model_class()
    policy.to(device=Settings.device)
    policy.load_state_dict(torch.load(checkpoint_folder + "/" + str(max_gen).zfill(8) + ".pt"))

    for other_player_class in other_player_classes:
        # first half: the RL players sit at positions 1 and 3
        players = [other_player_class(), RlPlayer(policy), other_player_class(), RlPlayer(policy)]
        schafkopf_env = SchafkopfEnv(1)
        all_rewards = np.array([0., 0., 0., 0.])
        for j in range(runs):
            state, reward, terminal = schafkopf_env.reset()
            while not terminal:
                action, prob = players[state["game_state"].current_player].act(state)
                state, reward, terminal = schafkopf_env.step(action, prob)
            all_rewards += reward

        # second half: swap seats so the RL players sit at positions 0 and 2,
        # and permute the accumulated rewards to match the new seating
        all_rewards = all_rewards[[1, 0, 3, 2]]
        players = [RlPlayer(policy), other_player_class(), RlPlayer(policy), other_player_class()]
        schafkopf_env = SchafkopfEnv(1)
        for j in range(runs):
            state, reward, terminal = schafkopf_env.reset()
            while not terminal:
                action, prob = players[state["game_state"].current_player].act(state)
                state, reward, terminal = schafkopf_env.step(action, prob)
            all_rewards += reward

        # average reward per RL seat per game (2 RL seats x 2 halves x runs games)
        summary_writer.add_scalar('Evaluation/' + str(other_player_class.__name__),
                                  (all_rewards[0] + all_rewards[2]) / (4 * runs), max_gen)
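
# Example invocation: this mirrors the call made at the end of the PPO
# training loop in main() below; only the wrapping helper is new, added so
# the example does not run on import.
def evaluate_latest_checkpoint():
    play_against_other_players(Settings.checkpoint_folder, Settings.model,
                               [RandomPlayer, RandomCowardPlayer, RuleBasedPlayer],
                               Settings.eval_games, Settings.summary_writer)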
def main():
    pimc_player = PIMCPlayer(10, 40, RandomPlayer())

    policy = ActorCriticNetworkLSTM().to(Settings.device)
    policy.load_state_dict(torch.load("../policies/pretrained/lstm-policy.pt"))
    rl_player = RlPlayer(policy, action_shaping=False, eval=True)

    hp = HandPredictor().to(Settings.device)
    hp.load_state_dict(torch.load("../policies/pretrained/hand-predictor.pt"))
    smart_pimc_player = HPPIMCPlayer(10, 40, RandomPlayer(), HandPredictor().to(Settings.device))

    ip = ImmitationPolicy().to(Settings.device)
    ip.load_state_dict(torch.load("../policies/00010340.pt"))
    imitation_player = RlPlayer(ip, action_shaping=False, eval=True)

    participants = [
        rl_player,
        imitation_player,
        smart_pimc_player,
        pimc_player,
        RuleBasedPlayer(),
        RandomCowardPlayer(),
        RandomPlayer(),
    ]
    number_of_games = 1000

    # round-robin: every participant plays against every other participant
    for i in range(len(participants)):
        for j in range(i + 1, len(participants)):
            p1 = participants[i]
            p2 = participants[j]
            cumulative_reward = [0, 0, 0, 0]
            # run the same tournament twice with different positions of the players
            for k in range(2):
                print(' ')
                schafkopf_env = SchafkopfEnv(seed=1)
                if k == 0:
                    players = [p1, p1, p2, p2]
                else:
                    players = [p2, p2, p1, p1]
                    cumulative_reward.reverse()

                # tournament loop
                for game_nr in range(1, number_of_games + 1):
                    state, reward, terminal = schafkopf_env.reset()
                    while not terminal:
                        action, prob = players[state["game_state"].current_player].act(state)
                        state, reward, terminal = schafkopf_env.step(action, prob)
                    cumulative_reward = [cumulative_reward[m] + reward[m] for m in range(4)]
                    if game_nr % 100 == 0:
                        print('.', end='')
                        #schafkopf_env.print_game()

            # per-seat average over both seatings (2 seats x 2 rounds of games)
            print("player " + str(i) + " vs. player " + str(j) + " = "
                  + str((cumulative_reward[2] + cumulative_reward[3]) / (2 * 2 * number_of_games))
                  + " to "
                  + str((cumulative_reward[0] + cumulative_reward[1]) / (2 * 2 * number_of_games)))
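
# Entry-point guard (assumed; the excerpt above does not show one) so the
# tournament script can be run directly.
if __name__ == "__main__":
    main()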
def main():
    print("Cuda available: " + str(torch.cuda.is_available()))

    # start tensorboard
    tb = program.TensorBoard()
    tb.configure(argv=[None, '--logdir', Settings.runs_folder])
    tb.launch()

    # set seed for debugging
    if Settings.random_seed:
        torch.manual_seed(Settings.random_seed)

    # loading initial policy
    policy = Settings.model().to(Settings.device)

    # take the newest generation available
    i_episode = max_gen = 0
    generations = [int(f[:8]) for f in listdir(Settings.checkpoint_folder) if f.endswith(".pt")]
    if len(generations) > 0:
        max_gen = max(generations)
        policy.load_state_dict(torch.load(Settings.checkpoint_folder + "/" + str(max_gen).zfill(8) + ".pt"))
        i_episode = max_gen

    # create ppo
    ppo = PPO(policy, [Settings.lr, Settings.lr_stepsize, Settings.lr_gamma],
              Settings.betas, Settings.gamma, Settings.K_epochs, Settings.eps_clip,
              Settings.batch_size, Settings.mini_batch_size,
              c1=Settings.c1, c2=Settings.c2, start_episode=max_gen - 1)

    # create four players
    players = [RlPlayer(ppo.policy_old) for _ in range(4)]

    # create a game simulation
    schafkopf_env = SchafkopfEnv(Settings.random_seed)
    game_statistics = GameStatistics()

    # training loop
    for _ in range(0, 90000000):
        Settings.logger.info("playing " + str(Settings.update_games) + " games")

        # play a batch of games
        t0 = time.time()
        for _ in range(Settings.update_games):
            state, reward, terminal = schafkopf_env.reset()
            while not terminal:
                action, prob = players[state["game_state"].current_player].act(state)
                state, reward, terminal = schafkopf_env.step(action, prob)
            for p in range(4):
                players[p].retrieve_reward(reward[p])
            i_episode += 1
            game_statistics.update_statistics(state["game_state"], reward)
        t1 = time.time()

        # update the policy
        Settings.logger.info("updating policy")
        player_memories = Memory()
        for p in players:
            player_memories.append_memory(p.memory)
        ppo.update(player_memories, i_episode)
        t2 = time.time()
        ppo.lr_scheduler.step(i_episode)

        # writing game statistics for tensorboard
        Settings.logger.info("Episode: " + str(i_episode)
                             + " game simulation (s) = " + str(t1 - t0)
                             + " update (s) = " + str(t2 - t1))
        schafkopf_env.print_game()
        game_statistics.write_and_reset(i_episode)

        # reset memories and replace the policy
        players = [RlPlayer(ppo.policy_old) for _ in range(4)]

        # save and evaluate the policy
        Settings.logger.info("Saving Checkpoint")
        torch.save(ppo.policy_old.state_dict(),
                   Settings.checkpoint_folder + "/" + str(i_episode).zfill(8) + ".pt")
        Settings.logger.info("Evaluation")
        play_against_other_players(Settings.checkpoint_folder, Settings.model,
                                   [RandomPlayer, RandomCowardPlayer, RuleBasedPlayer],
                                   Settings.eval_games, Settings.summary_writer)
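
# Background sketch (standard PPO, not this repository's exact update code):
# ppo.update() above optimizes the clipped surrogate objective, roughly
#   L = E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)] - c1 * value_loss + c2 * entropy,
# which matches the eps_clip, c1 and c2 settings passed in. A minimal PyTorch
# version of the policy term, assuming precomputed log-probs and advantages:
import torch

def ppo_clip_loss_sketch(new_logprobs, old_logprobs, advantages, eps_clip):
    ratios = torch.exp(new_logprobs - old_logprobs)      # pi_new / pi_old
    surr1 = ratios * advantages
    surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * advantages
    return -torch.min(surr1, surr2).mean()               # maximize => negate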
def sample_player_hands(self, game_state, ego_player_hand, card_probabilities,
                        remaining_cards, needed_player_cards, only_valid=False):
    valid_card_distribution = False
    player_cards = None

    # loop over card distributions until we find a valid one
    while not valid_card_distribution:

        # distribute the remaining cards according to the predicted
        # probabilities so that each player gets as many as they need
        valid_card_distribution = True
        player_cards = [[], [], [], []]
        player_cards[game_state.current_player] = ego_player_hand
        random.shuffle(remaining_cards)

        card_probs = card_probabilities.clone().detach().cpu()
        for p in range(4):
            index = (p - game_state.current_player - 1) % 4
            if len(player_cards[p]) >= needed_player_cards[p]:
                card_probs[:, index] = 0

        for card in remaining_cards:
            card_index = card[1] * 4 + card[0]  # equivalent to self.rules.cards.index(card)
            sample_player = Categorical(card_probs[card_index]).sample()
            player_id = (game_state.current_player + sample_player + 1) % 4
            player_cards[player_id].append(card)
            if len(player_cards[player_id]) == needed_player_cards[player_id]:
                # this player's hand is complete: exclude them from further draws
                card_probs[:, sample_player] = 0

        if not only_valid:
            break

        # check that with the current card distribution every move made so far
        # was valid: rebuild full starting hands by adding back already played
        # cards, then replay the recorded game
        schafkopf_env = SchafkopfEnv()
        simulation_player_cards = [player_hand.copy() for player_hand in player_cards]
        for i in range(4):
            simulation_player_cards[i] += [
                game_state.course_of_game_playerwise[trick][i]
                for trick in range(8)
                if game_state.course_of_game_playerwise[trick][i] != [None, None]
            ]
        state, _, _ = schafkopf_env.set_state(PublicGameState(game_state.dealer),
                                              simulation_player_cards)

        while True:
            eval_game_state, allowed_actions = state["game_state"], state["allowed_actions"]
            if eval_game_state.game_stage == Rules.BIDDING:
                action = game_state.bidding_round[eval_game_state.current_player]
                if action is None:
                    break
                elif action not in allowed_actions:
                    valid_card_distribution = False
                    break
            elif eval_game_state.game_stage == Rules.CONTRA:
                action = game_state.contra[eval_game_state.current_player]
                if action is None:
                    break
                elif action not in allowed_actions:
                    valid_card_distribution = False
                    break
            elif eval_game_state.game_stage == Rules.RETOUR:
                action = game_state.retour[eval_game_state.current_player]
                if action is None:
                    break
                elif action not in allowed_actions:
                    valid_card_distribution = False
                    break
            else:
                action = game_state.course_of_game_playerwise[
                    eval_game_state.trick_number][eval_game_state.current_player]
                if action == [None, None]:
                    break
                elif action not in allowed_actions:
                    valid_card_distribution = False
                    break
            state, _, _ = schafkopf_env.step(action)

    return player_cards
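
# Toy example (illustrative only): the per-card draw above samples one player
# from a row of the predicted probability matrix (judging by the index
# arithmetic, 32 card rows by 4 relative-player columns) and zeroes a column
# once that player's hand is full. Categorical normalizes its input, so later
# draws automatically renormalize over the players that can still take cards.
import torch
from torch.distributions import Categorical

probs = torch.tensor([0.5, 0.3, 0.2, 0.0])  # one row: P(card belongs to each relative player)
player = Categorical(probs).sample()        # index of the sampled player
probs_after = probs.clone()
probs_after[player] = 0                     # that hand is full: exclude it from future draws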
def main():
    print("Cuda available: " + str(torch.cuda.is_available()))

    # start tensorboard
    tb = program.TensorBoard()
    tb.configure(argv=[None, '--logdir', Settings.runs_folder])
    tb.launch()

    # set seed for debugging
    if Settings.random_seed:
        torch.manual_seed(Settings.random_seed)

    # loading initial policy
    hand_predictor = HandPredictor().to(Settings.device)

    # take the newest generation available
    i_episode = max_gen = 0
    generations = [int(f[:8]) for f in listdir(Settings.checkpoint_folder) if f.endswith(".pt")]
    if len(generations) > 0:
        max_gen = max(generations)
        hand_predictor.load_state_dict(torch.load(Settings.checkpoint_folder + "/" + str(max_gen).zfill(8) + ".pt"))
        i_episode = max_gen

    optimizer = torch.optim.Adam(hand_predictor.parameters(), lr=Settings.lr,
                                 betas=Settings.betas,
                                 weight_decay=Settings.optimizer_weight_decay)

    # training loop
    for _ in range(0, 90000000):
        Settings.logger.info("playing " + str(Settings.update_games) + " games")

        smart_mcts_player = HPPIMCPlayer(30, 120, RandomPlayer(), hand_predictor)
        # create four players (the same instance acts for all four seats)
        players = [smart_mcts_player] * 4

        # create a game simulation
        schafkopf_env = SchafkopfEnv(Settings.random_seed)
        game_statistics = GameStatistics()

        memory_states = []
        memory_player_hands = []

        # play a batch of games
        t0 = time.time()
        for _ in range(Settings.update_games):
            state, reward, terminal = schafkopf_env.reset()
            while not terminal:
                # TODO: preprocessing happens twice now and could be optimized
                memory_states.append(hand_predictor.preprocess(state))
                memory_player_hands.append(hand_predictor.encode_player_hands(
                    schafkopf_env.player_cards, state["game_state"].current_player))
                action, prob = players[state["game_state"].current_player].act(state)
                state, reward, terminal = schafkopf_env.step(action, prob)
            if state["game_state"].game_type[1] == 2:  # print suit-solo games for inspection
                schafkopf_env.print_game()
            print("game " + str(i_episode))
            i_episode += 1
            game_statistics.update_statistics(state["game_state"], reward)
        t1 = time.time()

        # update the hand predictor
        Settings.logger.info("updating policy")

        # create a dataset from the collected experiences
        dataset = PredictionDatasetLSTM(memory_states, memory_player_hands)
        training_generator = data.DataLoader(dataset, collate_fn=dataset.custom_collate,
                                             batch_size=Settings.mini_batch_size, shuffle=True)

        # logging
        avg_loss = 0
        count = 0

        hand_predictor.train()
        for epoch in range(Settings.K_epochs):  # epoch
            mini_batches_in_batch = int(Settings.batch_size / Settings.mini_batch_size)
            optimizer.zero_grad()

            for i, (states, hands) in enumerate(training_generator):  # mini batch
                # transfer to GPU
                states = [state.to(Settings.device) for state in states]
                hands = hands.to(Settings.device)

                pred = hand_predictor(states)
                # loss = nn.MSELoss()(pred, hands)  # TODO: replace by cross entropy
                loss = nn.BCELoss()(pred, hands)
                avg_loss += loss.mean().item()
                count += 1
                loss.mean().backward()
                # accumulate gradients over several mini batches before stepping
                if (i + 1) % mini_batches_in_batch == 0:
                    optimizer.step()
                    optimizer.zero_grad()
        t2 = time.time()
        hand_predictor.eval()

        # writing game statistics for tensorboard
        Settings.logger.info("Episode: " + str(i_episode)
                             + " game simulation (s) = " + str(t1 - t0)
                             + " update (s) = " + str(t2 - t1))
        schafkopf_env.print_game()
        game_statistics.write_and_reset(i_episode)
        # note: the tag name is historical; the loss logged here is BCE
        Settings.summary_writer.add_scalar('Loss/MSE_Loss', avg_loss / count, i_episode)

        # save and evaluate the policy
        Settings.logger.info("Saving Checkpoint")
        torch.save(hand_predictor.state_dict(),
                   Settings.checkpoint_folder + "/" + str(i_episode).zfill(8) + ".pt")
        Settings.logger.info("Evaluation")
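
# Toy example (illustrative only): the hand predictor is trained as a
# multi-label problem, one independent probability per (card, player) slot,
# which is why nn.BCELoss is used above rather than a single softmax over
# cards. The tensor shapes below are assumptions for illustration, not the
# actual output shape of HandPredictor.
import torch
import torch.nn as nn

pred = torch.sigmoid(torch.randn(2, 32, 4))        # predicted probabilities in (0, 1)
target = torch.randint(0, 2, (2, 32, 4)).float()   # 1 where the card is in that hand
loss = nn.BCELoss()(pred, target)                  # mean binary cross entropy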
def get_states_actions(game_transcript, policy):
    states = []
    actions = []

    schafkopf_env = SchafkopfEnv()
    state, _, _ = schafkopf_env.set_state(PublicGameState(3),
                                          [game_transcript.player_hands[i] for i in range(4)])
    states.append(policy.preprocess(state))

    # BIDDING stage
    game_player = None
    game_type = None
    if len(game_transcript.bidding_round) != 4:  # not everybody said "weiter"
        player_bidding = None
        for i in range(1, 5):
            if "Vortritt" not in game_transcript.bidding_round[-i]:
                player_bidding = game_transcript.bidding_round[-i]
                break
        if player_bidding.startswith("Ex-Sauspieler"):
            game_player = game_transcript.player_dict[
                player_bidding.split(" ")[0] + " " + player_bidding.split(" ")[1]]
        else:
            game_player = game_transcript.player_dict[player_bidding.split(" ")[0]]

        # remove the player name in case it contains one of the following keywords
        player_bidding = player_bidding.split(' ', 1)[1]

        # map the announcement to game_type = [suit/ace, type]
        # (second entry: 0 = Sauspiel, 1 = Wenz, 2 = suit solo)
        if "Hundsgfickte" in player_bidding:
            game_type = [0, 0]
        elif "Blaue" in player_bidding:
            game_type = [2, 0]
        elif "Alte" in player_bidding:
            game_type = [3, 0]
        elif "Schelle" in player_bidding:
            game_type = [0, 2]
        elif "Herz" in player_bidding:
            game_type = [1, 2]
        elif "Gras" in player_bidding:
            game_type = [2, 2]
        elif "Eichel" in player_bidding:
            game_type = [3, 2]
        elif "Wenz" in player_bidding:
            game_type = [None, 1]

    for i in range(4):
        action = [None, None]
        if i == game_player:
            action = game_type
        actions.append(preprocess_action(Rules.BIDDING, action))
        state, _, _ = schafkopf_env.step(action)
        if not (len(game_transcript.bidding_round) == 4 and i == 3):
            # don't take the last state of the game into the dataset
            states.append(policy.preprocess(state))

    if len(game_transcript.bidding_round) != 4:  # not everybody said "weiter"
        con_ret = [game_transcript.player_dict[p] for p in game_transcript.kontra]

        # CONTRA stage
        for i in range(4):
            action = False
            if len(con_ret) > 0 and i == con_ret[0]:
                action = True
            actions.append(preprocess_action(Rules.CONTRA, action))
            state, _, _ = schafkopf_env.step(action)
            states.append(policy.preprocess(state))

        # RETOUR stage
        if len(con_ret) > 0:
            for i in range(4):
                action = False
                if len(con_ret) == 2 and i == con_ret[1]:
                    action = True
                actions.append(preprocess_action(Rules.RETOUR, action))
                state, _, _ = schafkopf_env.step(action)
                states.append(policy.preprocess(state))

        # TRICK stage
        for trick in range(8):
            for c in range(4):
                action = game_transcript.course_of_game[trick][c]
                actions.append(preprocess_action(Rules.TRICK, action))
                state, _, _ = schafkopf_env.step(action)
                if not (trick == 7 and c == 3):  # all but the last state
                    states.append(policy.preprocess(state))

    return states, actions
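
# Usage sketch (hypothetical driver, not part of the original file): collect
# the (state, action) pairs from a list of parsed transcripts into one flat
# imitation-learning dataset. `transcripts` and this helper's name are
# assumptions; only get_states_actions() comes from the code above.
def build_imitation_dataset_sketch(transcripts, policy):
    all_states, all_actions = [], []
    for transcript in transcripts:
        states, actions = get_states_actions(transcript, policy)
        all_states.extend(states)
        all_actions.extend(actions)
    return all_states, all_actions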
def sample_player_hands(self, game_state, ego_player_hand):
    # precomputations
    played_cards = [card for trick in game_state.course_of_game
                    for card in trick if card != [None, None]]
    remaining_cards = [card for card in self.rules.cards
                       if card not in played_cards and card not in ego_player_hand]

    needed_player_cards = [8, 8, 8, 8]
    for trick in range(game_state.trick_number + 1):
        for i, card in enumerate(game_state.course_of_game_playerwise[trick]):
            if card != [None, None]:
                needed_player_cards[i] -= 1
    needed_player_cards[game_state.current_player] = 0

    valid_card_distribution = False
    player_cards = None

    # loop over random card distributions until we find a valid one
    while not valid_card_distribution:

        # randomly distribute the cards so that each player gets as many as they need
        valid_card_distribution = True
        player_cards = [[], [], [], []]
        player_cards[game_state.current_player] = ego_player_hand
        random.shuffle(remaining_cards)
        from_card = 0
        for i, needed_cards in enumerate(needed_player_cards):
            if i == game_state.current_player:
                continue
            player_cards[i] = remaining_cards[from_card:from_card + needed_cards]
            from_card += needed_cards

        # check that with the current card distribution every move made so far
        # was valid: rebuild full starting hands by adding back already played
        # cards, then replay the recorded actions of the real game and compare
        # them against the allowed actions in the simulation (mirroring the
        # hand-predictor variant of this method)
        schafkopf_env = SchafkopfEnv()
        simulation_player_cards = [player_hand.copy() for player_hand in player_cards]
        for i in range(4):
            simulation_player_cards[i] += [
                game_state.course_of_game_playerwise[trick][i]
                for trick in range(8)
                if game_state.course_of_game_playerwise[trick][i] != [None, None]
            ]
        state, _, _ = schafkopf_env.set_state(PublicGameState(game_state.dealer),
                                              simulation_player_cards)
        while True:
            eval_game_state, allowed_actions = state["game_state"], state["allowed_actions"]
            if eval_game_state.game_stage == Rules.BIDDING:
                action = game_state.bidding_round[eval_game_state.current_player]
                if action is None:
                    break
                elif action not in allowed_actions:
                    valid_card_distribution = False
                    break
            elif eval_game_state.game_stage == Rules.CONTRA:
                action = game_state.contra[eval_game_state.current_player]
                if action is None:
                    break
                elif action not in allowed_actions:
                    valid_card_distribution = False
                    break
            elif eval_game_state.game_stage == Rules.RETOUR:
                action = game_state.retour[eval_game_state.current_player]
                if action is None:
                    break
                elif action not in allowed_actions:
                    valid_card_distribution = False
                    break
            else:
                action = game_state.course_of_game_playerwise[
                    eval_game_state.trick_number][eval_game_state.current_player]
                if action == [None, None]:
                    break
                elif action not in allowed_actions:
                    valid_card_distribution = False
                    break
            state, _, _ = schafkopf_env.step(action)

    return player_cards
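
# Design note (illustrative sketch, not the repository's code): the loop above
# is plain rejection sampling, so late in a heavily constrained game many
# deals may be rejected before one passes the replay check. A capped variant
# could look like this; `draw_deal` and `deal_is_consistent` are hypothetical
# stand-ins for the two halves of the loop above.
def sample_with_retry_cap_sketch(draw_deal, deal_is_consistent, max_attempts=1000):
    deal = None
    for _ in range(max_attempts):
        deal = draw_deal()
        if deal_is_consistent(deal):
            return deal
    return deal  # fall back to the last draw if no consistent deal was found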