Code example #1
def play_against_other_players(checkpoint_folder, model_class, other_player_classes, runs, summary_writer):

  # load the newest policy checkpoint (files are named NNNNNNNN.pt; assumes at least one exists)
  generations = [int(f[:8]) for f in listdir(checkpoint_folder) if f.endswith(".pt")]
  max_gen = max(generations)
  policy = model_class()
  policy.to(device=Settings.device)
  policy.load_state_dict(torch.load(checkpoint_folder + "/" + str(max_gen).zfill(8) + ".pt"))

  for other_player_class in other_player_classes:

    # first run: the RL player sits at seats 1 and 3
    players = [other_player_class(), RlPlayer(policy), other_player_class(), RlPlayer(policy)]
    schafkopf_env = SchafkopfEnv(1)

    all_rewards = np.array([0., 0., 0., 0.])
    for j in range(runs):
      state, reward, terminal = schafkopf_env.reset()
      while not terminal:
        action, prob = players[state["game_state"].current_player].act(state)
        state, reward, terminal = schafkopf_env.step(action, prob)

      all_rewards += reward

    # reorder so the totals match the seating of the second run: the RL player's rewards move to indices 0 and 2
    all_rewards = all_rewards[[1, 0, 3, 2]]

    # second run: the RL player sits at seats 0 and 2
    players = [RlPlayer(policy), other_player_class(), RlPlayer(policy), other_player_class()]
    schafkopf_env = SchafkopfEnv(1)

    for j in range(runs):
      state, reward, terminal = schafkopf_env.reset()
      while not terminal:
        action, prob = players[state["game_state"].current_player].act(state)
        state, reward, terminal = schafkopf_env.step(action, prob)

      all_rewards += reward

    # log the RL player's average reward per game against this opponent type
    summary_writer.add_scalar('Evaluation/' + str(other_player_class.__name__),
                              (all_rewards[0] + all_rewards[2]) / (4 * runs), max_gen)
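
A minimal usage sketch for the evaluation helper above. The checkpoint folder, the opponent list, and the number of runs are illustrative assumptions rather than values taken from the original code; ActorCriticNetworkLSTM, the opponent classes, and Settings.summary_writer are names that appear in the other examples in this section.

play_against_other_players("../policies",                      # assumed folder containing NNNNNNNN.pt checkpoints
                           ActorCriticNetworkLSTM,              # same model class that produced the checkpoints
                           [RandomPlayer, RandomCowardPlayer, RuleBasedPlayer],
                           runs=100,                            # illustrative number of games per seating
                           summary_writer=Settings.summary_writer)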
Code example #2
def main():

    pimc_player = PIMCPlayer(10, 40, RandomPlayer())

    policy = ActorCriticNetworkLSTM().to(Settings.device)
    policy.load_state_dict(torch.load("../policies/pretrained/lstm-policy.pt"))
    rl_player = RlPlayer(policy, action_shaping=False, eval=True)

    # hand-predictor-guided PIMC player; pass the loaded predictor (hp) rather than a freshly initialized one
    hp = HandPredictor().to(Settings.device)
    hp.load_state_dict(torch.load("../policies/pretrained/hand-predictor.pt"))
    smart_pimc_player = HPPIMCPlayer(10, 40, RandomPlayer(), hp)

    ip = ImmitationPolicy().to(Settings.device)
    ip.load_state_dict(torch.load("../policies/00010340.pt"))
    immitation_player = RlPlayer(ip, action_shaping=False, eval=True)

    participants = [
        rl_player,
        immitation_player,
        smart_pimc_player,
        pimc_player,
        RuleBasedPlayer(),
        RandomCowardPlayer(),
        RandomPlayer(),
    ]

    number_of_games = 1000

    for i in range(len(participants)):
        for j in range(i + 1, len(participants)):
            p1 = participants[i]
            p2 = participants[j]

            cummulative_reward = [0, 0, 0, 0]
            for k in range(2):  # run the same tournament twice with different positions of the players
                print(' ')
                schafkopf_env = SchafkopfEnv(seed=1)
                if k == 0:
                    players = [p1, p1, p2, p2]
                else:
                    players = [p2, p2, p1, p1]
                    # seats are swapped in this run, so flip the totals accumulated so far
                    # to keep p2's rewards at indices 0/1 and p1's at indices 2/3
                    cummulative_reward.reverse()

                # tournament loop
                for game_nr in range(1, number_of_games + 1):
                    state, reward, terminal = schafkopf_env.reset()
                    while not terminal:
                        action, prob = players[state["game_state"].current_player].act(state)
                        state, reward, terminal = schafkopf_env.step(action, prob)

                    cummulative_reward = [cummulative_reward[m] + reward[m] for m in range(4)]

                    if game_nr % 100 == 0:
                        print('.', end='')
                    #schafkopf_env.print_game()

            print("player " + str(i) + " vs. player " + str(j) + " = " +
                  str((cummulative_reward[2] + cummulative_reward[3]) /
                      (2 * 2 * number_of_games)) + " to " +
                  str((cummulative_reward[0] + cummulative_reward[1]) /
                      (2 * 2 * number_of_games)))
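
The reset/act/step loop is repeated verbatim in every example in this section. As a design note, it could be factored into a small helper; the sketch below is a hypothetical refactoring, not code from the original project.

def play_one_game(schafkopf_env, players):
    # play a single game to completion and return the per-seat reward vector
    state, reward, terminal = schafkopf_env.reset()
    while not terminal:
        action, prob = players[state["game_state"].current_player].act(state)
        state, reward, terminal = schafkopf_env.step(action, prob)
    return reward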
Code example #3
def main():

  print("Cuda available: "+str(torch.cuda.is_available()))

  #start tensorboard
  tb = program.TensorBoard()
  tb.configure(argv=[None, '--logdir', Settings.runs_folder])
  tb.launch()

  # set seed for debugging
  if Settings.random_seed:
      torch.manual_seed(Settings.random_seed)

  #loading initial policy
  policy = Settings.model().to(Settings.device)
  # take the newest generation available
  i_episode = max_gen = 0
  generations = [int(f[:8]) for f in listdir(Settings.checkpoint_folder) if f.endswith(".pt")]
  if len(generations) > 0:
      max_gen = max(generations)
      policy.load_state_dict(torch.load(Settings.checkpoint_folder+"/" + str(max_gen).zfill(8) + ".pt"))
      i_episode = max_gen
  #create ppo
  ppo = PPO(policy, [Settings.lr, Settings.lr_stepsize, Settings.lr_gamma], Settings.betas,
            Settings.gamma, Settings.K_epochs, Settings.eps_clip, Settings.batch_size,
            Settings.mini_batch_size, c1=Settings.c1, c2=Settings.c2, start_episode=max_gen - 1)

  #create four players
  players = [RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old)]
  #create a game simulation
  schafkopf_env = SchafkopfEnv(Settings.random_seed)
  game_statistics = GameStatistics()

  # training loop
  for _ in range(0, 90000000):
    Settings.logger.info("playing " +str(Settings.update_games)+ " games")

    # play a bunch of games
    t0 = time.time()
    for _ in range(Settings.update_games):
      state, reward, terminal = schafkopf_env.reset()
      while not terminal:
        action, prob = players[state["game_state"].current_player].act(state)
        state, reward, terminal = schafkopf_env.step(action, prob)
      for p in range(4):
        players[p].retrieve_reward(reward[p])
      i_episode += 1
      game_statistics.update_statistics(state["game_state"], reward)
    t1 = time.time()

    #update the policy
    Settings.logger.info("updating policy")

    player_memories = Memory()
    for p in players:
      player_memories.append_memory(p.memory)

    ppo.update(player_memories, i_episode)
    t2 = time.time()
    ppo.lr_scheduler.step(i_episode)

    # writing game statistics for tensorboard
    Settings.logger.info("Episode: "+str(i_episode) + " game simulation (s) = "+str(t1-t0) + " update (s) = "+str(t2-t1))
    schafkopf_env.print_game()
    game_statistics.write_and_reset(i_episode)

    # reset memories and replace policy
    players = [RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old)]

    # save and evaluate the policy
    Settings.logger.info("Saving Checkpoint")
    torch.save(ppo.policy_old.state_dict(), Settings.checkpoint_folder + "/" + str(i_episode).zfill(8) + ".pt")
    Settings.logger.info("Evaluation")
    play_against_other_players(Settings.checkpoint_folder, Settings.model, [RandomPlayer, RandomCowardPlayer, RuleBasedPlayer], Settings.eval_games,
                               Settings.summary_writer)
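
The newest-checkpoint lookup (parsing the first eight characters of each .pt filename) appears in several of these examples. The self-contained sketch below shows an equivalent idiom using pathlib; it is an illustrative alternative, not part of the original code.

from pathlib import Path

def latest_generation(checkpoint_folder):
  # checkpoints are named NNNNNNNN.pt; return the highest generation number, or 0 if none exist
  generations = [int(p.stem) for p in Path(checkpoint_folder).glob("*.pt")]
  return max(generations) if generations else 0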
Code example #4
def main():

  print("Cuda available: "+str(torch.cuda.is_available()))

  #start tensorboard
  tb = program.TensorBoard()
  tb.configure(argv=[None, '--logdir', Settings.runs_folder])
  tb.launch()

  # set seed for debugging
  if Settings.random_seed:
      torch.manual_seed(Settings.random_seed)

  #loading initial policy
  hand_predictor = HandPredictor().to(Settings.device)
  # take the newest generation available
  i_episode = max_gen = 0
  generations = [int(f[:8]) for f in listdir(Settings.checkpoint_folder) if f.endswith(".pt")]
  if len(generations) > 0:
      max_gen = max(generations)
      hand_predictor.load_state_dict(torch.load(Settings.checkpoint_folder+"/" + str(max_gen).zfill(8) + ".pt"))
      i_episode = max_gen

  optimizer = torch.optim.Adam(hand_predictor.parameters(), lr=Settings.lr, betas=Settings.betas, weight_decay=Settings.optimizer_weight_decay)

  # training loop
  for _ in range(0, 90000000):
    Settings.logger.info("playing " +str(Settings.update_games)+ " games")

    smart_mcts_player = HPPIMCPlayer(30, 120, RandomPlayer(), hand_predictor)
    # create four players: every seat is controlled by the same hand-predictor-guided PIMC player
    players = [smart_mcts_player, smart_mcts_player, smart_mcts_player, smart_mcts_player]
    # create a game simulation
    schafkopf_env = SchafkopfEnv(Settings.random_seed)
    game_statistics = GameStatistics()


    memory_states = []
    memory_player_hands = []

    # play a bunch of games
    t0 = time.time()
    for _ in range(Settings.update_games):
      state, reward, terminal = schafkopf_env.reset()

      while not terminal:
        memory_states.append(hand_predictor.preprocess(state)) #TODO: happens twice now and could be optimized
        memory_player_hands.append(hand_predictor.encode_player_hands(schafkopf_env.player_cards, state["game_state"].current_player))

        action, prob = players[state["game_state"].current_player].act(state)
        state, reward, terminal = schafkopf_env.step(action, prob)

        if state["game_state"].game_type[1] == 2:
          schafkopf_env.print_game()

      print("game "+str(i_episode))
      i_episode += 1
      game_statistics.update_statistics(state["game_state"], reward)
    t1 = time.time()

    #update the policy
    Settings.logger.info("updating policy")
    # Create dataset from collected experiences
    dataset = PredictionDatasetLSTM(memory_states, memory_player_hands)
    training_generator = data.DataLoader(dataset, collate_fn=dataset.custom_collate, batch_size=Settings.mini_batch_size, shuffle=True)

    #logging
    avg_loss = 0
    count = 0

    hand_predictor.train()
    for epoch in range(Settings.K_epochs):  # epoch

      mini_batches_in_batch = int(Settings.batch_size / Settings.mini_batch_size)
      optimizer.zero_grad()

      for i, (states, hands) in enumerate(training_generator):  # mini batch
        # Transfer to GPU
        states = [state.to(Settings.device) for state in states]
        hands = hands.to(Settings.device)
        pred = hand_predictor(states)
        #loss = nn.MSELoss()(pred, hands) #TODO: replace by cross entropy
        loss = nn.BCELoss()(pred, hands)

        avg_loss += loss.mean().item()
        count += 1

        loss.mean().backward()

        if (i + 1) % mini_batches_in_batch == 0:
          optimizer.step()
          optimizer.zero_grad()
    t2 = time.time()
    hand_predictor.eval()

    # writing game statistics for tensorboard
    Settings.logger.info("Episode: "+str(i_episode) + " game simulation (s) = "+str(t1-t0) + " update (s) = "+str(t2-t1))
    schafkopf_env.print_game()
    game_statistics.write_and_reset(i_episode)
    Settings.summary_writer.add_scalar('Loss/BCE_Loss', avg_loss / count, i_episode)

    # save and evaluate the policy
    Settings.logger.info("Saving Checkpoint")
    torch.save(hand_predictor.state_dict(), Settings.checkpoint_folder + "/" + str(i_episode).zfill(8) + ".pt")
    Settings.logger.info("Evaluation")