Example #1
    def acts(self, states):
        # Assumes: import torch, import torch.distributions as dist, and the
        # project's Translator board/move encoder.
        # Encode every board into a single observation batch.
        obs = torch.cat(
            tuple(Translator.encode_board(s).unsqueeze(0) for s in states), 0)
        # Legal move indices for each state, as long tensors.
        valids = [
            torch.tensor(list(
                map(Translator.encode_move_idx, s.micro_legal_moves)),
                         dtype=torch.long) for s in states
        ]
        # The network's first output is the policy over all move indices.
        policy = self.net(obs)[0]
        # Keep only the probabilities of legal moves and renormalize them.
        prob = [policy[i, valids[i]] for i in range(len(states))]
        prob = [prob[i] / prob[i].sum() for i in range(len(states))]
        # Sample one legal move index per state.
        idxs = [
            valids[i][dist.Categorical(prob[i]).sample()]
            for i in range(len(states))
        ]

        # Decode the sampled indices back into concrete moves.
        return [
            Translator.decode_move(idxs[i].item(), states[i])
            for i in range(len(states))
        ]
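
The key step in acts is masking the network's policy down to each state's legal moves and sampling from the renormalized distribution. Below is a minimal, self-contained sketch of just that step; the policy shape, the two-state batch, and the legal-move indices are made-up values for illustration, not the project's actual encoding.

import torch
import torch.distributions as dist

# Hypothetical batch: a softmax policy over 5 move indices for 2 states,
# plus made-up legal move indices for each state.
policy = torch.softmax(torch.randn(2, 5), dim=1)
valids = [torch.tensor([0, 3]), torch.tensor([1, 2, 4])]

for i in range(policy.size(0)):
    legal = policy[i, valids[i]]        # keep only legal-move probabilities
    legal = legal / legal.sum()         # renormalize over the legal moves
    move_idx = valids[i][dist.Categorical(legal).sample()]
    print(move_idx.item())              # sampled legal move index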
Example #2
    def extract_episodes(self):
        # Assumes module-level episode_cnt / episode_length constants and the
        # project's collect_episodes, ReinforcementAI, and Translator helpers.
        collected = collect_episodes(
            ReinforcementAI(self.judge), episode_cnt, episode_length)

        episodes = []

        for c in collected:
            # Batch of encoded board observations for this episode.
            obs = torch.cat(
                tuple(Translator.encode_board(b).unsqueeze(0)
                      for b in c['boards']), 0)
            # Index of the move that was played at each step.
            idx = torch.tensor(
                tuple(Translator.encode_move_idx(m) for m in c['moves']),
                dtype=torch.long)
            # Probability the judge network assigned to each played move.
            prob = self.judge(obs)[0][
                torch.arange(idx.size(0), dtype=torch.long), idx]
            ext = c['extrinsic']
            episodes.append(
                {'obs': obs, 'idx': idx, 'prob': prob, 'ext': ext,
                 'over': c['boards'][-1].is_game_over()})

        return episodes
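
Example #2 relies on advanced indexing with torch.arange to pull out, for every step of an episode, the probability the network assigned to the move that was actually played. Here is a small stand-alone sketch of that indexing trick with made-up shapes and indices, assuming only PyTorch.

import torch

# 4 steps, 6 possible moves: made-up policy outputs and the move index
# that was actually played at each step.
probs = torch.softmax(torch.randn(4, 6), dim=1)
idx = torch.tensor([2, 0, 5, 1], dtype=torch.long)

# Row i, column idx[i] for every i: the probability of each played move.
taken = probs[torch.arange(idx.size(0), dtype=torch.long), idx]  # shape (4,)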