class GameEnv(object):
    """Reset/step/render wrapper around ``Game`` with frame skipping.

    Each call to :meth:`step` applies the same action for
    ``repeat_frame_skip`` consecutive game ticks and then computes a reward
    from the resulting game state. The state returned by ``Game.state()`` is
    indexed with the keys ``'coord'``, ``'dead'`` and ``'goal'``.
    """

    def __init__(self, level='env/level.csv', repeat_frame_skip=4):
        """Create the underlying game.

        Args:
            level: Passed straight through to ``Game``.
            repeat_frame_skip: Number of game ticks each :meth:`step` call
                repeats the action for. Defaults to 4.
        """
        self.game = Game(level)
        self.repeat_frame_skip = repeat_frame_skip
        # Coordinate observed at the previous reset()/step(); stays None until
        # reset() has been called so step() can detect misuse.
        self.agent_coord = None

    def reset(self):
        """Reset the game and return its initial state."""
        self.game.reset()
        state = self.game.state()
        # Remember the starting coordinate: step() rewards progress relative
        # to the previously observed coordinate.
        self.agent_coord = state['coord']
        return state

    def step(self, action):
        """Apply ``action`` for ``repeat_frame_skip`` ticks and score the result.

        Returns:
            A ``(state, reward, done, info)`` tuple where ``info`` carries the
            ``'goal'`` and ``'dead'`` flags and ``'distance'`` (the first
            component of the current coordinate).

        Raises:
            RuntimeError: If called before :meth:`reset`.
        """
        if self.agent_coord is None:
            raise RuntimeError('reset() must be called before step()')

        # NOTE(review): only the game tick is repeated; the state is read once
        # after the last tick, so a terminal flag raised mid-skip is not
        # checked until the skip completes -- confirm this is intended.
        for _ in range(self.repeat_frame_skip):
            self.game.step(action)

        state = self.game.state()
        dead = state['dead']
        goal = state['goal']
        coord = state['coord']

        # -1 per step, plus the change in coord[0] since the last observation,
        # +100 when the goal flag is set and -100 when the dead flag is set
        # (presumably boolean / 0-1 flags -- they are used arithmetically).
        reward = -1 + (coord[0] - self.agent_coord[0]) + 100 * goal - 100 * dead
        done = dead or goal
        self.agent_coord = coord

        info = {
            'goal': goal,
            'dead': dead,
            'distance': self.agent_coord[0],
        }
        return state, reward, done, info

    def render(self, mode='rgb_array'):
        """Render the game and return the frame with axes 0 and 1 swapped."""
        pixels = self.game.render(mode)
        pixels = np.swapaxes(pixels, 0, 1)
        return pixels
if __name__ == "__main__":
    # Training entry point: builds the RL model, optionally restores a saved
    # model, then runs one game per episode.
    start_iter = 50000
    init_checkpoint = None
    num_epochs = 2000001
    dim_states = 52

    rl = RL(dim_states, lr_a=0.0001, lr_c=0.0001, init_checkpoint=init_checkpoint)

    # Fine-tune: when resuming from a non-zero iteration without an explicit
    # init checkpoint, load the model saved under the name 'rl' at start_iter.
    if start_iter != 0 and not init_checkpoint:
        rl.load_model('rl', start_iter)

    env = Game()
    for episode in range(start_iter, num_epochs):
        env.reset()
        print()
        history_vec = []
        history_pid = []

        while True:
            pid = env.now_player_id

            # Nobody bid for landlord, or the game is over: record all saves.
            if env.landlord_count == 3 or env.winner >= 0:
                # Visit the three players in turn order, starting from the
                # current one, and report each player's reward.
                for _ in range(3):
                    state, f_reward, y_reward, act_ids, dyn_vec, _, label_mask, attn_mask = env.observe(pid)
                    print('玩家', pid, '获得奖励', y_reward)
                    pid = (pid + 1) % 3
                    env.now_player_id = pid
                break

            # NOTE(review): nothing visible here advances the game when the
            # episode is not yet finished, so the non-terminal branch of this
            # loop presumably continues beyond this excerpt -- confirm.