# Demo/replay loop: load a pretrained A2C opponent's weights and keep playing
# games on the Pygame board until the window is closed.
agent_2 = A2C_agent(input_dim=3, lam=0.8, gamma=0.99, lr=1e-4)
param = torch.load('a2c_2_param2.pt')
agent_2.model.load_state_dict(param)

while True:  # idiomatic form of `while (1)`
    for event in pygame.event.get():
        # Exit cleanly when the close button is pressed.
        if event.type == QUIT:
            pygame.display.quit()  # close the Pygame window
            pygame.quit()
            sys.exit()

    # black 1 white 2
    board, changeable_Pos, Position_Row, Position_Col, Change_Position, done = othello.make()
    game.updateBoard(board)

    # Play one game to completion; 100 is a safety cap on the number of plies
    # (presumably more than any Othello game needs — othello.step sets `done`).
    for _ in range(100):
        setrow, setcol = agent_1.take_determ_action(
            board, changeable_Pos, Position_Row, Position_Col, Change_Position)
        board, changeable_Pos, Position_Row, Position_Col, Change_Position, done = othello.step(
            setrow, setcol)
        game.updateBoard(board)
        if done:
            time.sleep(0.5)  # brief pause so the final position is visible
            break
def get_play_data(agent_1, agent_2):
    """Play one full Othello game between two agents and collect trajectories.

    agent_1 moves when ``othello.color == 1`` (black), agent_2 otherwise
    (white).  States/rewards/actions are appended left-most-first, so index 0
    of every tensor corresponds to the LAST (terminal) step of the game.

    Args:
        agent_1: black-side agent; must expose ``take_action(...)``.
        agent_2: white-side agent; must expose ``take_action(...)``.

    Returns:
        (data_first, data_second): one dict per player with keys
        'states', 'rewards', 'actions', 'values' — all torch.FloatTensor.
        Rewards drop the oldest entry (the pre-first-move reward), so they
        align one step ahead of the states.  Values are a flat win/loss
        signal: +1 per step for the winner, -1 for the loser, 0 on a draw
        (discount_rate is 1, so no actual decay is applied).
    """
    othello = Othello()
    first_states, first_rewards, first_actions = deque(), deque(), deque()
    second_states, second_rewards, second_actions = deque(), deque(), deque()

    board, changeable_Pos, Position_Row, Position_Col, Change_Position, done = othello.make()

    while not done:
        # Select the buffers and agent for whoever owns the current color.
        if othello.color == 1:
            agent, states, rewards, actions = agent_1, first_states, first_rewards, first_actions
            reward_idx = 0
        else:
            agent, states, rewards, actions = agent_2, second_states, second_rewards, second_actions
            reward_idx = 1

        state = getState(board, changeable_Pos, Position_Row, Position_Col, Change_Position)
        states.appendleft(state)
        rewards.appendleft(getReward(board)[reward_idx])
        setrow, setcol = agent.take_action(board, changeable_Pos,
                                           Position_Row, Position_Col, Change_Position)
        board, changeable_Pos, Position_Row, Position_Col, Change_Position, done = othello.step(
            setrow, setcol)
        actions.appendleft(8 * setrow + setcol)  # flatten (row, col) -> 0..63

    # Both players record the terminal state and their final reward.
    state = getState(board, changeable_Pos, Position_Row, Position_Col, Change_Position)
    reward_1, reward_2 = getReward(board)
    first_states.appendleft(state)
    second_states.appendleft(state)
    first_rewards.appendleft(reward_1)
    second_rewards.appendleft(reward_2)

    # Convert deques to lists before tensor construction (FloatTensor on a raw
    # deque is fragile); [:-1] drops the oldest reward so rewards pair with the
    # successor state.
    first_states = torch.FloatTensor(list(first_states))
    second_states = torch.FloatTensor(list(second_states))
    first_actions = torch.FloatTensor(list(first_actions))
    second_actions = torch.FloatTensor(list(second_actions))
    first_rewards = torch.FloatTensor(list(first_rewards)[:-1])
    second_rewards = torch.FloatTensor(list(second_rewards)[:-1])

    # Game-outcome value targets: winner +1 per step, loser -1, draw 0.
    discount_rate = 1
    if reward_1 > reward_2:
        first_sign, second_sign = 1.0, -1.0
    elif reward_1 < reward_2:
        first_sign, second_sign = -1.0, 1.0
    else:
        first_sign = second_sign = 0.0
    first_values = torch.FloatTensor(
        [first_sign * discount_rate**i for i in range(len(first_rewards))])
    second_values = torch.FloatTensor(
        [second_sign * discount_rate**i for i in range(len(second_rewards))])

    data_first = {
        'states': first_states,
        'rewards': first_rewards,
        'actions': first_actions,
        'values': first_values,
    }
    data_second = {  # fixed local-name typo ("secound"); returned positionally
        'states': second_states,
        'rewards': second_rewards,
        'actions': second_actions,
        'values': second_values,
    }
    return data_first, data_second