def simulate_many_games(policy1, policy2, positions):
    """Play out many games simultaneously, batching network evaluations so
    the GPU scores all in-progress games in one call.

    policy1 plays black; policy2 plays white. Positions are mutated in
    place and also returned.
    """
    # NOTE: assumes every position is on the same move number — may not
    # hold if multiple MCTS branches are being explored in parallel.
    while positions[0].n <= POLICY_CUTOFF_DEPTH + POLICY_FINISH_MOVES:
        buckets = {go.BLACK: [], go.WHITE: []}
        for pos in positions:
            if pos.to_play in buckets:
                buckets[pos.to_play].append(pos)
        for net, batch in ((policy1, buckets[go.BLACK]), (policy2, buckets[go.WHITE])):
            probs_batch = net.run_many(bulk_extract_features(batch))
            for probs, pos in zip(probs_batch, batch):
                board_probs = np.reshape(probs, (go.N, go.N))
                # Sample moves early in the game for variety; afterwards
                # commit to the network's top choice.
                pick = select_weighted_random if pos.n < 30 else select_most_likely
                move = pick(pos, board_probs)
                pos.play_move(move, mutate=True, move_prob=probs)
    # Past the policy cutoff, finish each game with random rollouts.
    for pos in positions:
        simulate_game_random(pos)
    return positions
def run_many(self, positions):
    """Evaluate a batch of positions with the policy/value network.

    Args:
        positions: iterable of board positions to evaluate.

    Returns:
        Tuple of (move_probabilities, value), where move_probabilities is
        reshaped to [batch, img_row, img_col].
    """
    imgs = features.bulk_extract_features(positions)
    # Rescale feature plane 16 from {0, 1} to {-1, 1}.
    # (Fixed: the original wrote imgs[:][..., 16], where the leading [:]
    # is a redundant no-op view — indexing the array directly is the
    # idiomatic, equivalent form.)
    # NOTE(review): presumably plane 16 is a binary to-play/komi plane —
    # confirm against the feature extractor.
    imgs[..., 16] = (imgs[..., 16] - 0.5) * 2
    move_probabilities, value = self.sess.run(
        [self.model.predictions, self.model.value],
        feed_dict={self.img: imgs})
    return move_probabilities.reshape([-1, self.img_row, self.img_col]), value
def from_positions_w_context(positions_w_context, is_test=False):
    """Build a DataSet from (position, next_move, result) triples."""
    positions, next_moves, results = zip(*positions_w_context)
    return DataSet(
        bulk_extract_features(positions),
        make_onehot(next_moves),
        results,
        is_test=is_test)
def suggest_move_prob(self, position, iters=2):
    """Run `iters` tree-search iterations from `position` and return the
    resulting move probabilities."""
    started = time.time()
    # A parentless node is the true root right after initialization with
    # None: seed it with one policy evaluation before searching.
    if self.parent is None:
        priors, _ = self.policy_network.run_many(
            bulk_extract_features([position]))
        self.position = position
        self.expand(priors[0])
    self.tree_search(iters=iters)
    elapsed = time.time() - started
    print(f"Searched {iters} iters for {elapsed} seconds", file=sys.stderr)
    return self.move_prob()
def reinforce(self, positions, direction):
    """Reinforce a self-play result.

    Args:
        positions: positions from the self-played game to train on.
        direction: +1 for positive reinforcement (a win), -1 for negative.
    """
    imgs = features.bulk_extract_features(positions)
    feed_dict = {self.img: imgs, self.reinforce_dir: direction}
    _, l, summary, temp, global_norm = self.sess.run(
        [self.model.train_op, self.model.cost, self.merged,
         self.model.temp, self.model.norm],
        feed_dict=feed_dict)
    # BUG FIX: the original passed an undefined name `i` as the summary
    # step (NameError at runtime) — log against the model's global step.
    step = self.sess.run(self.model.global_step)
    self.train_writer.add_summary(summary, step)
    self.sess.run(self.model.increase_global_step)
    # (Typo fixed in the log message: "Temperatur" -> "Temperature".)
    print('Self-play reinforcement direction {} | Training loss {:.2f} | '
          'Temperature {:.2f} | Magnitude of global norm {} | Total step {}'.format(
              direction, l, temp, global_norm,
              self.sess.run(self.model.global_step)))
def start_tree_search(self):
    """One MCTS simulation: descend to a leaf, evaluate/expand it, and back
    the value up the path.

    Returns:
        The leaf value negated, i.e. from the parent node's perspective.
    """
    # Add virtual loss so concurrent searches are discouraged from
    # descending into this same branch.
    self.virtual_loss_do()
    if not self.is_expanded():  # leaf node
        position = self.compute_position()
        # lift virtual loss
        self.virtual_loss_undo()
        if position is None:
            #print("illegal move!", file=sys.stderr)
            # See go.Position.play_move for notes on detecting legality
            # In Go, illegal move means loss (or resign): record -1 here
            # and hand the negation (+1) back to the parent.
            self.backup_value_single(-1)
            return -1 * -1
        #print(f"Investigating following position:\n{position} at height {self.tree_heigh}", file=sys.stderr)
        # NOTE(review): this sleep looks like a debugging/throttling
        # artifact — confirm whether it is still needed.
        sleep(0.1)
        move_probs, value = self.policy_network.run_many(
            bulk_extract_features([position]))
        #self.expand(dirichlet([1]*362))
        self.expand(move_probs[0])
        self.backup_value_single(value[0, 0])
        return value[0, 0] * -1
    else:
        '''
        all_action_score = map(lambda node: node.action_score, self.children.values())
        move2QU = {move:action_score for move,action_score in zip(self.children.keys(),all_action_score)}
        select_move = max(move2QU, key=move2QU.get)
        value = self.children[select_move].start_tree_search()
        self.backup_value_single(value)
        '''
        # Score each child by Q + U, with the exploration term scaled by
        # 0.75 + 0.25 * noise / prior — presumably approximating AlphaGo
        # Zero's Dirichlet-noise prior mix (TODO confirm); 362 assumes a
        # 19x19 board plus pass. 1e-8 guards division by a zero prior.
        all_action_score = map(lambda zipped: zipped[0].Q + zipped[0].U*(0.75+0.25*(zipped[1])/(zipped[0].prior+1e-8)),\
            zip(self.children.values(),dirichlet([0.03]*362)))
        move2action_score = {
            move: action_score
            for move, action_score in zip(self.children.keys(), all_action_score)
        }
        select_move = max(move2action_score, key=move2action_score.get)
        #print(f'Children move {select_move} with action score {move2action_score[select_move]}')
        # Recurse into the best child; its return value is already from
        # this node's perspective.
        value = self.children[select_move].start_tree_search()
        # lift virtual loss
        self.virtual_loss_undo()
        self.backup_value_single(value)
        return value * -1
def multi_tree_search(self, root, iters=1600):
    """Run `iters` MCTS simulations with a process pool: select leaves in
    parallel, evaluate them with the network in batches of 8, then expand
    and back up the values.

    Bug fixes vs. the original:
      * `results` was pre-filled with `iters` Nones and then appended to,
        so `results[i].get()` always hit a None — build it by comprehension.
      * `perspectives` was never initialized (the per-leaf value was
        mistakenly assigned to `perspective = []`).
      * `pool.map` was called with 2-argument callables over zipped tuples
        (a TypeError) — use `pool.starmap`, which unpacks the tuples.

    NOTE(review): multiprocessing.Pool cannot pickle lambdas on most
    platforms; `select`/`expand`/`backup` likely need to be module-level
    functions for this to run at all — confirm.
    """
    print("tree search", file=sys.stderr)
    pool = Pool()
    # --- selection ---
    select = lambda root: root.select_leaf_dirichlet()
    results = [pool.apply_async(select, args=(root,)) for _ in range(iters)]
    chosen_leaves = []
    for res in results:
        chosen_leaf = res.get()
        position = chosen_leaf.compute_position()
        if position is None:
            print("illegal move!", file=sys.stderr)
            # See go.Position.play_move for notes on detecting legality
            del chosen_leaf.parent.children[chosen_leaf.move]
            continue
        chosen_leaves.append(chosen_leaf)
        print("Investigating following position:\n%s" % (chosen_leaf.position,), file=sys.stderr)
    # --- evaluation ---
    expand = lambda leaf, probs: leaf.expand(probs)
    backup = lambda leaf, value: leaf.backup_value(value)
    for batch in list(split(range(len(chosen_leaves)), 8)):
        batch_leaves = [chosen_leaves[i] for i in batch]
        leaf_positions = [leaf.position for leaf in batch_leaves]
        move_probs, values = self.policy_network.evaluate_node(
            bulk_extract_features(leaf_positions, dihedral=True))
        # Flip each value so it is from the root player's perspective.
        perspectives = [
            1 if leaf_position.to_play == root.position.to_play else -1
            for leaf_position in leaf_positions
        ]
        values = values * np.asarray(perspectives)
        # --- expansion & backup ---
        pool.starmap(expand, zip(batch_leaves, move_probs))
        pool.starmap(backup, zip(batch_leaves, values))
        for value in values:
            print("value: %s" % value, file=sys.stderr)
    pool.close()
    pool.join()
    sys.stderr.flush()
def suggest_move_prob(self, position, iters=1600):
    """Async tree search controller: launch `iters` search coroutines plus
    the prediction worker on the shared event loop, then return the move
    probabilities."""
    global LOOP
    start = time.time()
    # A parentless node is a fresh root: seed it with one network
    # evaluation before searching.
    if self.parent is None:
        priors, _ = self.api.run_many(bulk_extract_features([position]))
        self.position = position
        self.expand(priors[0])
    tasks = [self.tree_search() for _ in range(iters)]
    tasks.append(self.api.prediction_worker())
    LOOP.run_until_complete(asyncio.gather(*tasks))
    logger.debug(f"Searched for {(time.time() - start):.5f} seconds")
    return self.move_prob()
def from_positions_w_context(positions_w_context, is_test=False, extract_move_prob=False):
    """Build a DataSet whose labels are the game result seen from each
    position's side to move: +1 if the player to move won, else -1."""
    positions, next_moves, results = zip(*positions_w_context)
    extracted_features = bulk_extract_features(positions)
    encoded_moves = (np.asarray(next_moves) if extract_move_prob
                     else make_onehot(next_moves))
    wrt_result = []
    for pos, meta in zip(positions, results):
        black_won = 'B' in meta.result
        black_to_play = pos.to_play == 1
        # +1 when the mover's color matches the winner, -1 otherwise.
        wrt_result.append(1 if black_to_play == black_won else -1)
    return DataSet(extracted_features, encoded_moves, wrt_result, is_test=is_test)
def from_positions_w_context(positions_w_context, is_test=False, extract_move_prob=False):
    """Build a DataSet labeled with per-move results.

    `results` entries are metadata(result, handicap, boardsize) records;
    the label for move i is 1 when the player who made it went on to win
    the game, else 0.

    NOTE(review): the alternating-turn reconstruction assumes black moves
    first and there are no passes or handicap placements — confirm with
    the upstream SGF parsing.
    """
    positions, next_moves, results = zip(*positions_w_context)
    extracted_features = bulk_extract_features(positions)
    if extract_move_prob:
        encoded_moves = np.asarray(next_moves)
    else:
        encoded_moves = make_onehot(next_moves)
    # whowin: +1 if black won, -1 otherwise; turn alternates starting
    # from black (+1).
    whowin, turn = 1 if 'B' in results[0].result else -1, 1
    # BUG FIX: the original sized this list with len(self.results), but
    # `self` is undefined in this free function — use the unpacked results.
    wrt_result = [None] * len(results)
    for i in range(len(wrt_result)):
        wrt_result[i] = int(whowin == turn)
        turn *= -1
    return DataSet(extracted_features, encoded_moves, wrt_result, is_test=is_test)
def start_tree_search(self):
    """One MCTS simulation: descend to a leaf, evaluate/expand it, and back
    the value up the path. Virtual loss is applied to the selected child
    only for the duration of its recursive search.

    Returns:
        The leaf value negated, i.e. from the parent node's perspective.
    """
    if not self.is_expanded():  # leaf node
        position = self.compute_position()
        if position is None:
            #print("illegal move!", file=sys.stderr)
            # See go.Position.play_move for notes on detecting legality
            # In Go, illegal move means loss (or resign): record -1 here
            # and hand the negation (+1) back to the parent.
            self.backup_value_single(-1)
            return -1 * -1
        #print("Investigating following position:\n%s" % (position), file=sys.stderr)
        move_probs, value = self.policy_network.run_many(
            bulk_extract_features([position]))
        self.expand(move_probs[0])
        self.backup_value_single(value[0, 0])
        return value[0, 0] * -1
    else:
        '''
        all_action_score = map(lambda node: node.action_score, self.children.values())
        move2QU = {move:action_score for move,action_score in zip(self.children.keys(),all_action_score)}
        select_move = max(move2QU, key=move2QU.get)
        value = self.children[select_move].start_tree_search()
        self.backup_value_single(value)
        '''
        # Score each child by Q + U, with the exploration term scaled by
        # 0.75 + 0.25 * noise / prior — presumably approximating AlphaGo
        # Zero's Dirichlet-noise prior mix (TODO confirm); 362 assumes a
        # 19x19 board plus pass. 1e-8 guards division by a zero prior.
        all_action_score = map(lambda zipped: zipped[0].Q + zipped[0].U*(0.75+0.25*(zipped[1])/(zipped[0].prior+1e-8)),\
            zip(self.children.values(),dirichlet([0.03]*362)))
        move2action_score = {
            move: action_score
            for move, action_score in zip(self.children.keys(), all_action_score)
        }
        select_move = max(move2action_score, key=move2action_score.get)
        # Hold a virtual loss on the chosen child while it is searched so
        # parallel simulations spread over other branches.
        self.children[select_move].virtual_loss(add=True)
        value = self.children[select_move].start_tree_search()
        self.children[select_move].virtual_loss(add=False)
        self.backup_value_single(value)
        return value * -1
def suggest_move(self, position):
    """Choose a move for `position` by sampling from the policy network."""
    probs = self.policy_network.run_many(
        bulk_extract_features([position]))[0][0]
    # Drop the trailing off-board entry (presumably the pass move) and lay
    # the remaining probabilities out on the board grid.
    board_probs = np.reshape(probs[:-1], (go.N, go.N))
    return select_weighted_random(position, board_probs)