def run_model(game_count=1):
    """ Run the model for game_count games """
    # Make environment
    env = WhaleEnv(
        config={
            'active_player': 0,
            'seed': datetime.utcnow().microsecond,
            'env_num': 1,
            'num_players': 5})
    # Set up agents
    action_num = 3
    agent = SimpleAgent(action_num=action_num, player_num=5)
    agent_0 = RandomAgent(action_num=action_num)
    agent_1 = RandomAgent(action_num=action_num)
    agent_2 = RandomAgent(action_num=action_num)
    agent_3 = RandomAgent(action_num=action_num)
    agents = [agent, agent_0, agent_1, agent_2, agent_3]
    env.set_agents(agents)
    agent.load_pretrained()
    for game in range(game_count):
        # Generate data from the environment
        trajectories = env.run(is_training=False)
        # Print out the trajectories
        print('\nEpisode {}'.format(game))
        for i, trajectory in enumerate(trajectories):
            print('\tPlayer {}'.format(i))
            for t in trajectory:
                print(t)
def experiment():
    np.random.seed(3)

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8,
                                rew=1, gamma=.9)
    action_space = mdp._mdp_info.action_space
    observation_space = mdp._mdp_info.observation_space
    gamma = mdp._mdp_info.gamma

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)
    table = Table(mdp.info.size)
    pi.set_q(table)

    # Agents
    mdp_info_agent1 = MDPInfo(observation_space=observation_space,
                              action_space=spaces.Discrete(5),
                              gamma=1, horizon=20)
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Discrete(5),
                              action_space=action_space,
                              gamma=gamma, horizon=10)
    agent1 = SimpleAgent(name='HIGH', mdp_info=mdp_info_agent1, policy=pi)
    agent2 = SimpleAgent(name='LOW', mdp_info=mdp_info_agent2, policy=pi)

    # Control Blocks
    control_block1 = ControlBlock(wake_time=10, agent=agent1,
                                  n_eps_per_fit=None, n_steps_per_fit=1)
    control_block2 = ControlBlock(wake_time=1, agent=agent2,
                                  n_eps_per_fit=None, n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block1, control_block2]
    order = [0, 1, 2]
    model_block.add_input(control_block2)
    control_block1.add_input(model_block)
    control_block1.add_reward(model_block)
    control_block2.add_input(control_block1)
    control_block2.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=40, quiet=True)
def a_vs_b(ship_a, ship_b, trials, attack_range):
    """Calculate the average number of attacks for ship_a to destroy ship_b.

    Args:
        ship_a (Ship): Attacker.
        ship_b (Ship): Defender.
        trials (int): Number of trials in the average calculation.
        attack_range (str): Attack range ("short", "medium", or "long").
    Returns:
        float: Mean number of attack rolls needed to destroy the defender.
    """
    roll_counts = []
    agent = SimpleAgent()
    for trial in range(trials):
        # Reset ship b for each trial
        ship_b.reset()
        world_state = WorldState()
        world_state.addShip(ship_a, 0)
        world_state.addShip(ship_b, 1)
        num_rolls = 0
        while ship_b.damage_cards() < ship_b.hull():
            num_rolls += 1
            # Handle the attack and receive the updated world state
            world_state = handleAttack(world_state=world_state,
                                       attacker=(ship_a, "front"),
                                       defender=(ship_b, "front"),
                                       attack_range=attack_range,
                                       offensive_agent=agent,
                                       defensive_agent=agent)
        roll_counts.append(num_rolls)
    np_counts = numpy.array(roll_counts)
    return np_counts.mean()
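# A hedged usage sketch for a_vs_b(). Ship construction mirrors the jousting
# snippet further down; the ship names, template keys, and trial count here are
# illustrative assumptions, not values from the original project.
def example_a_vs_b(ship_templates):
    attacker = ship.Ship(name="Attacker", template=ship_templates["Attacker"],
                         upgrades=[], player_number=1)
    defender = ship.Ship(name="Defender", template=ship_templates["Defender"],
                         upgrades=[], player_number=2)
    # Average number of attacks needed to destroy the defender at short range
    return a_vs_b(attacker, defender, trials=250, attack_range="short")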
def test3():
    env = ConnectFourEnv(display=True)
    simpleAgent = SimpleAgent(env, 2, 1)
    state, gameOver, winner = env.act(1, 0)
    state, gameOver, winner = env.act(2, 6)
    time.sleep(0.5)
    state, gameOver, winner = env.act(1, 1)
    state, gameOver, winner = env.act(2, 5)
    time.sleep(0.5)
    state, gameOver, winner = env.act(1, 3)
    action = simpleAgent.getAction(env.state)
    state, gameOver, winner = env.act(2, action)
    time.sleep(5)
def make_env(n_substeps=5, horizon=250, deterministic_mode=False):
    '''
        This make_env function is not used anywhere; it exists to provide a
        simple, bare-bones example of how to construct a multi-agent
        environment using the modules framework.
    '''
    env = Base(n_agents=1, n_substeps=n_substeps, horizon=horizon,
               floor_size=10, grid_size=50,
               deterministic_mode=deterministic_mode, env_no=0,
               action_lims=(-250.0, 250.0))

    # Add Walls
    # env.add_module(RandomWalls(grid_size=5, num_rooms=2, min_room_size=5,
    #                            door_size=5, low_outside_walls=True,
    #                            outside_wall_rgba="1 1 1 0.1"))

    # Add Agents
    first_agent_placement = custom_placement
    agent_placement_fn = [first_agent_placement]
    env.add_module(SimpleAgent(1, placement_fn=agent_placement_fn))
    env.reset()

    keys_self = ['agent_qpos_qvel']
    keys_mask_self = []  # ['mask_aa_obs']
    keys_external = []  # ['agent_qpos_qvel']
    keys_mask_external = []
    keys_copy = []

    env = AddConstantObservationsWrapper(
        env, new_obs={'target_pos': np.full((1, 1), 0.0)})
    keys_self += ['target_pos']
    env = SimpleWrapper(env)
    env = SplitMultiAgentActions(env)
    # env = DiscretizeActionWrapper(env, 'action_movement', nbuckets=21)
    env = SplitObservations(env, keys_self + keys_mask_self, keys_copy=keys_copy)
    env = DiscardMujocoExceptionEpisodes(env)
    env = SelectKeysWrapper(env, keys_self=keys_self,
                            keys_external=keys_external,
                            keys_mask=keys_mask_self + keys_mask_external,
                            flatten=False)
    return env
def train_model(max_episodes=100):
    """
    Trains the agent against NoDraw opponents in the Whale environment
    by trial and error
    :return: None
    """
    # buffer = ReplayBuffer()
    # Make environment
    env = WhaleEnv(
        config={
            'active_player': 0,
            'seed': datetime.utcnow().microsecond,
            'env_num': 1,
            'num_players': 5})
    # Set up agents
    action_num = 3
    agent = SimpleAgent(action_num=action_num, player_num=5)
    agent_0 = NoDrawAgent(action_num=action_num)
    agent_1 = NoDrawAgent(action_num=action_num)
    agent_2 = NoDrawAgent(action_num=action_num)
    agent_3 = NoDrawAgent(action_num=action_num)
    # agent_train = RandomAgent(action_num=action_num)
    agents = [agent, agent_0, agent_1, agent_2, agent_3]
    # train_agents = [agent_train, agent_0, agent_1, agent_2, agent_3]
    env.set_agents(agents)
    agent.load_pretrained()
    min_perf, max_perf = 1.0, 0.0
    for episode_cnt in range(1, max_episodes + 1):
        # print(f'{datetime.utcnow()} train ...')
        loss = agent.train(
            collect_gameplay_experiences(env, agents, GAME_COUNT_PER_EPISODE))
        # print(f'{datetime.utcnow()} eval ...')
        avg_rewards = evaluate_training_result(env, agents, EVAL_EPISODES_COUNT)
        # print(f'{datetime.utcnow()} calc ...')
        if avg_rewards[0] > max_perf:
            max_perf = avg_rewards[0]
            agent.save_weight()
        if avg_rewards[0] < min_perf:
            min_perf = avg_rewards[0]
        print('{0:03d}/{1} perf:{2:.2f}(min:{3:.2f} max:{4:.2f}) '
              'loss:{5:.4f} rewards:{6:.2f} {7:.2f} {8:.2f} {9:.2f}'.format(
                  episode_cnt, max_episodes, avg_rewards[0], min_perf, max_perf,
                  loss[0], avg_rewards[1], avg_rewards[2], avg_rewards[3],
                  avg_rewards[4]))
    # env.close()
    print('training end')
def main(_):
    agent = SimpleAgent()
    try:
        while True:
            with sc2_env.SC2Env(
                    map_name="Simple64",
                    players=[
                        sc2_env.Agent(sc2_env.Race.zerg),
                        sc2_env.Bot(sc2_env.Race.random,
                                    sc2_env.Difficulty.very_easy)
                    ],
                    agent_interface_format=features.AgentInterfaceFormat(
                        action_space=actions.ActionSpace.RAW,
                        use_raw_units=True,
                        raw_resolution=64,
                    ),
            ) as env:
                run_loop.run_loop([agent], env)
    except KeyboardInterrupt:
        pass
def test4():
    env = ConnectFourEnv(display=True)
    simpleAgent1 = SimpleAgent(env, 1, 2)
    simpleAgent2 = SimpleAgent(env, 2, 1)
    state = env.getState()
    while True:
        action1 = simpleAgent1.getAction(state)
        state, gameOver, winner = env.act(1, action1, True)
        time.sleep(0.3)
        if gameOver:
            break
        action2 = simpleAgent2.getAction(state)
        state, gameOver, winner = env.act(2, action2, True)
        time.sleep(0.3)
        if gameOver:
            break
    if winner == -1:
        print 'Game draw'
    else:
        print 'Player %s won' % winner
    time.sleep(5)
U_vi = value_iteration(epsilon=0.001)

"""
Collects and writes the results to a file for the Random Agent and draws the graph
Draws: Mean Reward per Episode vs Episode Number
"""
random_agent = RandomAgent(env_random)
process_data_random(env_random, random_agent, MAX_EPISODES,
                    MAX_ITERS_PER_EPISODE, REWARD_HOLE_SIMPLE, PROBLEM_ID)

"""
Collects and writes the results for the Simple Agent containing data such as
the number of iterations to reach the goal
"""
simple_agent = SimpleAgent(env_simple)
process_data_simple(env_simple, simple_agent, PROBLEM_ID)

"""
Collects and writes the results to a file for the Q-learning Agent and draws the graphs.
Draws: Mean Reward per Episode vs Episode Number
       Utility Values in each State against Episode Number
"""
states = [i for i in range(64)]
q_learning_agent = QLearningAgent(env_qlearn, NE, RPLUS, GAMMA, ALPHA)
U = process_data_q(env_qlearn, q_learning_agent, MAX_EPISODES,
                   MAX_ITERS_PER_EPISODE, states, PROBLEM_ID, REWARD_HOLE_Q)
compare_utils(U_vi, U, 'Value itr', 'Q learning')
            board = self.drop_piece(board, col, piece)
            if self.check_if_winning(board, piece):
                winner = piece
                break
            piece = piece % 2 + 1
        self.agent1.game_over(winner)
        self.agent2.game_over(winner)
        return winner

    def end(self):
        self.agent1.teardown()
        self.agent2.teardown()


# run agents
config = Config(6, 7, 4)
agents = [(RandomAgent(config), "rnd", 5000),
          (SimpleAgent(config), "simple", 5000),
          (OneStepLookaheadAgent(config), "1sla", 5000),
          (OneStepLookaheadAgent(config), "1sla", 5000),
          (OneStepLookaheadAgent(config), "1sla", 5000),
          (NStepsLookaheadAgent(config, 2), "2sla", 3000),
          (NStepsLookaheadAgent(config, 3), "3sla", 5000)]

for agent, agent_name, nruns in agents:
    training = Training(config, agent, CNNAgent(config, Network1(), agent_name))
    for n in range(nruns):
        winner = training.run()
        print("Agent", agent_name, ", game", n, "- player", winner, "wins")
    training.end()
def run_training(
    opponent, mcts_opp, game_state_file, graph_file, model_save_file,
    mcts_iters, temp, tempsteps, lr, discount, memsize,
    num_episodes, num_epochs, batch_size, train_every, save_every,
    graph_every, averaging_window,
    opt_eps=1e-8, ucb_c=1.5, boardsize=8, inputs=20,
    render=False, verbose=False,
):
    env = PommermanEnvironment(
        render=render,
        num_agents=2,
        game_state_file=game_state_file,
    )
    run_settings = RunSettings(
        num_episodes=num_episodes,
        num_epochs=num_epochs,
        batch_size=batch_size,
        train_every=train_every,
        save_every=save_every,
        graph_every=graph_every,
        averaging_window=averaging_window,
        graph_file=graph_file,
        verbose=verbose,
    )
    agent_settings = AgentSettings(
        optimizer=torch.optim.Adam,
        learning_rate=lr,
        opt_eps=opt_eps,
        epsilon_max=0,
        epsilon_min=0,
        epsilon_duration=0,
        verbose=verbose,
    )
    memory = MCTSMemory(buffer_len=memsize, discount=discount)

    if mcts_opp is None:
        mcts_opp = opponent
    if mcts_opp == 'rand':
        opp = pommerman.agents.RandomAgent()
    elif mcts_opp == 'noop':
        opp = PommermanNoopAgent()
    elif mcts_opp == 'simp':
        opp = pommerman.agents.SimpleAgent()
    else:
        raise Exception('Invalid MCTS opponent type', mcts_opp)

    mcts_model = ActorCriticNet(board_size=boardsize, in_channels=inputs)
    agent1 = MCTSAgent(
        mcts_iters=mcts_iters,
        discount=discount,
        c=ucb_c,
        temp=temp,
        tempsteps=tempsteps,
        agent_id=0,
        opponent=opp,
        model_save_file=model_save_file,
        model=mcts_model,
        settings=agent_settings,
        memory=memory,
    )
    agent1.load()

    if opponent == 'rand':
        agent2 = RandomAgent()
    elif opponent == 'noop':
        agent2 = NoopAgent()
    elif opponent == 'simp':
        agent2 = SimpleAgent()
    else:
        raise Exception('Invalid opponent type', opponent)

    experiment = Experiment([agent1, agent2], env, run_settings)
    experiment.train()
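# Hedged example invocation of run_training(). Every value below is an
# illustrative assumption (including passing game_state_file=None for a fresh
# board); consult the project's CLI for its real defaults.
if __name__ == '__main__':
    run_training(
        opponent='simp', mcts_opp=None,
        game_state_file=None, graph_file='training_curve.png',
        model_save_file='mcts_model.pt',
        mcts_iters=100, temp=1.0, tempsteps=20,
        lr=1e-4, discount=0.99, memsize=50000,
        num_episodes=1000, num_epochs=4, batch_size=64,
        train_every=10, save_every=100, graph_every=50,
        averaging_window=100,
    )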
import sys
import platform

from absl import logging
from absl import app
from absl import flags

from pysc2.env import sc2_env
from pysc2.env import run_loop
from pysc2.env import remote_sc2_env

# !!! LOAD YOUR BOT HERE !!!
from simple_agent import SimpleAgent

AGENT = SimpleAgent()
RACE = sc2_env.Race.protoss
STEP_MUL = 8

AGENT_INTERFACE_FORMAT = sc2_env.parse_agent_interface_format(
    feature_screen=84,
    feature_minimap=64,
    rgb_screen=None,
    rgb_minimap=None,
    action_space="FEATURES",  # FEATURES or RGB
    use_feature_units=False)

# Flags
FLAGS = flags.FLAGS
flags.DEFINE_integer("GamePort", None, "GamePort")
flags.DEFINE_integer("StartPort", None, "StartPort")
flags.DEFINE_string("LadderServer", "127.0.0.1", "LadderServer")
flags.DEFINE_string("OpponentId", None, "OpponentId")
print("Unrecognized ship name {}".format(args.ship2)) print("Recognized ship names are:\n") for name in ship_templates.keys(): print("\t{}".format(name)) exit(1) for distance in args.ranges: if distance not in ["long", "medium", "short"]: print("Unknown range for ship combat: {}".format(distance)) sys.exit(1) # Set up logging to track what happens during the die rolling. logging.basicConfig(filename='joust.log', level=logging.DEBUG) # Agent for the simulation agent = SimpleAgent() # Loop through all pairs and have them joust for ship_name_1 in first_ship_names: ship_1 = ship.Ship(name=ship_name_1, template=ship_templates[ship_name_1], upgrades=[], player_number=1) for ship_name_2 in second_ship_names: for attack_range in ["long", "medium", "short"]: # Make sure we are actually rolling dice a_colors, a_roll = ship_1.roll("front", attack_range) if 0 < len(a_colors): roll_counts = [] print("{} vs {} at range {}".format(ship_name_1, ship_name_2, attack_range))
class DecompAgent:
    """
    This agent takes advantage of the problem sub-structure by decomposing the
    root problem into a navigation subproblem (which it solves using a simple
    Q-learning agent) and using hand-crafted heuristics for all other decisions.
    """

    def __init__(self):
        # Only four navigation actions for subproblem actions
        nA = 4
        # Only taxi row, column and destination index for subproblem states
        states_shape = (5, 5, 4)
        self.sub_agent = SimpleAgent(states_shape=states_shape, nA=nA)

        # Learning rate / step size
        self.sub_agent.alpha = 0.01
        self.sub_agent.alpha_decay = 1
        self.sub_agent.alpha_min = 0
        # Discount
        self.sub_agent.gamma = 1
        self.sub_agent.gamma_decay = 1
        self.sub_agent.gamma_min = 0
        # Exploration
        self.sub_agent.epsilon = 0.01
        self.sub_agent.epsilon_decay = 1
        self.sub_agent.epsilon_min = 0

        # For our params, just mimic the sub-agent's
        (self.alpha, self.epsilon, self.gamma) = \
            self.sub_agent.alpha, self.sub_agent.epsilon, self.sub_agent.gamma

        # Environment priors
        self.action_pickup = 4
        self.action_dropoff = 5
        self.locs = [(0, 0), (0, 4), (4, 0), (4, 3)]
        self.passenger_in_taxi_idx = 4

        print("alpha: {0}, alpha_decay: {1}, alpha_min: {2}".format(
            self.sub_agent.alpha, self.sub_agent.alpha_decay,
            self.sub_agent.alpha_min))
        print("gamma: {0}, gamma_decay: {1}, gamma_min: {2}".format(
            self.sub_agent.gamma, self.sub_agent.gamma_decay,
            self.sub_agent.gamma_min))
        print("epsilon: {0}, epsilon_decay: {1}, epsilon_min: {2}".format(
            self.sub_agent.epsilon, self.sub_agent.epsilon_decay,
            self.sub_agent.epsilon_min))

    def select_action(self, state):
        # Override epsilon-greedy exploration for pickup/dropoff
        if self.can_pick_up(state):
            return self.action_pickup
        if self.can_drop_off(state):
            return self.action_dropoff
        # Otherwise, defer to the sub-agent
        transformed_state = self.transform_state(state)
        return self.sub_agent.select_action(transformed_state)

    def step(self, state, action, reward, next_state, done):
        # Transform experience into the problem space of the sub-agent
        # If the selected action was pickup/dropoff, then experience is not
        # relevant to sub-problem
        if action == self.action_pickup or action == self.action_dropoff:
            return
        # If we can pickup/dropoff in the next state, then for the
        # sub-problem we consider next_state to be terminal and the episode
        # concluded
        if self.can_pick_up(next_state) or self.can_drop_off(next_state):
            state_t = self.transform_state(state)
            action_t = self.transform_action(action)
            reward_t = 9  # end of episode reward
            next_state_t = self.transform_state(next_state)
            done_t = True
        # Otherwise, transform relatively unchanged for sub-problem
        else:
            state_t = self.transform_state(state)
            action_t = self.transform_action(action)
            reward_t = -1
            next_state_t = self.transform_state(next_state)
            done_t = False
        # Pass transformed experience to sub-agent
        self.sub_agent.step(state_t, action_t, reward_t, next_state_t, done_t)
        (self.alpha, self.epsilon, self.gamma) = \
            self.sub_agent.alpha, self.sub_agent.epsilon, self.sub_agent.gamma

    def transform_state(self, state):
        """Transform state into the problem space of the sub-agent"""
        taxi_row, taxi_col, pass_idx, dest_idx = self.decode_state(state)
        # If we don't have the passenger, passenger is our destination
        if pass_idx != self.passenger_in_taxi_idx:
            dest_idx_t = pass_idx
        # If we have the passenger, destination is our destination
        else:
            dest_idx_t = dest_idx
        # Encode in subproblem state space and return
        return (taxi_row, taxi_col, dest_idx_t)

    def transform_action(self, action):
        # Action space is the same, minus the final two actions
        assert action != self.action_pickup
        assert action != self.action_dropoff
        return action

    def can_pick_up(self, state):
        taxi_row, taxi_col, pass_idx, dest_idx = self.decode_state(state)
        # Can't pickup if passenger already in taxi
        if pass_idx == self.passenger_in_taxi_idx:
            return False
        # Otherwise, taxi must be colocated with passenger
        return (taxi_row, taxi_col) == self.locs[pass_idx]

    def can_drop_off(self, state):
        taxi_row, taxi_col, pass_idx, dest_idx = self.decode_state(state)
        # Can't dropoff if passenger not in taxi
        if pass_idx != self.passenger_in_taxi_idx:
            return False
        # Otherwise, taxi must be colocated with destination
        return (taxi_row, taxi_col) == self.locs[dest_idx]

    def decode_state(self, i):
        out = []
        out.append(i % 4)
        i = i // 4
        out.append(i % 5)
        i = i // 5
        out.append(i % 5)
        i = i // 5
        out.append(i)
        assert 0 <= i < 5
        taxi_row, taxi_col, pass_idx, dest_idx = reversed(out)
        return taxi_row, taxi_col, pass_idx, dest_idx
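# A hedged usage sketch for DecompAgent. decode_state() and the pickup/dropoff
# action indices match Gym's classic Taxi-v3 encoding, so a Taxi-v3 training
# loop is assumed here (old-style Gym API: reset() returns the state and
# step() returns a 4-tuple); the loop itself is illustrative, not taken from
# the original project.
def example_taxi_training(episodes=10):
    import gym
    env = gym.make("Taxi-v3")
    agent = DecompAgent()
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
    env.close()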
class MCTS:
    def __init__(self, settings):
        self.settings = settings
        self.totalGameNo = settings['total_game_no']
        self.playedGameNo = 0
        self.simStepNo = settings['sim_step_no']
        self.saveStepNo = settings['save_step_no']
        self.display = settings['display']
        self.env = ConnectFourEnv(self.display)
        self.visited = {}        # (stateStr, turn, action), visited
        self.won = {}            # (stateStr, turn, action), won
        self.DRAW = -1
        self.PLAYER = 1
        self.OPP = 2
        self.simpleAgent = SimpleAgent(self.env, self.OPP, self.PLAYER)
        self.winnerResult = {self.DRAW: 0, self.PLAYER: 0, self.OPP: 0}
        self.greedyEpsilon = 0.1
        self.startTime = time.strftime('%Y%m%d_%H%M%S')
        logFile = "output/%s.log" % self.startTime
        util.Logger(logFile)
        self.testMode = False
        self.debugger = DebugInput(self).start()

    def initializeProcesses(self):
        # Multi process jobs
        self.multiCpuNo = self.settings['multi_cpu_no']
        self.queueList = []
        self.processList = []
        self.queueChild2Parent = Queue()
        for i in range(self.multiCpuNo):
            queueParent2Child = Queue()
            self.queueList.append(queueParent2Child)
            #print 'creating a child process[%s]' % i
            p = Process(target=self.simulateOne,
                        args=(i, self.simStepNo / self.multiCpuNo,
                              queueParent2Child, self.queueChild2Parent))
            p.start()
            self.processList.append(p)

    def __getstate__(self):
        # Queues and processes cannot be pickled
        d = dict(self.__dict__)
        del d['queueList']
        del d['processList']
        del d['queueChild2Parent']
        return d

    def printEnv(self):
        print 'Start time: %s' % self.startTime
        print '[ Running Environment ]'
        for key in self.settings.keys():
            print '{} : '.format(key).ljust(30) + '{}'.format(self.settings[key])
        print 'width: %s, height: %s' % (self.env.width, self.env.height)

    def getStateStr(self, state):
        #return np.array_str(state)
        return hash(state.tostring())

    def simulate(self, orgState):
        time1 = time.time()
        for i in range(self.multiCpuNo):
            self.queueList[i].put((orgState, self.visited, self.won))

        finishedChildNo = 0
        for i in range(self.multiCpuNo):
            childID, winnerList, historyList, expandedList = self.queueChild2Parent.get()
            for expandedNode in expandedList:
                if expandedNode not in self.visited:
                    self.visited[expandedNode] = 0
                    self.won[expandedNode] = 0
            for winner, history in zip(winnerList, historyList):
                self.updateTreeInfo(winner, history)
            finishedChildNo += 1
            #print 'simulateOne done %s' % childID
            if finishedChildNo == self.multiCpuNo:
                break
        #print 'all simulateOne finished'
        time2 = time.time()
        #print 'simulate took %.2f sec' % (time2 - time1)

    def simulateOne(self, id, simStepNo, queueParent2Child, queueChild2Parent):
        while True:
            orgState, visited, won = queueParent2Child.get()
            self.visited = visited
            self.won = won

            self.env.reset()
            self.env.setState(orgState)
            self.visited['haha'] = 'dj'

            historyList = []
            winnerList = []
            expandedList = []
            state = orgState.copy()
            turn = self.PLAYER
            history = []
            expanded = False
            for i in range(simStepNo):
                if turn == self.PLAYER:
                    availableActions = self.env.availableActions(state)
                    stateStr = self.getStateStr(state)
                    totalStateVisited = 0
                    # check whether every action was visited before
                    for action in availableActions:
                        stateActionPair = (stateStr, turn, action)
                        if stateActionPair in self.visited:
                            totalStateVisited += self.visited[stateActionPair]
                        else:
                            totalStateVisited = 0
                    if totalStateVisited == 0:
                        action = self.getRandomAction(state)
                    else:
                        maxUpperBound = 0
                        for action in availableActions:
                            stateActionPair = (stateStr, turn, action)
                            won = self.won.get(stateActionPair, 0)
                            visited = max(self.visited.get(stateActionPair, 1), 1)
                            winRatio = float(won) / visited
                            upperBound = winRatio + math.sqrt(
                                2 * math.log(totalStateVisited) / visited)
                            if upperBound >= maxUpperBound:
                                maxUpperBound = upperBound
                                selectedAction = action
                        action = selectedAction
                elif turn == self.OPP:
                    if 'sim_opp_policy' in self.settings \
                            and self.settings['sim_opp_policy'] == 'simple':
                        action = self.simpleAgent.getAction(state)
                    else:
                        action = self.getRandomAction(state)

                stateStr = self.getStateStr(state)
                stateActionPair = (stateStr, turn, action)
                if not expanded and stateActionPair not in self.visited:
                    canExpand = True
                    expanded = True
                else:
                    canExpand = False

                state, gameOver, winner = self.doAction(state, action, turn, history,
                                                        expandedList, canExpand, False)

                if turn == self.PLAYER:
                    turn = self.OPP
                else:
                    turn = self.PLAYER

                if gameOver:
                    self.updateTreeInfo(winner, history)
                    historyList.append(history)
                    winnerList.append(winner)

                    # restart sim
                    self.env.reset()
                    self.env.setState(orgState)
                    state = orgState.copy()
                    turn = self.PLAYER
                    history = []
                    expanded = False
                    continue

            queueChild2Parent.put((id, winnerList, historyList, expandedList))

    def getRandomAction(self, state, availableActions=None):
        if availableActions is None:
            availableActions = self.env.availableActions(state)
        actionIndex = random.randint(0, len(availableActions) - 1)
        return availableActions[actionIndex]

    def getAction(self, state, turn):
        availableActions = self.env.availableActions(state)
        if len(availableActions) == 1:
            return availableActions[0]

        maxAction = -1
        maxWinRatio = 0
        stateStr = self.getStateStr(state)
        for action in availableActions:
            stateActionPair = (stateStr, turn, action)
            if stateActionPair not in self.visited:
                continue
            winRatio = float(self.won.get(stateActionPair, 0)) \
                / max(self.visited.get(stateActionPair, 1), 1)
            if winRatio >= maxWinRatio:
                maxWinRatio = winRatio
                maxAction = action
        return maxAction

    def doAction(self, state, action, turn, history, expandedList, canExpand, display):
        newState, gameOver, winner = self.env.act(turn, action, display)
        stateStr = self.getStateStr(state)
        stateActionPair = (stateStr, turn, action)
        if stateActionPair not in self.visited and canExpand:
            self.visited[stateActionPair] = 0
            self.won[stateActionPair] = 0
            if expandedList is not None:
                expandedList.append(stateActionPair)
        history.append(stateActionPair)
        return newState, gameOver, winner

    def updateTreeInfo(self, winner, history):
        """ Update win result from the current node to the top node """
        for stateActionPair in history:
            if stateActionPair in self.visited:
                self.visited[stateActionPair] += 1
                _, turn, _ = stateActionPair
                if turn == winner:
                    self.won[stateActionPair] += 1

    def printHistory(self, history):
        step = 0
        print '\n[ history ]'
        for stateActionPair in history:
            stateStr, turn, action = stateActionPair
            if stateActionPair in self.visited:
                visited = self.visited[stateActionPair]
                won = self.won[stateActionPair]
            else:
                visited = 0
                won = 0
            print 'step[%s] turn=%s, action=%s, visited=%s, won=%s' % \
                (step, turn, action, visited, won)
            step += 1
        print ''

    def printResult(self):
        print 'total states: %s' % len(self.visited)

    def save(self, step):
        if not os.path.exists('snapshot'):
            os.makedirs('snapshot')
        fileName = 'snapshot/mcts_%s' % step
        with open(fileName + '.pickle', 'wb') as f:
            pickle.dump(self, f)

    def gogo(self):
        self.initializeProcesses()

        lastResult = []
        lastResultWin = 0
        for i in range(self.totalGameNo):
            self.env.reset()
            state = self.env.getState()
            history = []
            turn = random.randint(self.PLAYER, self.OPP)
            startTime = time.time()
            while True:
                if turn == self.PLAYER:
                    self.simulate(state)
                    if self.settings['player_action'] == 'egreedy':
                        action = self.getActionEGreedy(state, self.PLAYER)
                    else:
                        action = self.getAction(state, self.PLAYER)
                elif turn == self.OPP:
                    if self.settings['opponent'] == 'user':
                        action = self.env.getManualAction(state)
                    else:
                        action = self.simpleAgent.getAction(state)

                state, gameOver, winner = self.doAction(state, action, turn,
                                                        history, None, True, True)
                if gameOver:
                    break

                if turn == self.PLAYER:
                    turn = self.OPP
                else:
                    turn = self.PLAYER

            elapsed = time.time() - startTime

            if self.settings['opponent'] == 'user':
                self.env.showWinner(winner)

            self.playedGameNo += 1
            self.winnerResult[winner] += 1

            if winner == -1:
                print 'Game draw'
            else:
                self.updateTreeInfo(winner, history)

            if winner == self.PLAYER:
                lastResultWin += 1
            if len(lastResult) == 100:
                todel = lastResult.pop(0)
                if todel == 1:
                    lastResultWin -= 1
            lastResult.append(winner)
            lastRatio = float(lastResultWin) * 100 / len(lastResult)

            #self.printResult()
            winRatio = float(self.winnerResult[self.PLAYER]) * 100 \
                / (self.winnerResult[self.OPP] + self.winnerResult[self.PLAYER])
            if winner == 1:
                winStr = 'Win'
            else:
                winStr = 'Lose'
            print 'Game %s : %s, %s, total=%.0f%%, last 100=%.0f%%, %.1fs' % \
                (self.playedGameNo, self.winnerResult, winStr, winRatio,
                 lastRatio, elapsed)

            if self.playedGameNo % self.saveStepNo == 0:
                self.save(self.playedGameNo)
                #time.sleep(5)

        self.debugger.finish()