def update_variables():
    """Read the game-wide settings from the ``MainInfo`` config section.

    Returns:
        tuple: ``(MAX_HIT_POINTS, MAX_ANGLE, MAX_VELOCITY, BOARD_WIDTH,
        BOARD_HEIGHT)``. ``MAX_ANGLE`` is the configured whole number of
        degrees converted to radians; the other values are ``int``.
    """
    # Fetch the section once instead of once per key.
    main_info = get_config("MainInfo")

    max_hit_points = int(main_info['max_hit_points'])
    max_angle = int(main_info['max_angle']) * math.pi / 180  # degrees -> radians
    max_velocity = int(main_info['max_velocity'])
    board_width = int(main_info['board_width'])
    board_height = int(main_info['board_height'])
    return max_hit_points, max_angle, max_velocity, board_width, board_height
def update_variables():
    """Refresh the ``Zombie`` class-level settings from the ``MainInfo`` config.

    Re-reads every config-backed static field of ``Zombie`` and recomputes
    ``Zombie.START_POSITIONS`` from the refreshed board size and angle.
    ``Zombie.ANGLE`` is stored exactly as configured (as a ``float``); no
    unit conversion happens here.
    """
    # Fetch the section once instead of once per key.
    main_info = get_config("MainInfo")

    Zombie.BOARD_HEIGHT = int(main_info['board_height'])
    Zombie.BOARD_WIDTH = int(main_info['board_width'])
    Zombie.LIGHT_SIZE = int(main_info['light_size'])
    # MAX_HIT_POINT is also a config-backed static field of Zombie; refresh it
    # with the others so it cannot go stale after a config change.
    Zombie.MAX_HIT_POINT = int(main_info['max_hit_points'])
    Zombie.DT = int(main_info['dt'])
    Zombie.ANGLE = float(main_info['max_angle'])
    Zombie.START_POSITIONS = calculate_start_positions(
        Zombie.BOARD_WIDTH, Zombie.BOARD_HEIGHT, Zombie.ANGLE)
def __init__(self, device, agent_type):
    """Build a DDQN agent: load hyper-parameters, networks, optimizer and memory.

    :param device: device handed to ``create_networks`` and stored on the agent
    :param agent_type: agent role string forwarded to the base class and to
        ``create_networks``
    """
    # the agent explores with the 'EpsilonGreedyStrategy' strategy
    exploration = EpsilonGreedyStrategy()
    super().__init__(strategy=exploration, agent_type=agent_type)

    # --- values loaded from the config ---
    main_section = get_config("MainInfo")
    self.LIGHT_SIZE = int(main_section['light_size'])

    hyper_params = get_config('DdqnAgentInfo')
    # (attribute / config key, converter) pairs, read in this exact order
    conversions = (
        ('batch_size', int),
        ('gamma', float),
        ('memory_size', int),
        ('target_update', int),
        ('lr', float),
    )
    for field_name, convert in conversions:
        setattr(self, field_name, convert(hyper_params[field_name]))

    # --- networks ---
    # NOTE(review): self.possible_actions is not assigned in this method; it is
    # presumably provided by the base class - confirm.
    networks = create_networks(device, agent_type, self.possible_actions)
    self.num_actions, self.target_net, self.policy_net = networks

    # --- remaining fields ---
    trainable = self.policy_net.parameters()
    self.optimizer = optim.Adam(params=trainable, lr=self.lr)
    self.memory = ReplayMemory()
    self.current_step = 0
    self.device = device
def __init__(self, device, agent_type):
    """Create an MCTS agent with an empty search tree and a worker pool.

    :param device: accepted for interface parity with the other agents; it is
        not used by this constructor
    :param agent_type: ``'zombie'`` or ``'light'``; decides the action space
    """
    # refresh the class-level settings from the config before using them
    (BasicMCTSAgent.MAX_HIT_POINTS, BasicMCTSAgent.MAX_ANGLE,
     BasicMCTSAgent.MAX_VELOCITY, BasicMCTSAgent.BOARD_WIDTH,
     BasicMCTSAgent.BOARD_HEIGHT, BasicMCTSAgent.C) = update_variables()
    super().__init__(EpsilonGreedyStrategy(), agent_type)

    # Size the action space from the values refreshed just above rather than
    # from Game's class attributes, which are only refreshed when a Game is
    # constructed. The zombie agent picks a row, the light agent picks a cell.
    if self.agent_type == 'zombie':
        num_actions = BasicMCTSAgent.BOARD_HEIGHT
    else:
        num_actions = BasicMCTSAgent.BOARD_HEIGHT * BasicMCTSAgent.BOARD_WIDTH
    self.possible_actions = list(range(num_actions))

    self.root = Node([], self.possible_actions)
    # TODO - change its name to something like: real world state-node
    self.temporary_root = self.root
    self.current_step = 0
    self.simulation_reward = 0

    # fetch the section once instead of once per key
    tree_info = get_config("TreeAgentInfo")
    # number of simulations in the simulation phase
    self.simulation_num = int(tree_info['simulation_num'])
    # number of times to expand a node in a single simulation
    self.simulation_depth = int(tree_info['simulation_depth'])

    self.episode_reward = 0
    self.tree_depth = 0
    self.pool = mp.Pool(mp.cpu_count())

    main_info = get_config('MainInfo')
    self.steps_per_episodes = (int(main_info['zombies_per_episode'])
                               + int(main_info['board_width']))
    self.total_episodes = (int(main_info['num_train_episodes'])
                           + int(main_info['num_test_episodes']))
class RandomAgent(Agent):
    """Baseline agent that picks a uniformly random action at every step."""

    # import-time defaults; refreshed from the config in __init__
    BOARD_WIDTH = int(get_config("MainInfo")['board_width'])
    BOARD_HEIGHT = int(get_config("MainInfo")['board_height'])

    def __init__(self, device, agent_type):
        """Create the agent and build its action space.

        :param device: accepted for interface parity with the other agents;
            it is not used by this constructor
        :param agent_type: ``'zombie'`` (one action per board row) or any
            other value (one action per board cell)
        """
        RandomAgent.BOARD_WIDTH, RandomAgent.BOARD_HEIGHT = update_variables()
        super().__init__(EpsilonGreedyStrategy(), agent_type)
        self.current_step = 0
        if self.agent_type == 'zombie':
            num_actions = RandomAgent.BOARD_HEIGHT
        else:
            num_actions = RandomAgent.BOARD_HEIGHT * RandomAgent.BOARD_WIDTH
        self.possible_actions = list(range(num_actions))

    def select_action(self, state, alive_zombies, writer=None):
        """Return ``(action, exploration_rate, current_step)`` for this step.

        ``state``, ``alive_zombies`` and ``writer`` are ignored; ``writer`` is
        accepted (optionally) because the game loop passes it to every agent.
        """
        rate = self.strategy.get_exploration_rate(current_step=self.current_step)
        self.current_step += 1
        return random.choice(self.possible_actions), rate, self.current_step

    def learn(self, state, action, next_state, reward, writer=None):
        """No-op: a random agent has nothing to learn."""
        pass

    def reset(self):
        """No-op: the agent keeps no per-episode state."""
        pass
def update_variables():
    """Read the settings used by the MCTS agent from the config.

    Returns:
        tuple: ``(MAX_HIT_POINTS, MAX_ANGLE, MAX_VELOCITY, BOARD_WIDTH,
        BOARD_HEIGHT, C)``. The first five come from ``MainInfo`` as ``int``
        (``MAX_ANGLE`` is returned as configured, with no unit conversion);
        ``C`` is the ``exploration_const`` of ``TreeAgentInfo`` as ``float``.
    """
    # Fetch the section once instead of once per key.
    main_info = get_config("MainInfo")

    max_hit_points = int(main_info['max_hit_points'])
    max_angle = int(main_info['max_angle'])
    max_velocity = int(main_info['max_velocity'])
    board_width = int(main_info['board_width'])
    board_height = int(main_info['board_height'])
    exploration_const = float(get_config("TreeAgentInfo")['exploration_const'])
    return (max_hit_points, max_angle, max_velocity, board_width, board_height,
            exploration_const)
def create_networks(device, agent_type, possible_actions):
    """Create the policy and target networks for a DQN-style agent.

    :param device: device both networks are moved to
    :param agent_type: ``'light'`` gets a wider network and a doubled input;
        any other value is treated as the zombie agent
    :param possible_actions: sized collection of actions; its length is the
        number of network outputs
    :return: ``(num_actions, target_net, policy_net)``
    """
    main_info = get_config('MainInfo')
    h = int(main_info['board_height'])
    w = int(main_info['board_width'])
    cells = h * w

    if agent_type == 'light':
        neurons_number = cells
        input_size = 2 * cells  # the light agents get extra information
    else:
        # Floor division keeps the layer width an int; true division produced
        # a float (e.g. 12.5) for the zombie agent.
        neurons_number = cells // 2
        input_size = cells
    num_actions = len(possible_actions)

    # create networks
    target_net = DQN(input_size, num_actions, neurons_number).to(device)
    policy_net = DQN(input_size, num_actions, neurons_number).to(device)

    # the target network starts with the same weights and stays in eval mode
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    return num_actions, target_net, policy_net
def __init__(self, device, agent_zombie, agent_light):
    """Set up the board, the optional pygame display and both agents.

    :param device: device forwarded to both agents and stored on the game
    :param agent_zombie: callable invoked as ``agent_zombie(device, 'zombie')``
    :param agent_light: callable invoked as ``agent_light(device, 'light')``
    :raises SystemExit: when fewer than two start positions fit the
        configured angle
    """
    # refresh the class-level settings from the config
    (Game.MAX_HIT_POINTS, Game.MAX_ANGLE, Game.MAX_VELOCITY,
     Game.BOARD_WIDTH, Game.BOARD_HEIGHT) = update_variables()
    main_info = get_config("MainInfo")

    self.grid = GameGrid()
    self.light_size = int(main_info['light_size'])
    self.max_angle = int(main_info['max_angle'])
    self.start_positions = self.calculate_start_positions()
    if len(self.start_positions) < 2:
        # A string argument is printed to stderr and yields exit status 1;
        # print() followed by a bare sys.exit() reported success (status 0).
        sys.exit("The angle is too wide!")

    # set interactive mode
    self.interactive_mode = main_info.getboolean('interactive_mode')
    if self.interactive_mode:
        pygame.init()
        pygame.display.set_caption('pickleking')
        self.display_width = int(main_info['display_width'])
        self.display_height = int(main_info['display_height'])
        self.game_display = pygame.display.set_mode(
            (self.display_width, self.display_height))
        self.zombie_image, self.light_image, self.grid_image = self.set_up()
        self.clock = pygame.time.Clock()
    else:
        # not really necessary, here to make sure nothing will pop-up
        os.environ["SDL_VIDEODRIVER"] = "dummy"

    # set our agents
    self.agent_zombie = agent_zombie(device, 'zombie')
    self.agent_light = agent_light(device, 'light')

    # load main info
    self.steps_per_episodes = (int(main_info['zombies_per_episode'])
                               + int(main_info['board_width']) - 1)
    self.zombies_per_episode = int(main_info['zombies_per_episode'])
    self.check_point = int(main_info['check_point'])
    self.total_episodes = (int(main_info['num_train_episodes'])
                           + int(main_info['num_test_episodes']))

    # other fields
    self.max_hit_points = Game.MAX_HIT_POINTS
    self.current_time = 0
    self.alive_zombies = []  # list of the currently alive zombies
    self.all_zombies = []  # list of all zombies (from all time)
    self.max_velocity = int(main_info['max_velocity'])
    self.dt = int(main_info['dt'])
    self.device = device
    self.current_screen = None
    self.done = False
    self.writer = SummaryWriter(log_dir='../runs')
class Agent:
    """Common interface of the game's agents.

    NOTE: the class does not use ``abc.ABCMeta``, so the ``@abstractmethod``
    markers below are documentation only - nothing prevents instantiation.
    Subclasses are expected to override every method.
    """

    BOARD_WIDTH = int(get_config("MainInfo")['board_width'])

    def __init__(self, strategy, agent_type):
        """Store the exploration strategy and the agent's role.

        :param strategy: exploration strategy object
        :param agent_type: role string of the agent
        """
        self.agent_type = agent_type
        self.strategy = strategy

    @abstractmethod
    def select_action(self, state, alive_zombies=None, writer=None):
        """Choose the next action for ``state``.

        ``alive_zombies`` and ``writer`` are optional so the declared
        interface matches the game loop, which passes both to every agent.
        """
        raise NotImplementedError

    @abstractmethod
    def learn(self, state, action, next_state, reward, writer=None):
        """Update the agent from one transition.

        ``writer`` is optional so the declared interface matches the game
        loop, which passes it to every agent.
        """
        raise NotImplementedError

    def reset(self):
        """Reset per-episode state; must be overridden."""
        raise NotImplementedError
class Zombie:
    """A single zombie crossing the board from the left edge.

    The class-level fields below mirror the ``MainInfo`` config section and
    are shared by all zombies; ``update_variables`` re-reads them.
    """

    @staticmethod
    def update_variables():
        """Refresh every config-backed static field and the start positions."""
        # fetch the section once instead of once per key
        main_info = get_config("MainInfo")
        Zombie.BOARD_HEIGHT = int(main_info['board_height'])
        Zombie.BOARD_WIDTH = int(main_info['board_width'])
        Zombie.LIGHT_SIZE = int(main_info['light_size'])
        # refreshed with the other fields so it cannot go stale
        Zombie.MAX_HIT_POINT = int(main_info['max_hit_points'])
        Zombie.DT = int(main_info['dt'])
        Zombie.ANGLE = float(main_info['max_angle'])
        Zombie.START_POSITIONS = calculate_start_positions(
            Zombie.BOARD_WIDTH, Zombie.BOARD_HEIGHT, Zombie.ANGLE)

    # static field
    ZOMBIE_NUM = 1
    BOARD_HEIGHT = int(get_config("MainInfo")['board_height'])
    BOARD_WIDTH = int(get_config("MainInfo")['board_width'])
    LIGHT_SIZE = int(get_config("MainInfo")['light_size'])
    MAX_HIT_POINT = int(get_config("MainInfo")['max_hit_points'])
    DT = int(get_config("MainInfo")['dt'])
    ANGLE = float(get_config("MainInfo")['max_angle'])
    START_POSITIONS = calculate_start_positions(BOARD_WIDTH, BOARD_HEIGHT, ANGLE)

    def __init__(self, angle, velocity, state):
        """
        :param angle: float, radians - direction of movement
        :param velocity: float, unit/sec
        :param state: index into ``Zombie.START_POSITIONS`` selecting the
            start position of the zombie
        """
        self.id = Zombie.set_id()
        self.angle = angle
        self.velocity = velocity
        # number of light hits taken so far, capped at MAX_HIT_POINT
        self.hit_points = 0
        # x,y are the real coordinates of the zombie
        self.x = 0  # every zombie starts at the left side
        self.v_x = self.velocity * np.cos(self.angle)
        # start positions are divided by the board width to obtain the y
        # coordinate
        self.y = Zombie.START_POSITIONS[state] / Zombie.BOARD_WIDTH
        self.v_y = self.velocity * np.sin(self.angle)
        self.current_state = state
        self.heal_epsilon = HEAL_EPSILON
        self.just_born = True

    @staticmethod
    def set_id():
        """Return the next free zombie id and advance the counter."""
        new_zombie_id = Zombie.ZOMBIE_NUM
        Zombie.ZOMBIE_NUM += 1
        return new_zombie_id

    @staticmethod
    def reset_id():
        """Restart the zombie id counter at 1."""
        Zombie.ZOMBIE_NUM = 1

    def update_hit_points(self, light_action):
        """Add one hit point if the zombie stands inside the light square.

        :param light_action: flat board index of the light's top-left cell
            (``x = action mod BOARD_WIDTH``, ``y = action / BOARD_WIDTH``)
        """
        light_x = int(np.mod(light_action, Zombie.BOARD_WIDTH))
        light_y = int(light_action / Zombie.BOARD_WIDTH)
        # include only the start (the end is outside the light)
        inside_x = light_x <= self.x < (light_x + Zombie.LIGHT_SIZE)
        inside_y = light_y <= self.y < (light_y + Zombie.LIGHT_SIZE)
        if inside_x and inside_y:
            # in a case of a hit, increase the zombie's hit points by 1
            if self.hit_points < self.MAX_HIT_POINT:
                self.hit_points += 1
        # NOTE: healing by (1 - heal_epsilon) when outside the light is
        # currently disabled.

    def move(self, light_action):
        """
        1. update current pos of zombie by its angle and velocity
           (skipped on the very first call after creation)
        2. punish the zombie by the position of the light

        :param light_action: flat board index of the light, see
            ``update_hit_points``
        """
        if self.just_born:
            # A newborn zombie is not moved on its first turn; it is only
            # punished in place.
            # TODO - checking if it is necessary
            self.just_born = False
        else:
            # next step, move forward
            self.x += self.v_x * Zombie.DT
            self.y += self.v_y * Zombie.DT
            self.current_state = self.x + self.y * Zombie.BOARD_WIDTH
        # hit the zombie
        self.update_hit_points(light_action)
class Game:
    """Zero-sum "zombie master vs. light" environment and its training loop.

    The class-level fields are import-time defaults read from the
    ``MainInfo`` config section; ``__init__`` refreshes them.
    """

    MAX_HIT_POINTS = int(get_config("MainInfo")['max_hit_points'])
    MAX_ANGLE = int(get_config("MainInfo")['max_angle']) * math.pi / 180
    MAX_VELOCITY = int(get_config("MainInfo")['max_velocity'])
    BOARD_WIDTH = int(get_config("MainInfo")['board_width'])
    BOARD_HEIGHT = int(get_config("MainInfo")['board_height'])

    def __init__(self, device, agent_zombie, agent_light):
        """Set up the board, the optional pygame display and both agents.

        :param device: device forwarded to both agents and stored on the game
        :param agent_zombie: callable invoked as
            ``agent_zombie(device, 'zombie')``
        :param agent_light: callable invoked as ``agent_light(device, 'light')``
        :raises SystemExit: when fewer than two start positions fit the
            configured angle
        """
        (Game.MAX_HIT_POINTS, Game.MAX_ANGLE, Game.MAX_VELOCITY,
         Game.BOARD_WIDTH, Game.BOARD_HEIGHT) = update_variables()
        main_info = get_config("MainInfo")

        self.grid = GameGrid()
        self.light_size = int(main_info['light_size'])
        self.max_angle = int(main_info['max_angle'])
        self.start_positions = self.calculate_start_positions()
        if len(self.start_positions) < 2:
            # A string argument is printed to stderr and yields exit status 1;
            # print() followed by a bare sys.exit() reported success.
            sys.exit("The angle is too wide!")

        # set interactive mode
        self.interactive_mode = main_info.getboolean('interactive_mode')
        if self.interactive_mode:
            pygame.init()
            pygame.display.set_caption('pickleking')
            self.display_width = int(main_info['display_width'])
            self.display_height = int(main_info['display_height'])
            self.game_display = pygame.display.set_mode(
                (self.display_width, self.display_height))
            self.zombie_image, self.light_image, self.grid_image = self.set_up()
            self.clock = pygame.time.Clock()
        else:
            # not really necessary, here to make sure nothing will pop-up
            os.environ["SDL_VIDEODRIVER"] = "dummy"

        # set our agents
        self.agent_zombie = agent_zombie(device, 'zombie')
        self.agent_light = agent_light(device, 'light')

        # load main info
        self.steps_per_episodes = (int(main_info['zombies_per_episode'])
                                   + int(main_info['board_width']) - 1)
        self.zombies_per_episode = int(main_info['zombies_per_episode'])
        self.check_point = int(main_info['check_point'])
        self.total_episodes = (int(main_info['num_train_episodes'])
                               + int(main_info['num_test_episodes']))

        # other fields
        self.max_hit_points = Game.MAX_HIT_POINTS
        self.current_time = 0
        self.alive_zombies = []  # list of the currently alive zombies
        self.all_zombies = []  # list of all zombies (from all time)
        self.max_velocity = int(main_info['max_velocity'])
        self.dt = int(main_info['dt'])
        self.device = device
        self.current_screen = None
        self.done = False
        self.writer = SummaryWriter(log_dir='../runs')

    def calculate_start_positions(self):
        """Return the flat board indices (row * width) zombies may start from.

        The rows are limited by ``self.max_angle`` (degrees) and the grid
        size; the result is an array of row numbers multiplied by the grid
        width.
        """
        zombie_home_length = int(
            self.grid.get_height()
            - 2 * self.grid.get_width() * math.tan(self.max_angle * math.pi / 180))
        zombie_home_start_pos = int(
            self.grid.get_height() - zombie_home_length
            - self.grid.get_width() * math.tan(self.max_angle * math.pi / 180))  # m-n-b
        rows = list(range(zombie_home_start_pos,
                          zombie_home_start_pos + zombie_home_length))
        return np.multiply(rows, self.grid.get_width())

    def reset(self):
        """Start a new episode: clear time, zombies, screen and both agents."""
        self.current_time = 0
        Zombie.reset_id()
        self.alive_zombies = []  # list of the currently alive zombies
        self.all_zombies = []  # list of all zombies (from all time)
        self.current_screen = None
        self.agent_light.reset()
        self.agent_zombie.reset()

    def play_zero_sum_game(self, path):
        """Run all train+test episodes and collect statistics.

        :param path: forwarded to ``plot_progress``
        :return: ``(episodes_dict, steps_dict_light, steps_dict_zombie)`` with
            per-episode rewards/durations and per-step epsilon/action/step
        """
        episodes_dict = {'episode_rewards': [], 'episode_durations': []}
        steps_dict_light = {'epsilon': [], 'action': [], 'step': []}
        steps_dict_zombie = {'epsilon': [], 'action': [], 'step': []}

        for episode in range(self.total_episodes):
            self.reset()
            state_zombie, state_light = self.get_state()
            zombie_master_reward = 0
            episode_start_time = time.time()
            for time_step in count():
                # Keep each agent's exploration rate in its own variable; a
                # shared name made both dicts record the light agent's rate.
                action_zombie, rate_zombie, _ = self.agent_zombie.select_action(
                    state_zombie, self.alive_zombies, self.writer)
                action_light, rate_light, _ = self.agent_light.select_action(
                    state_light, self.alive_zombies, self.writer)

                # update dict
                steps_dict_light['epsilon'].append(rate_light)
                steps_dict_light['action'].append(
                    int(action_light // self.grid.get_width()))
                steps_dict_light['step'].append(time_step)
                steps_dict_zombie['epsilon'].append(rate_zombie)
                steps_dict_zombie['action'].append(int(action_zombie))
                steps_dict_zombie['step'].append(time_step)

                reward = self.apply_actions(action_zombie, action_light)
                if reward > 0:
                    zombie_master_reward += reward
                next_state_zombie, next_state_light = self.get_state()

                self.agent_zombie.learn(
                    state_zombie.unsqueeze(0), action_zombie,
                    next_state_zombie.unsqueeze(0), reward, self.writer)
                # agent_light gets the opposite reward
                self.agent_light.learn(
                    state_light.unsqueeze(0), action_light,
                    next_state_light.unsqueeze(0), reward * -1, self.writer)

                state_zombie, state_light = next_state_zombie, next_state_light

                if self.done:
                    # the episode is done: store its reward and duration
                    episodes_dict['episode_rewards'].append(zombie_master_reward)
                    episodes_dict['episode_durations'].append(
                        time.time() - episode_start_time)
                    break

            # plotting the moving average
            if episode % self.check_point == 0:
                plot_progress(path, episodes_dict, self.check_point)

        plot_progress(path, episodes_dict, self.check_point)
        return episodes_dict, steps_dict_light, steps_dict_zombie

    def action_space(self):
        """Return ``(light_action_space, zombie_action_space)`` sizes."""
        light_action_space = self.grid.get_height() * self.grid.get_width()
        zombie_action_space = len(self.start_positions)
        return light_action_space, zombie_action_space

    def apply_actions(self, zombie_action, light_action):
        """Advance the game by one step.

        Spawns a new zombie (while the per-episode quota is not reached),
        moves every alive zombie, applies the light and updates ``self.done``.

        :param zombie_action: start position handed to ``Game.create_zombie``
        :param light_action: flat board index where the light is placed
        :return: reward of this step from the zombie master's point of view
        """
        self.current_time += 1
        # update display in case of interactive mode
        if self.interactive_mode:
            self.update(light_action)

        # add new zombie
        if len(self.all_zombies) < self.zombies_per_episode:
            new_zombie = Game.create_zombie(zombie_action)
            self.alive_zombies.append(new_zombie)
            self.all_zombies.append(new_zombie)

        # move all zombies one step and calc reward
        reward, self.alive_zombies = Game.calc_reward_and_move_zombies(
            self.alive_zombies, light_action)
        # TODO - maybe pick another terminal condition of the game
        self.done = self.current_time > self.steps_per_episodes
        return reward

    @staticmethod
    def calc_reward_and_move_zombies(alive_zombies, light_action):
        """Move all zombies and aggregate the reward of this step.

        Works on a deep copy, so the zombies passed in are never mutated.

        :param alive_zombies: zombies currently on the board
        :param light_action: flat board index of the light
        :return: ``(reward, zombies still on the board)``
        """
        moved_zombies = list(copy.deepcopy(alive_zombies))
        reward = 0
        survivors = []
        for zombie in moved_zombies:
            zombie.move(light_action)
            if 0 >= zombie.y or zombie.y >= Game.BOARD_HEIGHT:
                # left the board through the top/bottom: removed, no reward
                # NOTE(review): a zombie at exactly y == 0 is removed as well
                # - confirm this is intended.
                continue
            if zombie.x >= Game.BOARD_WIDTH:
                # reached the right border: reward the zombie master if the
                # zombie is kept alive, punish otherwise, then remove it
                if Game.keep_alive(zombie.hit_points):
                    reward += 1
                else:
                    reward -= 1
                continue
            survivors.append(zombie)
        return reward, survivors

    @staticmethod
    def keep_alive(h):
        """Decide whether a zombie with ``h`` hit points survives.

        Returns ``False`` once ``h`` reaches ``Game.MAX_HIT_POINTS`` and
        ``True`` otherwise. Earlier probabilistic rules are disabled:
        ``np.power(h / Game.MAX_HIT_POINTS, 1 / 3) < random.random()`` and,
        before that, ``sin(h * pi / 2 * max_hit_points) < random.random()``.
        """
        if h >= Game.MAX_HIT_POINTS:
            # the zombie sustained a lot of damage
            return False
        return True

    def get_state(self):
        """Return ``(zombie_state, light_state)`` as flat float32 tensors.

        The zombie state marks occupied cells with 1; the light state is that
        grid concatenated with a grid holding each zombie's hit points.
        """
        zombie_grid = self.grid.get_values()
        zombie_grid = zombie_grid.astype(np.float32)
        zombie_grid.fill(0)
        health_grid = copy.deepcopy(zombie_grid)
        for zombie in self.alive_zombies:
            row, col = int(zombie.y), int(zombie.x)
            zombie_grid[row, col] = 1
            health_grid[row, col] = zombie.hit_points
        light_view = np.concatenate((zombie_grid, health_grid))
        return (torch.from_numpy(zombie_grid).flatten(),
                torch.from_numpy(light_view).flatten())

    def get_pygame_window(self):
        """Return the current pygame display as a 3-D pixel array."""
        return pygame.surfarray.array3d(pygame.display.get_surface())

    @staticmethod
    def create_zombie(position):
        """Create a zombie at ``position`` moving at ``Game.MAX_VELOCITY``.

        The angle is drawn uniformly from ``[-MAX_ANGLE, MAX_ANGLE]``; when
        ``MAX_ANGLE`` is 0 no random number is drawn.
        """
        if Game.MAX_ANGLE == 0:
            angle = Game.MAX_ANGLE
        else:
            angle = random.uniform(-Game.MAX_ANGLE, Game.MAX_ANGLE)
        return Zombie(angle, Game.MAX_VELOCITY, position)

    def set_up(self):
        """Prepare the sprites and the grid image for interactive mode.

        :return: ``(zombie_image, light_image, grid_image)`` as pygame images
        """
        # create the gameUtils directory if doesn't exist
        path = os.path.join(
            os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)),
            "gameUtils")
        if not os.path.exists(path):
            os.mkdir(path)
            # The mode must be an octal literal; decimal 777 is 0o1411, which
            # sets unrelated permission bits.
            os.chmod(path, 0o777)

        # get images
        zombie_image = Image.open(os.path.join(path, 'zombie.png'))
        light_image = Image.open(os.path.join(path, 'light.png'))

        # resize (light_image is scaled by light_size in both directions)
        cell_width = int(self.display_width / self.grid.get_width())
        cell_height = int(self.display_height / self.grid.get_height())
        zombie_image = zombie_image.resize((cell_width, cell_height), 0)
        light_image = light_image.resize(
            (cell_width * self.light_size, cell_height * self.light_size), 0)

        # save
        zombie_path = os.path.join(path, 'zombie_image.png')
        light_path = os.path.join(path, 'light_image.png')
        zombie_image.save(zombie_path)
        light_image.save(light_path)

        # draw and save the grid
        self.draw_grid()

        # Return the images in the pygame format. Load the exact file names
        # written above: the previous '.PNG' spelling fails on case-sensitive
        # filesystems.
        return (pygame.image.load(zombie_path),
                pygame.image.load(light_path),
                pygame.image.load(os.path.join(path, 'grid.jpeg')))

    def update(self, light_action):
        """Redraw the board, the light and all alive zombies."""
        # drain the event queue; the events themselves are not used
        pygame.event.get()
        self.game_display.blit(self.grid_image, (0, 0))
        x_adjustment = int(self.display_width / self.grid.get_width())
        y_adjustment = int(self.display_height / self.grid.get_height())
        self.game_display.blit(
            self.light_image,
            (int(np.mod(light_action, self.grid.get_width()) * x_adjustment),
             int(light_action / self.grid.get_width()) * y_adjustment))
        for z in self.alive_zombies:
            self.game_display.blit(self.zombie_image,
                                   (z.x * x_adjustment, z.y * y_adjustment))
        # better than pygame.display.flip because it can update by param,
        # and not the whole window
        pygame.display.update()
        self.clock.tick(30)  # the number of frames per second

    def draw_grid(self):
        """Draw the grid and the start line, then save them as ``grid.jpeg``."""
        x_size = self.display_width / self.grid.get_width()  # x size of the grid block
        y_size = self.display_height / self.grid.get_height()  # y size of the grid block
        # One rectangle per grid cell. Iterating over every display pixel, as
        # before, only added rectangles positioned beyond the last cell.
        for x in range(int(self.grid.get_width())):
            for y in range(int(self.grid.get_height())):
                rect = pygame.Rect(x * x_size, y * y_size, x_size, y_size)
                pygame.draw.rect(self.game_display, (255, 255, 255), rect, 1)

        # draw the start line
        y_adjustment = int(self.display_height / self.grid.get_height())
        pygame.draw.rect(self.game_display, (0, 200, 50), [
            0,
            int((min(self.start_positions))) / self.grid.get_width() * y_adjustment,
            10,
            int((max(self.start_positions) + np.diff(self.start_positions)[0]
                 - min(self.start_positions))) / self.grid.get_width() * y_adjustment
        ])
        path = os.path.join(
            os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)),
            "gameUtils")
        pygame.image.save(self.game_display, os.path.join(path, 'grid.jpeg'))

    def end_game(self):
        """Shut pygame down and terminate the process."""
        pygame.quit()
        # sys.exit() instead of quit(): the latter is injected by the `site`
        # module and is not guaranteed to exist.
        sys.exit()

    def just_starting(self):
        """Tell whether no screen has been captured yet in this episode."""
        # current screen is set to none in the beginning and in the end of an episode
        return self.current_screen is None

    def get_state_old(self):
        """Screen-based state: difference between two consecutive screens.

        Returns an all-zero screen at the start or end of an episode.
        """
        if self.just_starting() or self.done:
            self.current_screen = self.get_processed_screen()
            black_screen = torch.zeros_like(self.current_screen)
            return black_screen
        s1 = self.current_screen
        s2 = self.get_processed_screen()
        self.current_screen = s2
        return s2 - s1

    def get_processed_screen(self):
        """Capture the pygame window and convert it to a network input."""
        screen = self.get_pygame_window().transpose((2, 0, 1))  # PyTorch expects CHW
        screen = self.crop_screen(screen)
        return self.transform_screen_data(screen)

    def crop_screen(self, screen):
        """Crop along axis 1; with the current factors nothing is removed."""
        screen_height = screen.shape[1]
        # Strip off top and bottom
        top = int(screen_height * 0)
        bottom = int(screen_height * 1)
        screen = screen[:, top:bottom, :]
        return screen

    def transform_screen_data(self, screen):
        """Rescale to [0, 1], resize to 60x30 and add a batch dimension."""
        # Convert to float, rescale, convert to tensor
        screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
        screen = torch.from_numpy(screen)
        # Use torchvision package to compose image transforms
        resize = T.Compose([T.ToPILImage(), T.Resize((60, 30)), T.ToTensor()])
        # add a batch dimension (BCHW)
        return resize(screen).unsqueeze(0).to(self.device)
def update_variables():
    """Read the board dimensions from the ``MainInfo`` config section.

    Returns:
        tuple: ``(BOARD_WIDTH, BOARD_HEIGHT)`` as ``int``.
    """
    # Fetch the section once instead of once per key.
    main_info = get_config("MainInfo")
    board_width = int(main_info['board_width'])
    board_height = int(main_info['board_height'])
    return board_width, board_height
class BasicMCTSAgent(Agent):
    """Monte-Carlo tree search agent (selection, expansion, simulation,
    back-propagation) with simulations run in a process pool.

    The class-level fields are import-time defaults read from the config;
    ``__init__`` refreshes them.
    """

    MAX_HIT_POINTS = int(get_config("MainInfo")['max_hit_points'])
    MAX_ANGLE = int(get_config("MainInfo")['max_angle'])
    MAX_VELOCITY = int(get_config("MainInfo")['max_velocity'])
    BOARD_WIDTH = int(get_config("MainInfo")['board_width'])
    BOARD_HEIGHT = int(get_config("MainInfo")['board_height'])
    C = float(get_config("TreeAgentInfo")['exploration_const'])

    def __init__(self, device, agent_type):
        """Create the agent with an empty search tree and a worker pool.

        :param device: accepted for interface parity with the other agents;
            it is not used by this constructor
        :param agent_type: ``'zombie'`` or ``'light'``; decides the action space
        """
        (BasicMCTSAgent.MAX_HIT_POINTS, BasicMCTSAgent.MAX_ANGLE,
         BasicMCTSAgent.MAX_VELOCITY, BasicMCTSAgent.BOARD_WIDTH,
         BasicMCTSAgent.BOARD_HEIGHT, BasicMCTSAgent.C) = update_variables()
        super().__init__(EpsilonGreedyStrategy(), agent_type)

        # Size the action space from the values refreshed just above rather
        # than from Game's class attributes, which are only refreshed when a
        # Game is constructed. Zombie agent: one action per row; light agent:
        # one action per cell.
        if self.agent_type == 'zombie':
            num_actions = BasicMCTSAgent.BOARD_HEIGHT
        else:
            num_actions = BasicMCTSAgent.BOARD_HEIGHT * BasicMCTSAgent.BOARD_WIDTH
        self.possible_actions = list(range(num_actions))

        self.root = Node([], self.possible_actions)
        # TODO - change its name to something like: real world state-node
        self.temporary_root = self.root
        self.current_step = 0
        self.simulation_reward = 0

        # fetch the section once instead of once per key
        tree_info = get_config("TreeAgentInfo")
        # number of simulations in the simulation phase
        self.simulation_num = int(tree_info['simulation_num'])
        # number of times to expand a node in a single simulation
        self.simulation_depth = int(tree_info['simulation_depth'])

        self.episode_reward = 0
        self.tree_depth = 0
        self.pool = mp.Pool(mp.cpu_count())

        main_info = get_config('MainInfo')
        self.steps_per_episodes = (int(main_info['zombies_per_episode'])
                                   + int(main_info['board_width']))
        self.total_episodes = (int(main_info['num_train_episodes'])
                               + int(main_info['num_test_episodes']))

    def select_action(self, state, alive_zombies, writer=None):
        """Run one MCTS iteration and advance the real-world node.

        ``state``, ``alive_zombies`` and ``writer`` are not used; ``writer``
        is accepted (optionally) because the game loop passes it to every
        agent.

        :return: ``(action, exploration_rate, current_step)``
        """
        rate = self.strategy.get_exploration_rate(current_step=self.current_step)
        self.current_step += 1

        # selection phase
        selected_child = self.selection()
        assert selected_child.num_children == 0 or selected_child.is_terminal

        # Expansion phase: choose the node from which the play-out starts.
        # The node expanded here is NOT necessarily the temporary root; the
        # expansion is unrelated to the real action taken below.
        if selected_child == self.root:
            # the selected child is the root: expand all its children
            expanded_child = self.expansion_all_children(selected_child)
        elif (selected_child.parent is not None
              and selected_child.parent.num_children != len(self.possible_actions)):
            # the selected child is missing a brother (it was reached through
            # a real action): expand all its brothers and choose one
            expanded_child = self.expansion_all_children(selected_child.parent)
        elif selected_child.visits == 0:
            # never visited: start the roll-out from there
            expanded_child = selected_child
        else:
            # a leaf that was already visited
            expanded_child = self.expansion_all_children(selected_child)
        assert expanded_child.num_children == 0
        assert (selected_child.num_children == 0
                or selected_child.num_children == len(self.possible_actions))

        # simulation phase
        self.simulation(expanded_child)

        # select next action
        action = self.select_expansion_action(self.temporary_root, self.possible_actions)
        self.expansion_all_children(self.temporary_root)
        self.temporary_root = self.temporary_root.children[action]
        assert (self.temporary_root.num_children == len(self.possible_actions)
                or self.temporary_root.num_children == 0)

        # when the game ends - close the pool to avoid memory explosion
        if self.current_step == self.total_episodes * self.steps_per_episodes:
            self.pool.close()
            self.pool.join()

        return action, rate, self.current_step

    def learn(self, _, action, __, reward, writer=None):
        """No-op: back-propagation happens inside ``simulation``.

        ``writer`` is accepted (optionally) because the game loop passes it
        to every agent.
        """
        pass

    def selection(self):
        """
        The selection phase of the MCTS algorithm: descend from the temporary
        root with ``select_child`` until a childless or terminal node is hit.
        :return: the selected node
        """
        selected_child = self.temporary_root
        has_child = selected_child.num_children > 0
        while has_child:
            # selecting the best child unless there is an unvisited child in
            # the way - the select_child method is required!
            selected_child = self.select_child(selected_child)
            if selected_child.num_children == 0 or selected_child.is_terminal:
                has_child = False
        return selected_child

    def select_child(self, node: Node) -> Node:
        """
        Given a node, selects a random unvisited child node, or, if all
        children are visited, the child with the greatest UCT value.

        @note: the selection must start here - once a child is expanded all
        its brothers are expanded too, and the next turn may need to simulate
        from one of the brothers instead of always picking the child that
        'select_best_child' would return.
        :param node: node from which to select a child
        :return: the selected child (``node`` itself when it has no children)
        """
        if node.num_children == 0:
            return node

        assert node.num_children == 0 or node.num_children == len(self.possible_actions)
        # children that never rolled out (no simulation started from them)
        not_visited_actions = [
            action for action, child in node.children.items() if child.visits == 0
        ]
        if not_visited_actions:
            return node.children[random.choice(not_visited_actions)]

        return BasicMCTSAgent.select_best_child(node)

    @staticmethod
    def select_best_child(node):
        """
        Selects the best child of a node; ties are broken at random.
        :param node: node to select one of its children
        :return: highest UCT valued child (``node`` itself when childless)
        """
        if node.num_children == 0:
            return node

        max_weight = 0.0
        possible_children = []
        for child in filter(None, node.children.values()):
            weight = child.uct
            if not possible_children or weight > max_weight:
                possible_children = [child]
                max_weight = weight
            elif weight == max_weight:
                possible_children.append(child)

        if possible_children:
            return random.choice(possible_children)
        return node

    def expansion_all_children(self, leaf):
        """Expand every action of ``leaf`` and return one child at random."""
        self.eval_children(leaf, self.possible_actions)
        return random.choice(list(leaf.children.values()))

    def expansion_one_child(self, leaf):
        """Expand a single action of ``leaf`` and return that action."""
        action = self.select_expansion_action(leaf, self.possible_actions)
        self.eval_children(leaf, [action])
        return action

    def eval_children(self, node, actions):
        """
        Creates the children of a childless node, one per action; a node that
        already has children is left untouched.
        :param node: node from which to evaluate children
        :param actions: list of actions to create children for
        :return: the children mapping of ``node``
        """
        assert node.num_children == len(self.possible_actions) or node.num_children == 0
        if node.num_children == 0:
            for action in actions:
                _, alive_zombies = BasicMCTSAgent.simulate_action(
                    node.state, self.agent_type, action)
                node.add_child(alive_zombies, action)
        return node.children

    def select_expansion_action(self, node, possible_actions):
        """
        Selects the action leading to the best child of ``node``; a random
        action from ``self.possible_actions`` when ``node`` has no children.
        :param node: the selected node to expand a child from
        :param possible_actions: not used; ``self.possible_actions`` is sampled
        :return: the selected action
        """
        selected_child = self.select_best_child(node)
        assert selected_child is not None

        selected_action = None
        if selected_child == node:
            selected_action = random.choice(self.possible_actions)
        else:
            for key, value in node.children.items():
                if value == selected_child:
                    selected_action = key
        assert selected_action is not None
        return selected_action

    @staticmethod
    def select_simulation_action(alive_zombies, possible_actions):
        """Return a random action from ``possible_actions``."""
        return random.choice(possible_actions)

    def simulation(self, selected_child):
        """
        Runs ``simulation_num`` play-outs from ``selected_child`` in the
        process pool and back-propagates their average reward (negated for
        the light agent) up to the root.
        :param selected_child: node from which to perform the simulation
        """
        simulation_state = selected_child.state
        list_of_objects = [
            CostlySimulation(self.simulation_depth, simulation_state,
                             self.possible_actions, self.agent_type)
            for _ in range(self.simulation_num)
        ]

        list_of_results = self.pool.map(
            BasicMCTSAgent.worker,
            ((obj, BasicMCTSAgent.BOARD_HEIGHT, BasicMCTSAgent.BOARD_WIDTH)
             for obj in list_of_objects))
        assert np.max(list_of_results) <= self.simulation_depth

        average_total_reward = np.average(list_of_results)
        if self.agent_type != 'zombie':
            average_total_reward = -1 * average_total_reward

        # back-prop from the expanded child (the child of the selected node)
        BasicMCTSAgent.back_propagation(selected_child, average_total_reward, self.root)

    @staticmethod
    def worker(arg):
        """Pool entry point: ``arg`` is ``(simulation, board_height, board_width)``."""
        return arg[0].costly_simulation(arg[1], arg[2])

    @staticmethod
    def simulate_action(alive_zombies, agent_type, action):
        """
        Simulates one game step for a single action without touching the
        real world (the zombies are deep-copied).
        :param alive_zombies: all alive zombies at the state to simulate from
        :param agent_type: 'zombie' or 'light' agent
        :param action: the action of this agent to simulate
        :return: ``(reward of the step, zombies alive after the step)``
        """
        new_alive_zombies = list(copy.deepcopy(alive_zombies))

        # set this agent's action and pick the opponent's action
        if agent_type == 'zombie':
            zombie_action = action
            # the opposing light is fixed at cell 0 (random sampling disabled)
            light_action = 0
        else:
            light_action = action
            # sample the opposing zombie's start row at random
            zombie_action = np.random.randint(0, BasicMCTSAgent.BOARD_HEIGHT)

        # simulate and aggregate reward
        total_reward = 0
        new_zombie = Game.create_zombie(zombie_action)
        new_alive_zombies.append(new_zombie)
        reward, final_alive_zombies = Game.calc_reward_and_move_zombies(
            new_alive_zombies, light_action)
        total_reward += reward

        return total_reward, final_alive_zombies

    @staticmethod
    def back_propagation(node, result, root):
        """Add ``result`` to ``node`` and to each ancestor up to ``root``'s level."""
        current_node = node
        BasicMCTSAgent.eval_utc(current_node, result)
        # keep updating until the desired root
        while current_node.level != root.level:
            current_node = current_node.parent
            BasicMCTSAgent.eval_utc(current_node, result)

    @staticmethod
    def eval_utc(node, result):
        """Record one visit with ``result`` and recompute the node's UCT value."""
        node.wins += result
        node.visits += 1
        node.uct = node.wins / node.visits + BasicMCTSAgent.evaluate_exploration(node)

    @staticmethod
    def evaluate_exploration(node):
        """Return the exploration term ``C * sqrt(ln(t) / n)``.

        ``n`` is the node's visit count and ``t`` its parent's (its own for a
        parentless node).
        """
        n = node.visits
        t = node.visits if node.parent is None else node.parent.visits
        # avoid log of 0 with: 't or 1'
        return BasicMCTSAgent.C * np.sqrt(np.log(t or 1) / n)

    @staticmethod
    def has_parent(node):
        """Tell whether ``node`` has a parent."""
        return node.parent is not None

    def reset(self):
        """Move the real-world node back to the root; the tree is kept."""
        self.temporary_root = self.root

    def print_tree(self):
        """
        Prints the tree to ``Tree.txt`` in the parent directory of this file.
        """
        tree_path = os.path.join(
            os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)),
            'Tree.txt')
        # the context manager closes the file even if printing a node fails
        with open(tree_path, 'w') as f:
            self.print_node(f, self.root, "")

    def print_node(self, file, node, indent):
        """
        Prints the tree node, its details and (recursively) its children.
        :param file: file to write into
        :param node: node to print
        :param indent: indent prefix of this node
        """
        file.write(indent)
        file.write("|-")
        indent += "| "

        string = str(node.level) + " ("
        string += "W: " + str(node.wins) + ", N: " + str(node.visits) + ", UCT: " + str(node.uct) + ") \n"
        file.write(string)

        for child in filter(None, list(node.children.values())):
            self.print_node(file, child, indent)