class FractalMC(Swarm):

    def __init__(self, env, model, n_walkers: int = 100, balance: float = 1.,
                 reward_limit: float = None, samples_limit: int = None,
                 render_every: int = 1e10, accumulate_rewards: bool = True,
                 dt_mean: float = None, dt_std: float = None, min_dt: int = 1,
                 custom_reward: Callable = None, custom_end: Callable = None,
                 process_obs: Callable = None, custom_skipframe: Callable = None,
                 keep_best: bool = False, can_win: bool = False,
                 skip_initial_frames: int = 0, max_samples_step: int = None,
                 time_horizon: int = 40, min_horizon: int = 1,
                 update_parameters: bool = False):
        """
        :param env: Environment that will be sampled.
        :param model: Model used for sampling actions from observations.
        :param n_walkers: Number of walkers that the swarm will use.
        :param balance: Balance coefficient for the virtual reward formula.
        :param reward_limit: Maximum reward that can be reached before stopping the swarm.
        :param samples_limit: Maximum number of times the Swarm can sample the environment
         before stopping.
        :param render_every: Number of iterations that will be performed before printing
         the Swarm status.
        :param accumulate_rewards: Use the accumulated reward when scoring the walkers.
         False to use instantaneous reward.
        :param dt_mean: Mean skipframe used for exploring.
        :param dt_std: Standard deviation of the skipframe. Sampled from a normal distribution.
        :param min_dt: Minimum skipframe to be used by the swarm.
        :param custom_reward: Callable for calculating a custom reward function.
        :param custom_end: Callable for calculating custom boundary conditions.
        :param process_obs: Callable for doing custom observation processing.
        :param custom_skipframe: Callable for sampling the skipframe values of the walkers.
        :param keep_best: Keep track of the best accumulated reward found so far.
        :param can_win: Set to True if the game can be won when a given score is achieved.
         Meant to be used with Atari games like Boxing, Pong, IceHockey, etc.
        :param skip_initial_frames: Skip n frames when the game begins.
        :param max_samples_step: Maximum number of steps to be sampled per action.
        :param time_horizon: Desired path length allowed when calculating a step.
        :param min_horizon: Minimum path length allowed when calculating a step.
        :param update_parameters: Enable non-linear feedback loops to adjust internal parameters.
        """
        self.skip_initial_frames = skip_initial_frames
        self.max_walkers = n_walkers
        self.time_horizon = time_horizon
        self.max_samples = max_samples_step
        self.min_horizon = min_horizon
        _max_samples = max_samples_step if max_samples_step is not None else 1e10
        samples_limit = samples_limit if samples_limit is not None else 1e10
        self._max_step_total = max(_max_samples, samples_limit)
        self._max_samples_step = min(_max_samples, n_walkers * time_horizon)
        super(FractalMC, self).__init__(env=env, model=model, n_walkers=self.max_walkers,
                                        balance=balance, reward_limit=reward_limit,
                                        samples_limit=self._max_samples_step,
                                        render_every=render_every,
                                        custom_reward=custom_reward, custom_end=custom_end,
                                        dt_mean=dt_mean, dt_std=dt_std, keep_best=keep_best,
                                        accumulate_rewards=accumulate_rewards, min_dt=min_dt,
                                        can_win=can_win, process_obs=process_obs,
                                        custom_skipframe=custom_skipframe)
        self.init_ids = np.zeros(self.n_walkers).astype(int)
        self._update_parameters = update_parameters
        self._save_steps = []
        self._agent_reward = 0
        self._last_action = None
        self.tree = DynamicTree()

    @property
    def init_actions(self):
        return self.data.get_actions(self.init_ids)

    def init_swarm(self, state: np.ndarray = None, obs: np.ndarray = None):
        super(FractalMC, self).init_swarm(state=state, obs=obs)
        self.init_ids = np.zeros(self.n_walkers).astype(int)

    def clone(self):
        super(FractalMC, self).clone()
        if self._clone_idx is None:
            return
        self.init_ids = np.where(self._will_clone, self.init_ids[self._clone_idx],
                                 self.init_ids)

    def weight_actions(self) -> np.ndarray:
        """Get an approximation of the Q value function for a given state.

        It weights the number of times a given initial action appears in each state of
        the swarm. The proportion of times each initial action appears in the swarm is
        proportional to the Q value of that action.
        """
        if isinstance(self._model, DiscreteModel):
            # return self.init_actions[self.rewards.argmax()]
            counts = np.bincount(self.init_actions, minlength=self._env.n_actions)
            return np.argmax(counts)
        vals = self.init_actions.sum(axis=0)
        return vals / self.n_walkers

    def update_data(self):
        init_actions = list(set(np.array(self.init_ids).astype(int)))
        walker_data = list(set(np.array(self.walkers_id).astype(int)))
        self.data.update_values(set(walker_data + init_actions))

    def run_swarm(self, state: np.ndarray = None, obs: np.ndarray = None,
                  print_swarm: bool = False):
        """Iterate the swarm by evolving and cloning each walker until a certain
        condition is met.
        """
        self.reset()
        self.init_swarm(state=state, obs=obs)
        while not self.stop_condition():
            # We calculate the clone condition, and then perturb the walkers before cloning.
            # This allows the deaths to recycle faster, and the Swarm becomes more flexible.
            if self._i_simulation > 1:
                self.clone_condition()
            self.step_walkers()
            if self._i_simulation > 1:
                self.clone()
            elif self._i_simulation == 0:
                self.init_ids = self.walkers_id.copy()
            self._i_simulation += 1
            if self._i_simulation % self.render_every == 0 and print_swarm:
                print(self)
                clear_output(True)
        if print_swarm:
            print(self)

    def _update_n_samples(self):
        """Adjust the number of samples we make for calculating a state swarm.

        If we are doing poorly the number of samples will increase, and it will decrease
        if we are sampling further than the minimum mean time desired.
        """
        limit_samples = self._max_samples_step / np.maximum(1e-7, self.balance)
        # Round and clip
        limit_clean = int(np.clip(np.ceil(limit_samples), 2, self.max_samples))
        self._max_samples_step = max(limit_clean, self.n_walkers * self.min_horizon)

    def _update_n_walkers(self):
        """The number of parallel trajectories used changes every step.

        It tries to use enough walkers to make the mean time of the swarm tend to the
        minimum mean time selected.
        """
        new_n = self.n_walkers * self.balance
        new_n = int(np.clip(np.ceil(new_n), 2, self.max_walkers))
        self.n_walkers = new_n

    def _update_balance(self):
        """The balance parameter regulates how much you weight the distance of each
        state (exploration) with respect to its score (exploitation).

        A balance of 1 means that the computational resources assigned to a given
        decision have been just enough to reach the time horizon, so we can assign the
        same importance to exploration and exploitation.

        A balance lower than 1 means that we are not reaching the desired time horizon,
        so the algorithm is struggling to find a valid solution. In this case exploration
        should have more importance than exploitation, and the computational resources
        need to increase.

        A balance higher than 1 means that we have surpassed the time horizon: we are
        doing so well that we could use less computational resources and still meet the
        time horizon. This also means that we can give exploitation more importance,
        because we are exploring the state space well.
        """
        self.balance = self.times.mean() / self.time_horizon

    def update_parameters(self):
        """Update the parameters of the algorithm in order to keep the average time of
        the state swarm as close as possible to the minimum time horizon.
        """
        self._save_steps.append(int(self._n_samples_done))  # Save for showing while printing.
        self._update_balance()
        if self.balance >= 1:  # We are doing great.
            if self.n_walkers == self.max_walkers:
                self._update_n_samples()  # Decrease the samples so we can be faster.
            else:
                self._update_n_walkers()  # This will increase the number of walkers.
        else:  # We are not reaching the desired time horizon.
            if self._max_samples_step == self.max_samples:
                self._update_n_walkers()  # Reduce the number of walkers to avoid useless clones.
            else:
                self._update_n_samples()  # Increase the amount of computation.

    def stop_condition(self) -> bool:
        """Set a hard limit on the maximum number of samples. The swarm also finishes
        if all the walkers are dead, or the target score has been reached.
        """
        stop_hard = self._n_samples_done > self._max_samples_step
        stop_score = False if self.reward_limit is None else \
            self.rewards.max() >= self.reward_limit
        stop_terminal = self._end_cond.all()
        # Define the game status so the user knows why the game stopped.
        # Only used when printing the Swarm.
        if stop_hard:
            self._game_status = "Sample limit reached."
        elif stop_score:
            self._game_status = "Score limit reached."
        elif stop_terminal:
            self._game_status = "All the walkers died."
        else:
            self._game_status = "Playing..."
        return stop_hard or stop_score or stop_terminal

    def recover_game(self, index=None) -> tuple:
        """
        By default, returns the game sampled with the highest score.

        :param index: id of the leaf where the returned game will finish.
        :return: a list containing the observations of the target sampled game.
        """
        if index is None:
            index = self.walkers_id[self.rewards.argmax()]
        return self.tree.get_branch(index)

    def render_game(self, index=None, sleep: float = 0.02):
        """Render the game stored in the tree that ends in the node labeled as index."""
        idx = max(list(self.tree.data.nodes)) if index is None else index
        states, actions, dts = self.recover_game(idx)
        for state, action, dt in zip(states, actions, dts):
            self._env.step(action, state=state, n_repeat_action=dt)
            self._env.render()
            time.sleep(sleep)

    def estimate_distributions(self, state, obs):
        self.run_swarm(state=copy.deepcopy(state), obs=obs)
        self.update_parameters()
        rewards = self.get_expected_reward()
        if isinstance(self._model, DiscreteModel):
            # return self.init_actions[self.rewards.argmax()]
            counts = np.bincount(self.init_actions, minlength=self._env.n_actions)
            return counts / counts.sum(), rewards
        vals = self.init_actions.sum(axis=0)
        probs = vals / self.n_walkers
        return probs, rewards

    def get_expected_reward(self):
        init_act = self.init_actions
        max_rewards = np.array([
            self.rewards[init_act == i].max()
            if len(self.rewards[init_act == i]) > 0 else 0
            for i in range(self.env.n_actions)
        ])
        return max_rewards
        # TODO: Adapt again for continuous control problems. Open an issue if you need it.
        max_r = max_rewards.max()
        min_r = max_rewards.min()
        div = (max_r - min_r)
        normed = (max_rewards - min_r) / div if div != 0 else 1 + max_rewards
        return normed / normed.sum()

    def _skip_initial_frames(self) -> tuple:
        state, obs = self._env.reset(return_state=True)
        i_step, self._agent_reward, end = 0, 0, False
        info = {}
        _reward = 0
        for i in range(self.skip_initial_frames):
            i_step += 1
            action = 0
            state, obs, _reward, _end, info = self._env.step(
                state=state, action=action, n_repeat_action=self.min_dt)
            self.tree.append_leaf(i_step, parent_id=i_step - 1, state=state,
                                  action=action, dt=self._env.n_repeat_action)
            self._agent_reward += _reward
            self._last_action = action
            end = info.get("terminal", _end)
            if end:
                break
        return state, obs, _reward, end, info, i_step

    def run_agent(self, render: bool = False, print_swarm: bool = False):
        """
        :param render: Render the environment after every action taken by the agent.
        :param print_swarm: Print the internal Swarm status after every action.
        :return:
        """
        self.tree.reset()
        state, obs, _reward, end, info, i_step = self._skip_initial_frames()
        self._save_steps = []

        self.tree.append_leaf(i_step, parent_id=i_step - 1, state=state, action=0, dt=1)

        while not end and self._agent_reward < self.reward_limit:
            i_step += 1
            self.run_swarm(state=copy.deepcopy(state), obs=obs)
            action = self.weight_actions()

            state, obs, _reward, _end, info = self._env.step(
                state=state, action=action, n_repeat_action=self.min_dt)
            self.tree.append_leaf(i_step, parent_id=i_step - 1, state=state,
                                  action=action, dt=self._env.n_repeat_action)
            self._agent_reward += _reward
            self._last_action = action
            end = info.get("terminal", _end)

            if render:
                self._env.render()
            if print_swarm:
                print(self)
                clear_output(True)
            if self._update_parameters:
                self.update_parameters()

class FractalMC(Swarm):

    def __init__(self, env, model, max_walkers: int = 100, balance: float = 1.,
                 time_horizon: int = 15, reward_limit: float = None,
                 max_samples: int = None, render_every: int = 1e10,
                 custom_reward: Callable = None, custom_end: Callable = None,
                 dt_mean: float = None, dt_std: float = None,
                 accumulate_rewards: bool = True, keep_best: bool = True,
                 min_dt: int = 1):
        """
        :param env: Environment that will be sampled.
        :param model: Model used for sampling actions from observations.
        :param max_walkers: Number of walkers that the swarm will use.
        :param balance: Balance coefficient for the virtual reward formula.
        :param reward_limit: Maximum reward that can be reached before stopping the swarm.
        :param max_samples: Maximum number of times the Swarm can sample the environment
         before stopping.
        :param render_every: Number of iterations that will be performed before printing
         the Swarm status.
        """
        self.max_walkers = max_walkers
        self.time_horizon = time_horizon
        self.max_samples = max_samples
        _max_samples = max_samples if max_samples is not None else 1e10
        self._max_samples_step = min(_max_samples, max_walkers * time_horizon)
        super(FractalMC, self).__init__(env=env, model=model, n_walkers=self.max_walkers,
                                        balance=balance, reward_limit=reward_limit,
                                        samples_limit=self._max_samples_step,
                                        render_every=render_every,
                                        custom_reward=custom_reward, custom_end=custom_end,
                                        dt_mean=dt_mean, dt_std=dt_std, keep_best=keep_best,
                                        accumulate_rewards=accumulate_rewards, min_dt=min_dt)
        self.init_ids = np.zeros(self.n_walkers).astype(int)
        self._save_steps = []
        self._agent_reward = 0
        self._last_action = None
        self.tree = DynamicTree()

    @property
    def init_actions(self):
        return self.data.get_actions(self.init_ids)

    def init_swarm(self, state: np.ndarray = None, obs: np.ndarray = None):
        self.init_ids = np.zeros(self.n_walkers).astype(int)
        super(FractalMC, self).init_swarm(state=state, obs=obs)

    def clone(self):
        super(FractalMC, self).clone()
        self.init_ids = np.where(self._will_clone, self.init_ids[self._clone_idx],
                                 self.init_ids)

    def weight_actions(self) -> np.ndarray:
        """Get an approximation of the Q value function for a given state.

        It weights the number of times a given initial action appears in each state of
        the swarm. The proportion of times each initial action appears in the swarm is
        proportional to the Q value of that action.
        """
        if isinstance(self._model, DiscreteModel):
            # return self.init_actions[self.rewards.argmax()]
            counts = np.bincount(self.init_actions)
            return np.argmax(counts)
        vals = self.init_actions.sum(axis=0)
        return vals / self.n_walkers

    def update_data(self):
        init_actions = list(set(np.array(self.init_ids).astype(int)))
        walker_data = list(set(np.array(self.walkers_id).astype(int)))
        self.data.update_values(set(walker_data + init_actions))

    def run_swarm(self, state: np.ndarray = None, obs: np.ndarray = None,
                  print_swarm: bool = False):
        """Iterate the swarm by evolving and cloning each walker until a certain
        condition is met.
        """
        self.init_swarm(state=state, obs=obs)
        while not self.stop_condition():
            try:
                # We calculate the clone condition, and then perturb the walkers before
                # cloning. This allows the deaths to recycle faster, and the Swarm
                # becomes more flexible.
                if self._i_simulation > 1:
                    self.clone_condition()
                self.step_walkers()
                if self._i_simulation > 1:
                    self.clone()
                elif self._i_simulation == 0:
                    self.init_ids = self.walkers_id.copy()
                self._i_simulation += 1
                if self._i_simulation % self.render_every == 0 and print_swarm:
                    print(self)
                    clear_output(True)
            except KeyboardInterrupt:
                break
        if print_swarm:
            print(self)

    def _update_n_samples(self):
        """Adjust the number of samples we make for calculating a state swarm.

        If we are doing poorly the number of samples will increase, and it will decrease
        if we are sampling further than the minimum mean time desired.
        """
        limit_samples = self._max_samples_step / np.maximum(1e-7, self.balance)
        # Round and clip
        limit_clean = int(np.clip(np.ceil(limit_samples), 2, self.max_samples))
        self._max_samples_step = limit_clean

    def _update_n_walkers(self):
        """The number of parallel trajectories used changes every step.

        It tries to use enough walkers to make the mean time of the swarm tend to the
        minimum mean time selected.
        """
        new_n = self.n_walkers * self.balance
        new_n = int(np.clip(np.ceil(new_n), 2, self.max_walkers))
        self.n_walkers = new_n

    def _update_balance(self):
        """The balance parameter regulates how much you weight the distance of each
        state (exploration) with respect to its score (exploitation).

        A balance of 1 means that the computational resources assigned to a given
        decision have been just enough to reach the time horizon, so we can assign the
        same importance to exploration and exploitation.

        A balance lower than 1 means that we are not reaching the desired time horizon,
        so the algorithm is struggling to find a valid solution. In this case exploration
        should have more importance than exploitation, and the computational resources
        need to increase.

        A balance higher than 1 means that we have surpassed the time horizon: we are
        doing so well that we could use less computational resources and still meet the
        time horizon. This also means that we can give exploitation more importance,
        because we are exploring the state space well.
        """
        self.balance = self.times.mean() / self.time_horizon

    def update_parameters(self):
        """Update the parameters of the algorithm in order to keep the average time of
        the state swarm as close as possible to the minimum time horizon.
        """
        self._save_steps.append(int(self._n_samples_done))  # Save for showing while printing.
        self._update_balance()
        if self.balance >= 1:  # We are doing great.
            if self.n_walkers == self.max_walkers:
                self._update_n_samples()  # Decrease the samples so we can be faster.
            else:
                self._update_n_walkers()  # This will increase the number of walkers.
        else:  # We are not reaching the desired time horizon.
            if self._max_samples_step == self.max_samples:
                self._update_n_walkers()  # Reduce the number of walkers to avoid useless clones.
            else:
                self._update_n_samples()  # Increase the amount of computation.

    def stop_condition(self) -> bool:
        """Set a hard limit on the maximum number of samples. The swarm also finishes
        if all the walkers are dead, or the target score has been reached.
        """
        stop_hard = self._n_samples_done > self._max_samples_step
        stop_score = False if self.reward_limit is None else \
            self.rewards.max() >= self.reward_limit
        stop_terminal = self._end_cond.all()
        # Define the game status so the user knows why the game stopped.
        # Only used when printing the Swarm.
        if stop_hard:
            self._game_status = "Sample limit reached."
        elif stop_score:
            self._game_status = "Score limit reached."
        elif stop_terminal:
            self._game_status = "All the walkers died."
        else:
            self._game_status = "Playing..."
        return stop_hard or stop_score or stop_terminal

    def recover_game(self, index=None) -> tuple:
        """
        By default, returns the game sampled with the highest score.

        :param index: id of the leaf where the returned game will finish.
        :return: a list containing the observations of the target sampled game.
        """
        if index is None:
            index = self.walkers_id[self.rewards.argmax()]
        return self.tree.get_branch(index)

    def render_game(self, index=None, sleep: float = 0.02):
        """Render the game stored in the tree that ends in the node labeled as index."""
        idx = max(list(self.tree.data.nodes)) if index is None else index
        states, actions, dts = self.recover_game(idx)
        for state, action, dt in zip(states, actions, dts):
            self._env.step(action, state=state, n_repeat_action=dt)
            self._env.render()
            time.sleep(sleep)

    def run_agent(self, render: bool = False, print_swarm: bool = False):
        """
        :param render: Render the environment after every action taken by the agent.
        :param print_swarm: Print the internal Swarm status after every action.
        :return:
        """
        self.tree.reset()
        i_step, self._agent_reward, end = 0, 0, False
        self._save_steps = []

        # Clone the emulator state, including the system state and its pseudorandomness.
        # Restoring this state will give an identical environment.
        state, obs = self._env.reset(return_state=True)
        self.tree.append_leaf(i_step, parent_id=i_step - 1, state=state, action=0, dt=1)
        reward_sum = 0

        net = dqn_model.DQN(self._env.observation_space.shape,
                            self._env.action_space.n).to(device)
        tgt_net = dqn_model.DQN(self._env.observation_space.shape,
                                self._env.action_space.n).to(device)
        buffer = dqn_agent.ExperienceBuffer(REPLAY_SIZE)
        agent = dqn_agent.Agent(self._env, buffer)
        optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

        current_obs = obs
        start_time = time.time()
        while not end and self._agent_reward < self.reward_limit:
            i_step += 1
            self.run_swarm(state=state.copy(), obs=obs)
            action = self.weight_actions()

            state, obs, _reward, _end, info = self._env.step(
                state=state, action=action, n_repeat_action=self.min_dt)

            # Store the transition chosen by the swarm in the experience buffer.
            next_obs = obs
            exp = Experience(current_obs, action, _reward, _end, next_obs)
            current_obs = next_obs
            agent.exp_buffer.append(exp)

            reward_sum += _reward
            self.tree.append_leaf(i_step, parent_id=i_step - 1, state=state,
                                  action=action, dt=self._env.n_repeat_action)
            self._agent_reward += _reward
            self._last_action = action
            end = info.get("terminal", _end)
            if _end:
                print('ep %d: game over. episode reward total was %f' % (i_step, reward_sum))

            if render:
                self._env.render()
            if print_swarm:
                print(self)
                clear_output(True)
            self.update_parameters()

        # Train the DQN model on the experiences gathered by the swarm.
        print("experiences exploration time/seconds:", time.time() - start_time)
        print("**************dqn agent training...*******************")
        num_episodes = 10000
        reward_sum = 0
        start_time = time.time()
        for i_episode in range(num_episodes):
            optimizer.zero_grad()
            batch = agent.exp_buffer.sample(BATCH_SIZE)
            loss_t = dqn_agent.calc_loss(batch, net, tgt_net, device=device)
            loss_t.backward()
            optimizer.step()
        print("Train time/seconds:", time.time() - start_time)

        # Test the trained DQN agent on a fresh environment.
        print("#############dqn agent testing...############")
        env = gym.make('Pong-v0')
        current_obs = env.reset()
        start_time = time.time()
        while True:
            state_a = np.array([current_obs], copy=False)
            state_v = torch.tensor(state_a, dtype=torch.float).to(device)
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())
            new_state, _reward, _end, _ = env.step(action)
            current_obs = new_state
            reward_sum += _reward
            if _end:
                print('game over. reward total was %f' % reward_sum)
                break
        print("Test time/seconds:", time.time() - start_time)
        print("################test over##########################")
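
# --- Assumed definitions (sketch) ----------------------------------------------------
# run_agent above relies on `Experience`, `REPLAY_SIZE`, `BATCH_SIZE`, `LEARNING_RATE`
# and `device`, which are not defined in this file. The definitions below are a
# plausible sketch of what the accompanying dqn_agent/dqn_model modules would provide;
# they are assumptions for illustration, not code taken from this repository.
import collections

import torch

Experience = collections.namedtuple(
    "Experience", field_names=["state", "action", "reward", "done", "new_state"])

REPLAY_SIZE = 10_000      # capacity of the experience buffer (illustrative value)
BATCH_SIZE = 32           # minibatch size passed to calc_loss (illustrative value)
LEARNING_RATE = 1e-4      # Adam learning rate (illustrative value)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
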

class SwarmWave(Swarm):

    def __init__(self, env, model, n_walkers: int = 100, balance: float = 1.,
                 reward_limit: float = None, samples_limit: int = None,
                 render_every: int = 1e10, save_data: bool = True,
                 accumulate_rewards: bool = True, dt_mean: float = None,
                 dt_std: float = None, custom_reward: Callable = None,
                 custom_end: Callable = None, keep_best: bool = False,
                 min_dt: int = 1):
        """
        :param env: Environment that will be sampled.
        :param model: Model used for sampling actions from observations.
        :param n_walkers: Number of walkers that the swarm will use.
        :param balance: Balance coefficient for the virtual reward formula.
        :param reward_limit: Maximum reward that can be reached before stopping the swarm.
        :param samples_limit: Maximum number of times the Swarm can sample the environment
         before stopping.
        :param render_every: Number of iterations that will be performed before printing
         the Swarm status.
        """
        super(SwarmWave, self).__init__(env=env, model=model, n_walkers=n_walkers,
                                        balance=balance, reward_limit=reward_limit,
                                        samples_limit=samples_limit,
                                        render_every=render_every,
                                        accumulate_rewards=accumulate_rewards,
                                        dt_mean=dt_mean, dt_std=dt_std,
                                        custom_end=custom_end, custom_reward=custom_reward,
                                        keep_best=keep_best, min_dt=min_dt)
        self.save_data = save_data
        self.old_ids = np.zeros(self.n_walkers)
        self.tree = DynamicTree() if save_data else None
        self._current_index = None
        self._curr_states = []
        self._curr_actions = []
        self._curr_dts = []
        self._current_ix = -1

    def __str__(self):
        text = super(SwarmWave, self).__str__()
        if self.save_data:
            efi = (len(self.tree.data.nodes) / self._n_samples_done) * 100
            sam_step = self._n_samples_done / len(self.tree.data.nodes)
            samples = len(self.tree.data.nodes)
        else:
            efi, samples, sam_step = 0, 0, 0
        new_text = "{}\n" \
                   "Efficiency {:.2f}%\n" \
                   "Generated {} Examples |" \
                   " {:.2f} samples per example.\n".format(text, efi, samples, sam_step)
        return new_text

    def step_walkers(self):
        old_ids = self.walkers_id.copy()
        super(SwarmWave, self).step_walkers()
        if self.save_data:
            for i, idx in enumerate(self.walkers_id):
                self.tree.append_leaf(int(idx), parent_id=int(old_ids[i]),
                                      state=self.data.get_states([idx]).copy()[0],
                                      action=self.data.get_actions([idx]).copy()[0],
                                      dt=copy.deepcopy(self.dt[i]))

    def clone(self):
        super(SwarmWave, self).clone()
        # Prune the tree to save memory
        if self.save_data:
            dead_leafs = list(set(self._pre_clone_ids) - set(self._post_clone_ids))
            self.tree.prune_tree(dead_leafs, self._post_clone_ids)

    def recover_game(self, index=None) -> tuple:
        """
        By default, returns the game sampled with the highest score.

        :param index: id of the leaf where the returned game will finish.
        :return: a list containing the observations of the target sampled game.
        """
        if index is None:
            index = self.walkers_id[self.rewards.argmax()]
        return self.tree.get_branch(index)

    def render_game(self, index=None, sleep: float = 0.02):
        """Render the game stored in the tree that ends in the node labeled as index."""
        states, actions, dts = self.recover_game(index)
        for state, action, dt in zip(states, actions, dts):
            self._env.step(action, state=state, n_repeat_action=1)
            self._env.render()
            time.sleep(sleep)
            for i in range(max(0, dt - 1)):
                self._env.step(action, n_repeat_action=1)
                self._env.render()
                time.sleep(sleep)
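
# --- Usage sketch (illustrative, not part of the original class) --------------------
# A minimal sketch of sampling a game with SwarmWave and replaying the best branch.
# It assumes `env` and `model` follow the same interfaces the class above relies on;
# the parameter values are arbitrary examples.
def _example_swarm_wave_run(env, model):
    wave = SwarmWave(env=env, model=model, n_walkers=64, samples_limit=20000,
                     reward_limit=None, save_data=True)
    wave.run_swarm(print_swarm=False)           # sample until a stop condition is met
    states, actions, dts = wave.recover_game()  # highest-scoring branch by default
    wave.render_game(sleep=0.01)                # replay it on screen
    return states, actions, dts
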

class SwarmWave(Swarm):

    tree = DynamicTree()

    def __init__(self, env, model, n_walkers: int = 100, balance: float = 1.,
                 reward_limit: float = None, samples_limit: int = None,
                 render_every: int = 1e10, accumulate_rewards: bool = True,
                 dt_mean: float = None, dt_std: float = None, min_dt: int = 1,
                 custom_reward: Callable = None, custom_end: Callable = None,
                 process_obs: Callable = None, custom_skipframe: Callable = None,
                 keep_best: bool = False, can_win: bool = False,
                 save_data: bool = True, prune_tree: bool = True):
        """
        :param env: Environment that will be sampled.
        :param model: Model used for sampling actions from observations.
        :param n_walkers: Number of walkers that the swarm will use.
        :param balance: Balance coefficient for the virtual reward formula.
        :param reward_limit: Maximum reward that can be reached before stopping the swarm.
        :param samples_limit: Maximum number of times the Swarm can sample the environment
         before stopping.
        :param render_every: Number of iterations that will be performed before printing
         the Swarm status.
        :param accumulate_rewards: Use the accumulated reward when scoring the walkers.
         False to use instantaneous reward.
        :param dt_mean: Mean skipframe used for exploring.
        :param dt_std: Standard deviation of the skipframe. Sampled from a normal distribution.
        :param min_dt: Minimum skipframe to be used by the swarm.
        :param custom_reward: Callable for calculating a custom reward function.
        :param custom_end: Callable for calculating custom boundary conditions.
        :param process_obs: Callable for doing custom observation processing.
        :param custom_skipframe: Callable for sampling the skipframe values of the walkers.
        :param keep_best: Keep track of the best accumulated reward found so far.
        :param can_win: Set to True if the game can be won when a given score is achieved.
         Meant to be used with Atari games like Boxing, Pong, IceHockey, etc.
        :param save_data: Store data to construct a tree of paths.
        :param prune_tree: Delete a path if no walker is expanding it.
        """
        super(SwarmWave, self).__init__(env=env, model=model, n_walkers=n_walkers,
                                        balance=balance, reward_limit=reward_limit,
                                        samples_limit=samples_limit,
                                        render_every=render_every,
                                        accumulate_rewards=accumulate_rewards,
                                        dt_mean=dt_mean, dt_std=dt_std,
                                        custom_end=custom_end, custom_reward=custom_reward,
                                        keep_best=keep_best, min_dt=min_dt,
                                        process_obs=process_obs, can_win=can_win,
                                        custom_skipframe=custom_skipframe)
        self.save_data = save_data
        self.prune_tree = prune_tree
        self.old_ids = np.zeros(self.n_walkers)
        self._current_index = None
        self._curr_states = []
        self._curr_actions = []
        self._curr_dts = []
        self._current_ix = -1

    def __str__(self):
        text = super(SwarmWave, self).__str__()
        if self.save_data:
            efi = (len(self.tree.data.nodes) / self._n_samples_done) * 100
            sam_step = self._n_samples_done / len(self.tree.data.nodes)
            samples = len(self.tree.data.nodes)
        else:
            efi, samples, sam_step = 0, 0, 0
        new_text = "{}\n" \
                   "Efficiency {:.2f}%\n" \
                   "Generated {} Examples |" \
                   " {:.2f} samples per example.\n".format(text, efi, samples, sam_step)
        return new_text

    def init_swarm(self, state: np.ndarray = None, obs: np.ndarray = None):
        super(SwarmWave, self).init_swarm(state=state, obs=obs)
        self.tree.data.nodes[0]["obs"] = obs if obs is not None else self.env.reset()[1]
        self.tree.data.nodes[0]["terminal"] = False

    def step_walkers(self):
        old_ids = self.walkers_id.copy()
        super(SwarmWave, self).step_walkers()
        if self.save_data:
            for i, idx in enumerate(self.walkers_id):
                self.tree.append_leaf(int(idx), parent_id=int(old_ids[i]),
                                      state=self.data.get_states([idx]).copy()[0],
                                      action=self.data.get_actions([idx]).copy()[0],
                                      dt=copy.deepcopy(self.dt[i]))

    def clone(self):
        super(SwarmWave, self).clone()
        # Prune the tree to save memory
        if self.save_data and self.prune_tree:
            dead_leafs = list(set(self._pre_clone_ids) - set(self._post_clone_ids))
            self.tree.prune_tree(dead_leafs, self._post_clone_ids)

    def recover_game(self, index=None) -> tuple:
        """
        By default, returns the game sampled with the highest score.

        :param index: id of the leaf where the returned game will finish.
        :return: a list containing the observations of the target sampled game.
        """
        if index is None:
            index = self.walkers_id[self.rewards.argmax()]
        return self.tree.get_branch(index)

    def render_game(self, index=None, sleep: float = 0.02):
        """Render the game stored in the tree that ends in the node labeled as index."""
        states, actions, dts = self.recover_game(index)
        for state, action, dt in zip(states, actions, dts):
            _, _, _, end, _ = self._env.step(action, state=state, n_repeat_action=1)
            self._env.render()
            time.sleep(sleep)
            for i in range(max(0, dt - 1)):
                self._env.step(action, n_repeat_action=1)
                self._env.render()
                time.sleep(sleep)

    def run_swarm(self, state: np.ndarray = None, obs: np.ndarray = None,
                  print_swarm: bool = False):
        self.tree.reset()
        super(SwarmWave, self).run_swarm(state=state, obs=obs, print_swarm=print_swarm)