Example #1
    def __init__(self, multihead_net: MultiheadNetwork, act_def: ContinuousDefinition, opts:SACAgentOptions=SACAgentOptions()):
        super().__init__()
        self.opts = opts
        self.act_def = act_def
        self.exp_buffer = ExperienceBuffer(self.opts.exp_buffer_size, act_def)
        self.multihead_net = multihead_net
        self.multihead_net.init_network(self.opts.learning_rate)

        self.init_entropy()
        if not callable(getattr(self.multihead_net, "sample", None)):
            raise NotImplementedError("Network must implement 'sample' method")
Example #2
    def __init__(self, network, act_def: DiscreteDefinition, opts=DQAgentOpts()):
        super().__init__()
        self.network = network
        self.target_network = copy.deepcopy(network)
        polyak_update(self.target_network, self.network, 1)
        
        # Freeze target
        for p in self.target_network.parameters():
            p.requires_grad = False

        self.opts = opts
        self.act_def = act_def
        self.exp_buffer = ExperienceBuffer(self.opts.exp_buffer_size)
        self.exp_stack = StackedState(self.opts.exp_stack_size)
        self.epsilon = 1.0
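
`polyak_update(target, source, p)` is not defined in these snippets. Given that it is called with `p=1` to initialize frozen target networks and with small values (0.01, `tau`) for soft updates, it presumably blends parameters roughly like the sketch below; this is an assumption, not the project's implementation.

import torch

def polyak_update(target_net, source_net, p):
    # Blend source parameters into the target: p=1 is a hard copy,
    # a small p gives the soft / Polyak-averaged target update.
    with torch.no_grad():
        for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
            t_param.data.copy_(p * s_param.data + (1.0 - p) * t_param.data)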
Example #3
    def __init__(self,
                 actor_network,
                 critic_network,
                 act_def: ContinuousDefinition,
                 opts=DDPGAgentOptions()):
        '''
        :param actor_network (nn.Module): Actor network
        :param critic_network (nn.Module): Critic network
        :param act_def (ContinuousDefinition): Action space definition
        :param opts (DDPGAgentOptions): Agent options
        '''
        super().__init__()
        self.opts = opts
        self.actor_network = actor_network
        self.critic_network = critic_network
        self.act_def = act_def
        self.target_actor_network = copy.deepcopy(actor_network)
        self.target_critic_network = copy.deepcopy(critic_network)
        # Freeze target networks. They will never be updated with gradient descent/ascent
        for p in self.target_critic_network.parameters():
            p.requires_grad = False
        for p in self.target_actor_network.parameters():
            p.requires_grad = False

        # Init target networks with the same parameters as the source networks
        polyak_update(self.target_actor_network, self.actor_network, 1)
        polyak_update(self.target_critic_network, self.critic_network, 1)
        self.exp_buffer = ExperienceBuffer(self.opts.exp_buffer_size)

        # Initialize optimizer
        self.actor_optimizer = self.opts.actor_optimizer(
            self.actor_network.parameters(), self.opts.actor_learning_rate)
        self.critic_optimizer = self.opts.critic_optimizer(
            self.critic_network.parameters(), self.opts.critic_learning_rate)
        self.random_process = OrnsteinUhlenbeckProcess(size=2,
                                                       theta=0.15,
                                                       mu=0.0,
                                                       sigma=0.1)
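
`OrnsteinUhlenbeckProcess(size, theta, mu, sigma)` is also external to these snippets. A common implementation of that temporally correlated exploration noise, matching the `sample()` and `reset_states()` calls used by the agent, looks roughly like this; `SimpleOUProcess` is a hypothetical stand-in assuming a unit time step.

import numpy as np

class SimpleOUProcess:
    """Hypothetical stand-in for OrnsteinUhlenbeckProcess with the same constructor arguments."""

    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.1):
        self.size, self.theta, self.mu, self.sigma = size, theta, mu, sigma
        self.reset_states()

    def reset_states(self):
        self.x = np.ones(self.size) * self.mu

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the noise drifts back toward mu over time
        dx = self.theta * (self.mu - self.x) + self.sigma * np.random.normal(size=self.size)
        self.x = self.x + dx
        return self.x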
Example #4
class DDPGAGent(Agent):
    def __init__(self,
                 actor_network,
                 critic_network,
                 act_def: ContinuousDefinition,
                 opts=DDPGAgentOptions()):
        '''
        :param actor_network (nn.Module): Actor network
        :param critic_network (nn.Module): Critic network
        :param act_def (ContinuousDefinition): Action space definition
        :param opts (DDPGAgentOptions): Agent options
        '''
        super().__init__()
        self.opts = opts
        self.actor_network = actor_network
        self.critic_network = critic_network
        self.act_def = act_def
        self.target_actor_network = copy.deepcopy(actor_network)
        self.target_critic_network = copy.deepcopy(critic_network)
        # Freeze target networks. They will never be updated with gradient descent/ascent
        for p in self.target_critic_network.parameters():
            p.requires_grad = False
        for p in self.target_actor_network.parameters():
            p.requires_grad = False

        # Init target networks with the same parameters as the source networks
        polyak_update(self.target_actor_network, self.actor_network, 1)
        polyak_update(self.target_critic_network, self.critic_network, 1)
        self.exp_buffer = ExperienceBuffer(self.opts.exp_buffer_size)

        # Initialize optimizer
        self.actor_optimizer = self.opts.actor_optimizer(
            self.actor_network.parameters(), self.opts.actor_learning_rate)
        self.critic_optimizer = self.opts.critic_optimizer(
            self.critic_network.parameters(), self.opts.critic_learning_rate)
        self.random_process = OrnsteinUhlenbeckProcess(size=2,
                                                       theta=0.15,
                                                       mu=0.0,
                                                       sigma=0.1)

    def act(self, state, add_noise=False, uniform_noise=False):
        '''
        Generates an action using the actor network. During training, some noise is added to the action to ensure exploration.
        Adding noise reflects the off-policy nature of DDPG.
        :param state: Current environment state
        :param add_noise: If True, add Gaussian exploration noise to the actor output
        :param uniform_noise: If True, sample the action uniformly within the action limits instead of using the actor
        :return: Action tensor
        '''
        if uniform_noise:
            action = np.random.uniform(self.act_def.lower_limit,
                                       self.act_def.upper_limit,
                                       size=self.act_def.shape)
            action = torch.tensor(action).float().to(self.actor_network.device)
        else:
            if type(state) is not torch.Tensor:
                state = torch.tensor(state).to(
                    self.actor_network.device).float()
            bias = (self.act_def.upper_limit + self.act_def.lower_limit) / 2
            bias = torch.tensor(bias).to(self.actor_network.device).float()
            bias.requires_grad = False
            scale = (self.act_def.upper_limit - self.act_def.lower_limit) / 2
            #action = self.actor_network.forward(state)*scale + bias
            action = self.actor_network.forward(state)
            if add_noise:
                if self.opts.noise_epsilon > 0:
                    #action = torch.add(action, torch.tensor(self.random_process.sample()).float().to(self.actor_network.device))
                    noise = torch.randn(action.shape,
                                        dtype=torch.float32) * math.sqrt(
                                            self.opts.noise_var)
                    action = action + noise.to(
                        self.actor_network.device).float()

        return action

    def learn(self, environment: Environment, n_episodes: int,
              n_iterations: int):
        avg_rewards = []
        for i in range(n_episodes):
            n_update_iter = 0  # Number of update iterations done. Needed to check if target networks need update
            curr_state = torch.tensor(environment.reset()).to(
                device=self.actor_network.device).float()
            episode_rewards = []
            while True:
                uniform_noise = False
                if n_update_iter < self.opts.uniform_noise_steps:
                    # Select a random action for early exploration
                    uniform_noise = True
                action = self.act(
                    curr_state, add_noise=True,
                    uniform_noise=uniform_noise).cpu().detach().numpy()
                next_state, reward, done, _ = environment.step(action)
                episode_rewards.append(reward)
                self.exp_buffer.add_experience(
                    curr_state,
                    torch.tensor(action).float(),
                    torch.tensor(reward).float(),
                    torch.tensor(next_state).float(), torch.tensor(done))
                curr_state = torch.tensor(next_state).float().to(
                    self.actor_network.device)
                curr_state.requires_grad = False
                self.opts.noise_epsilon = self.opts.noise_epsilon - self.opts.noise_depsilon
                if done:
                    self.reset()
                    total_episode_reward = np.array(episode_rewards).sum()
                    avg_rewards.append(total_episode_reward)
                    print(
                        "({}/{}) - End of episode with total reward: {} iteration: {}"
                        .format(i, n_episodes, total_episode_reward,
                                n_update_iter))
                    break
                if self.exp_buffer.is_accumulated():  # Do the updates
                    # Sample experiences
                    #self.critic_network.eval()
                    s_states, s_actions, s_rewards, s_next_states, s_done =\
                        self.exp_buffer.sample_tensor(self.opts.exp_batch_size, device=self.actor_network.device, dtype=torch.float32)

                    critic = self.critic_network.forward(
                        s_states, s_actions.detach())
                    target_actions = self.target_actor_network.forward(
                        s_next_states)
                    target_critics = self.target_critic_network.forward(
                        s_next_states, target_actions)
                    target = s_rewards.view(-1, 1) + self.opts.discount * (
                        1 - s_done.view(-1, 1)) * target_critics

                    # Run Gradient Descent on critic network
                    self.critic_optimizer.zero_grad()
                    #self.critic_network.train()  # Enable train mode
                    critic_loss = torch.nn.functional.mse_loss(critic, target)
                    critic_loss.backward()
                    self.critic_optimizer.step()

                    # Run Gradient Ascent on actor network
                    self.actor_optimizer.zero_grad()
                    self.actor_network.train()  # Enable train mode
                    actor_out = self.act(s_states)
                    actor_loss = -self.critic_network(s_states.detach(),
                                                      actor_out)
                    actor_loss = actor_loss.mean()
                    actor_loss.backward()
                    self.actor_optimizer.step()
                    #print(self.actor_network.fc3.weight.grad.mean())
                    self.update_target_networks(0.01)
                n_update_iter += 1  # One iteration is complete
        return avg_rewards

    def reset(self):
        self.random_process.reset_states()

    def save_model(self, PATH):
        torch.save(self.actor_network.state_dict(), PATH + "_actor")
        torch.save(self.critic_network.state_dict(), PATH + "_critic")

    def load_model(self, PATH):
        self.actor_network.load_state_dict(torch.load(PATH + "_actor"))
        self.critic_network.load_state_dict(torch.load(PATH + "_critic"))

    def update_target_networks(self, p):
        polyak_update(self.target_actor_network, self.actor_network, p)
        polyak_update(self.target_critic_network, self.critic_network, p)
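
The critic target built inside `learn` is the standard DDPG bootstrap, target = reward + discount * (1 - done) * Q_target(next_state, mu_target(next_state)). A tiny standalone check of that masking logic, with made-up numbers:

import torch

rewards = torch.tensor([1.0, 0.5])
done = torch.tensor([0.0, 1.0])                # second transition is terminal
target_critics = torch.tensor([[2.0], [3.0]])  # Q_target(s', mu_target(s'))
discount = 0.99

target = rewards.view(-1, 1) + discount * (1 - done.view(-1, 1)) * target_critics
print(target)  # [[2.9800], [0.5000]] -> the terminal transition bootstraps to the reward only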
Example #5
from Agents.ExperienceBuffer import ExperienceBuffer

exp = ExperienceBuffer(5)

exp.add_experience(0, 1, 4, 4, 0)
exp.add_experience(0, 2, 4, 5, 0)
exp.add_experience(0, 3, 4, 6, 0)
exp.add_experience(0, 4, 4, 7, 1)

states, actions, rewards, next_states, done = exp.sample(2)
for i in range(2):
    print("Sampled exp: {}, {}, {}, {}, {}".format(states[i], actions[i],
                                                   rewards[i], next_states[i],
                                                   done[i]))

import torch
import math
import numpy as np

t = torch.Tensor([1, 2])
noise = torch.randn(t.shape, dtype=torch.float32)
uni = np.random.uniform(-9.8, 9.8, size=(10, 1))
print(uni)
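
The snippet above exercises `ExperienceBuffer(capacity)`, `add_experience(state, action, reward, next_state, done)` and `sample(n)`. A minimal buffer with that interface could be sketched as follows; the real class also offers `sample_tensor`, `sample_numpy`, `is_accumulated` and clustering helpers that are omitted here, and `MinimalExperienceBuffer` is a hypothetical name.

import random
from collections import deque

class MinimalExperienceBuffer:
    """Hypothetical stripped-down buffer matching the calls used in the snippet above."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # oldest experiences are dropped first

    def __len__(self):
        return len(self.buffer)

    def add_experience(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(list(self.buffer), batch_size)
        # Transpose the list of tuples into (states, actions, rewards, next_states, dones)
        return tuple(map(list, zip(*batch)))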
Example #6
class SACAgent(Agent):
    '''
    Implements a Soft Actor-Critic agent
    '''
    def __init__(self, multihead_net: MultiheadNetwork, act_def: ContinuousDefinition, opts:SACAgentOptions=SACAgentOptions()):
        super().__init__()
        self.opts = opts
        self.act_def = act_def
        self.exp_buffer = ExperienceBuffer(self.opts.exp_buffer_size, act_def)
        self.multihead_net = multihead_net
        self.multihead_net.init_network(self.opts.learning_rate)
        
        self.init_entropy()
        if not callable(getattr(self.multihead_net, "sample", None)):
            raise NotImplementedError("Network must implement 'sample' method")

    def init_entropy(self):
        if self.opts.auto_entropy:
            device = "cpu"
            if self.opts.use_gpu and torch.cuda.is_available():
                device = "cuda:0"
            self.entropy = torch.zeros(1, dtype=torch.float32, requires_grad=True, device=device)
            self.entropy_optimizer = optim.Adam([self.entropy], self.opts.auto_entrop_lr)

        else:
            self.entropy = self.opts.entropy_scale

    def act_cluster(self, new_state, n_episode):
        # Classify new_state
        index = self.exp_buffer.classify(new_state, self.opts.update_cluster_scale)
        # Get the action of the cluster it belongs to
        cluster = self.exp_buffer.clusters[index]
        # Calculate the next action
        if n_episode < self.opts.n_episodes_exploring_least_acts:
            # Explore by using the least-used actions
            action = cluster.generate_action(self.act_def)
        else:
            # Use actions with better rewards
            action = cluster.generate_action_reward(self.act_def)
        self.exp_buffer.last_cluster_id = index  # Just in case
        return action

    def act(self, new_state, evaluation=False):
        new_state = self.multihead_net.feature_extraction(new_state)
        # TODO: For now evaluation and no evaluation same. 
        if not evaluation:
            #new_state = torch.from_numpy(new_state).to(device).float().unsqueeze(0)
            with torch.no_grad():
                action, _, _  = self.multihead_net.sample(new_state, add_noise=True)
            return action.squeeze(0).detach().cpu().numpy()
        else:
            #new_state = torch.from_numpy(new_state).to(device).float().unsqueeze(0)
            with torch.no_grad():
                action , _, _  = self.multihead_net.sample(new_state, add_noise=False)
            return action.squeeze(0).detach().cpu().numpy()

    def update_params(self, n_iter, device):
        
        # Learn if enough data is accumulated
        if self.exp_buffer.is_accumulated(self.opts.exp_batch_size):
            if(self.opts.clustering and len(self.exp_buffer.clusters) == 0):
                return
            if self.multihead_net.base_optimizer is not None:
                self.multihead_net.base_optimizer.zero_grad()
            # Sample from buffer
            s_states, s_actions, s_rewards, s_next_states, s_done = \
                self.exp_buffer.sample_tensor(self.opts.exp_batch_size, device, torch.float)

            features = self.multihead_net(s_states)

            # Target Values
            with torch.no_grad():
                target_features = self.multihead_net(s_next_states)
                next_actions, log_probs, _ = self.multihead_net.sample(target_features, add_noise=True)
                critic_1_target, critic_2_target = self.multihead_net.get_target_critics(target_features, next_actions)
                critic_target = torch.min(critic_1_target, critic_2_target)
                target_value = critic_target - self.opts.entropy_scale*log_probs
                target_value = (target_value*(1-s_done.view(-1,1)))
                q_hat = s_rewards.view(-1,1) + self.opts.discount*target_value
                        
            # Optimize Critic
            self.multihead_net.critic_optimizer.zero_grad()
            critic_1, critic_2 = self.multihead_net.get_critics(features, s_actions)
            critic_loss_1 = F.mse_loss(critic_1, q_hat.detach())
            critic_loss_2 = F.mse_loss(critic_2, q_hat.detach())
            critic_loss = critic_loss_1 + critic_loss_2
            critic_loss.backward(retain_graph = True)
            self.multihead_net.critic_optimizer.step()

            # Optimize Policy
            self.multihead_net.policy_optimizer.zero_grad()
            # Calculate critic values for value and policy using the actions sampled from the current policy
            actions, log_probs, _ = self.multihead_net.sample(features, add_noise=True)
            critic_1_curr, critic_2_curr = self.multihead_net.get_critics(features, actions)
            critic_curr = torch.min(critic_1_curr, critic_2_curr)
            actor_loss = (self.opts.entropy_scale*log_probs - critic_curr).mean()
            actor_loss.backward()
            self.multihead_net.policy_optimizer.step()

            if self.multihead_net.base_net is not None:
                self.multihead_net.base_optimizer.step()

            if n_iter % 2500 == 0:
                print("Critic Loss 1: {} - Critic Loss 2: {} - Actor Loss: {}".format(
                    critic_loss_1.item(), critic_loss_2.item(), actor_loss.item()))

            if n_iter % 1 == 0:
                self.multihead_net.update_targets(self.opts.tau)

    def load_checkpoint(self, checkpoint):
        all_rewards = []
        avg_rewards = []
        if checkpoint is not None:
            filepath = checkpoint
            reward_path = os.path.dirname(filepath)
            reward_path = os.path.join(reward_path, "rewards")
            self.load_model(filepath)
            with open(reward_path, 'r') as reward_file:
                reward_dict = json.load(reward_file)
                all_rewards = reward_dict['Rewards']
                avg_rewards = reward_dict['Averages']
        
        return all_rewards, avg_rewards

    def learn(self, env:Environment, trnOpts: TrainOpts):
        device = "cpu"
        if self.opts.use_gpu and torch.cuda.is_available():
            device = "cuda:0"
        
        # Load checkpoint
        all_rewards, avg_rewards = self.load_checkpoint(trnOpts.checkpoint)

        self.multihead_net.to(device)
   
        n_iter = 0
        e = 0
        max_episodes = trnOpts.n_episodes
        max_steps = trnOpts.n_iterations

        while e < max_episodes:  # Looping episodes
            if max_steps > 0 and n_iter > max_steps:
                break
            curr_state = env.reset()
            if type(curr_state) is not torch.Tensor:
                curr_state = torch.from_numpy(curr_state).to(device).float()
            curr_state = curr_state.unsqueeze(0)
            episode_rewards = []
            step = 0
            while True:
                n_iter += 1
                step += 1
                # Collect experience
                # e < self.opts.n_episodes_exploring => This can be added too
                clustering = self.opts.clustering and e < self.opts.n_episodes_exploring and len(self.exp_buffer.clusters) > 0  # Cluster count being higher than 0 means that clustering has been done
                
                with torch.no_grad():
                    if clustering:
                        action = self.act_cluster(curr_state, e)
                    else:
                        action = self.act(curr_state)
                    next_state, reward, done, _ = env.step(action)
          
                    if self.opts.render:
                        env.render()
                    episode_rewards.append(reward)

                    if clustering:
                        self.exp_buffer.clusters[self.exp_buffer.last_cluster_id].add_action(action, reward)
                      
                    # Run clustering once enough samples have been collected
                    if self.opts.clustering and len(self.exp_buffer) > self.opts.cluster_samples \
                            and len(self.exp_buffer.clusters) == 0:  # An empty cluster list means clustering has not been done yet
                        print("Clustering")
                        self.exp_buffer.cluster(self.opts.n_clusters, self.opts.use_elbow_plot)

                if type(next_state) is not torch.Tensor:
                    next_state = torch.from_numpy(next_state).to(device).float()
                next_state = next_state.unsqueeze(0)

                self.exp_buffer.add_experience(curr_state.detach().cpu().squeeze(0), action, reward, next_state.detach().cpu().squeeze(0), done)   
    
                if done:
                   
                    if not self.exp_buffer.is_accumulated(self.opts.exp_batch_size) or (self.opts.clustering and len(self.exp_buffer.states) < self.opts.cluster_samples):
                        print("Accumulating buffer iteration: {}".format(n_iter))
                    
                    else:
                        episode_end_reward = np.array(episode_rewards).sum()
                        all_rewards.append(episode_end_reward)
                        e += 1  # Completed one more episode
                        avg_reward = np.mean(all_rewards[-100:])
                        avg_rewards.append(avg_reward)
                        print("({}/{}) - End of episode with total reward: {} - Avg Reward: {} Total Iter: {}".format(e, max_episodes, episode_end_reward, avg_reward, step))
                    break
                
                curr_state = next_state

                # Learn if enough data is accumulated
                if self.exp_buffer.is_accumulated(self.opts.exp_batch_size):
                    #self.update_params(n_iter, device)
                    start = time.time()
                    self.update_params(n_iter, device)
                    end = time.time()
                    print("Elapsed :{}".format(end-start))
                    
                if n_iter > 0 and self.opts.save_frequency > 0 and n_iter % self.opts.save_frequency == 0:
                    print("Saving at iteration {}".format(n_iter))
                    path = os.path.join(trnOpts.save_path, time.strftime("%Y%m%d-%H%M%S"))

                    self.save_model(path)
                    self.save_rewards(path, all_rewards, avg_rewards)
        
        return all_rewards, avg_rewards

    def save_model(self, PATH):
        # Check Path
        try:
            if not os.path.exists(PATH):
                os.mkdir(PATH)
            torch.save(self.multihead_net.state_dict(), os.path.join(PATH, "multihead"))
        except Exception as exc:
            print("Couldn't save model: {}".format(exc))
    
    def save_rewards(self, PATH, all_rewards, avg_rewards):
        try:
            if not os.path.exists(PATH):
                os.mkdir(PATH)
            torch.save(self.multihead_net.state_dict(), os.path.join(PATH, "multihead"))
            info = {"Rewards":all_rewards, "Averages":avg_rewards}
            with open(os.path.join(PATH, "rewards"), 'w') as fp:
                json.dump(info, fp)
        except Exception as exc:
            print("Couldn't save rewards: {}".format(exc))

    def load_model(self, PATH):
        state_dict  = torch.load(PATH)
        self.multihead_net.load_state_dict(state_dict)

    def reset(self):
        pass
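
For reference, the target computed in `update_params` is the soft Q backup q_hat = r + discount * (1 - done) * (min(Q1_target, Q2_target) - entropy_scale * log_prob). A small numeric illustration of the double-critic minimum and the entropy term, with invented values:

import torch

rewards = torch.tensor([[1.0], [0.5]])
done = torch.tensor([[0.0], [1.0]])
critic_1_target = torch.tensor([[2.0], [3.0]])
critic_2_target = torch.tensor([[1.5], [3.5]])
log_probs = torch.tensor([[-1.0], [-2.0]])
discount, entropy_scale = 0.99, 0.2

critic_target = torch.min(critic_1_target, critic_2_target)               # pessimistic double-critic estimate
target_value = (critic_target - entropy_scale * log_probs) * (1 - done)   # entropy bonus, masked at terminals
q_hat = rewards + discount * target_value
print(q_hat)  # [[2.6830], [0.5000]]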
Example #7
class DQAgent(Agent):
    def __init__(self, network, act_def: DiscreteDefinition, opts=DQAgentOpts()):
        super().__init__()
        self.network = network
        self.target_network = copy.deepcopy(network)
        polyak_update(self.target_network, self.network, 1)
        
        # Freeze target
        for p in self.target_network.parameters():
            p.requires_grad = False

        self.opts = opts
        self.act_def = act_def
        self.exp_buffer = ExperienceBuffer(self.opts.exp_buffer_size)
        self.exp_stack = StackedState(self.opts.exp_stack_size)
        self.epsilon = 1.0


    def act(self, new_state, device="cpu"):
        with torch.no_grad():
            if np.random.random() < self.epsilon:
                action = random.randint(0, len(self.act_def.samples)-1)
            else:
                action = np.argmax(self.network(torch.tensor(new_state).float().to(device).unsqueeze(0)).detach().cpu().numpy())
        return action
        
    def act_greedy(self, new_state):
        with torch.no_grad():
            net_out = self.network(new_state.unsqueeze(0))
            action = np.argmax(net_out.detach().cpu().numpy())
        return action

    def learn(self, env: Environment, max_episodes: int, max_steps: int):
        device = "cpu"
        if self.opts.use_gpu and torch.cuda.is_available():
            device = "cuda:0"
        self.network.to(device)
        self.target_network.to(device)
        self.reset()
        total_steps = 0
        optimizer = self.opts.optimizer(self.network.parameters(), self.opts.learning_rate)
        avg_rewards = []
        losses = []
        learning_complete = False
        episodes_passed = 0
        while not learning_complete:
            current_step = 0
            target_update_iter = 0
            episode_rewards = []
            curr_state = env.reset()
            action = 0
            if self.opts.use_exp_stack:
                curr_state = self.exp_stack.add_and_get(curr_state)
            #curr_state = torch.tensor(curr_state).to(device).float()
            if episodes_passed > max_episodes:
                learning_complete = True
                break
            while True:
                done = 0
                with torch.no_grad():  # Just collecting experience
                    for i in range(self.opts.exp_stack_size-1):
                        action = self.act(curr_state, device)
                    next_state, reward, done, _ = env.step(self.act_def[action])
                    self.exp_stack.add_state(next_state)
                    total_steps += 1 # Doesn't reset
                    next_state = self.exp_stack.get_stacked_states()
                    episode_rewards.append(reward)
                    self.exp_buffer.add_experience(curr_state, action, reward, next_state, done)
                    curr_state = next_state
                    if self.opts.render:
                        env.render()
        
                if done or current_step > max_steps:
                    self.reset()
                    total_episode_reward = np.array(episode_rewards).sum()
                    avg_rewards.append(total_episode_reward)
                    print("({}/{}) - End of episode with total reward: {} iteration: {} Memory Size: {}".format(episodes_passed, max_episodes, total_episode_reward, current_step, len(self.exp_buffer)))
                    break
                
                if self.exp_buffer.is_accumulated():
                    s_states, s_actions, s_rewards, s_next_states, s_done = \
                        self.exp_buffer.sample_numpy(self.opts.exp_batch_size)

                    # TODO: n-step Q-learning
                    optimizer.zero_grad()
                    with torch.no_grad():
                        s_next_states = torch.from_numpy(s_next_states).to(device).float()
                        s_done = torch.from_numpy(s_done).to(device).float()
                        s_rewards = torch.from_numpy(s_rewards).to(device).float()
                        next_state_vals = self.target_network(s_next_states)*(1-s_done.view(-1,1))  # Terminal states have V(s) = 0. That is why we use s_done
                        next_state_vals = next_state_vals*self.opts.discount  # Discount the reward
                        td_target = s_rewards + next_state_vals.max(1)[0].detach()  # In TD target, use target network (see Double Q learning)

                    #loss = -self.opts.loss(td_target, self.network(s_states))
                    s_states = torch.from_numpy(s_states).to(device).float()
                    s_actions = torch.from_numpy(s_actions).to(device).to(torch.int64)
                    curr_state_estimations = self.network(s_states).gather(1, s_actions.view(-1,1))
                    loss = torch.nn.functional.mse_loss(curr_state_estimations, td_target.unsqueeze(1))
                    loss.backward()
                    optimizer.step()

                    target_update_iter += 1
                    
                    losses.append(loss.item())
                    # Update target network
                    if target_update_iter > self.opts.target_update_freq:
                        target_update_iter = 0
                        polyak_update(self.target_network, self.network, 1)
                        print("Update target at step {}".format(total_steps))
                    
                if self.opts.verbose and total_steps%self.opts.verbose_frequency == 0 and len(losses) > 0:
                    print("Total Steps:{} - Loss:{} - Curr Epsilon:{}".format(total_steps, losses[-1], self.epsilon))
                current_step += 1  # Resets every episode
                
            if self.exp_buffer.is_accumulated():
                episodes_passed += 1  # Increment episode only if enough experience is collected
                
            self.epsilon = self.opts.min_epsilon + (self.opts.max_epsilon - self.opts.min_epsilon)*np.exp(-1.0*episodes_passed/self.opts.epsilon_decay) 

        return avg_rewards, losses

    def save_model(self, PATH):
        torch.save(self.network.state_dict(), PATH)

    def load_model(self, PATH):
        self.network.load_state_dict(torch.load(PATH))

    def reset(self):
        self.exp_stack.reset()
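
The exploration schedule at the end of `learn` is epsilon = min_epsilon + (max_epsilon - min_epsilon) * exp(-episodes_passed / epsilon_decay), updated once per completed episode. A quick standalone look at how it decays, using assumed option values (0.05, 1.0 and 50 are illustrative, not taken from DQAgentOpts):

import numpy as np

min_epsilon, max_epsilon, epsilon_decay = 0.05, 1.0, 50.0  # assumed values
for episode in (0, 10, 50, 100, 200):
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-1.0 * episode / epsilon_decay)
    print("episode {:>3}: epsilon = {:.3f}".format(episode, epsilon))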