def __init__(self,
                 num_steps,
                 num_processes,
                 obs_shape,
                 action_space,
                 recurrent_hidden_state_size,
                 norm_rew=False):
        """Allocate the per-step storage tensors and reset the write cursor.

        Every buffer is zero-filled except ``masks``, which starts at one.
        Buffers that hold one entry more than ``num_steps`` are ``obs``,
        ``recurrent_hidden_states``, ``value_preds``, ``returns`` and
        ``masks``.

        Args:
            num_steps: number of entries along the first buffer dimension.
            num_processes: size of the second dimension of every buffer.
            obs_shape: iterable with the trailing observation dimensions.
            action_space: space object. If its class is named ``'Discrete'``
                its ``n`` attribute is stored and actions are kept as a
                single ``long`` column; otherwise ``shape[0]`` is used as the
                action width and ``n_actions`` is ``None``.
            recurrent_hidden_state_size: width of the recurrent-state buffer.
            norm_rew: when truthy, a ``RunningMeanStd`` instance is created
                in ``self.ret_running_mean_std``.
        """
        # Shapes shared by several buffers.
        extended_len = num_steps + 1
        step_column = (num_steps, num_processes, 1)
        extended_column = (extended_len, num_processes, 1)

        self.num_steps = num_steps
        self.step = 0

        self.norm_rew = norm_rew
        if self.norm_rew:
            self.ret_running_mean_std = RunningMeanStd()

        self.obs = torch.zeros(extended_len, num_processes, *obs_shape)
        self.recurrent_hidden_states = torch.zeros(
            extended_len, num_processes, recurrent_hidden_state_size)
        self.rewards = torch.zeros(*step_column)
        self.action_log_probs = torch.zeros(*step_column)
        self.value_preds = torch.zeros(*extended_column)
        self.returns = torch.zeros(*extended_column)
        self.masks = torch.ones(*extended_column)

        # The space type is recognised by class name only.
        is_discrete = action_space.__class__.__name__ == 'Discrete'
        if is_discrete:
            self.n_actions = action_space.n
            self.actions = torch.zeros(num_steps, num_processes, 1).long()
        else:
            self.n_actions = None
            self.actions = torch.zeros(num_steps, num_processes,
                                       action_space.shape[0])
Esempio n. 2
0
    def __init__(self,
                 env: Any,
                 agent: Any,
                 save_interval: int = 1000,
                 train_episode: int = 10**9,
                 num_eval_episode: int = 3,
                 episode_len: int = 3000,
                 pre_step: int = 10000,
                 gamma: float = 0.995,
                 int_gamma: float = 0.995,
                 lam: float = 0.97,
                 device=torch.device('cpu'),
                 int_coef: float = 1,
                 ext_coef: float = 0.3,
                 eval_interval: int = 10**4,
                 seed: int = 0):
        """Store the run configuration and prepare environments and trackers.

        The training environment is seeded with ``seed``; a deep copy of it,
        seeded with ``2**31 - seed``, is kept as ``self.env_test``. The
        training environment is reset once here so that the shape of one
        observation can size ``self.obs_rms``.

        Args:
            env: environment exposing ``seed`` and ``reset``.
            agent: agent object, stored as given.
            save_interval: stored as ``self.save_interval``.
            train_episode: stored as ``self.train_episode``.
            num_eval_episode: stored as ``self.num_eval_episode``.
            episode_len: length of an episode.
            pre_step: number of steps used to measure variance of states.
            gamma: stored as ``self.gamma``.
            int_gamma: gamma for the intrinsic reward.
            lam: stored as ``self.lam``.
            device: torch device, stored as given.
            int_coef: coefficient of the intrinsic reward.
            ext_coef: coefficient of the extrinsic reward.
            eval_interval: stored as ``self.eval_interval``.
            seed: seed for the training environment.
        """
        # Scheduling intervals and episode budget.
        self.save_interval, self.eval_interval = save_interval, eval_interval
        self.episode_len = episode_len
        self.num_eval_episode = num_eval_episode
        self.train_episode = train_episode
        self.pre_step = pre_step

        # Discounting / reward-mixing hyper-parameters.
        self.device = device
        self.lam = lam
        self.gamma = gamma
        self.int_gamma = int_gamma
        self.int_coef = int_coef
        self.ext_coef = ext_coef

        # Progress counters and bookkeeping.
        self.global_step = 0
        self.step_in_episode = 0
        self.episode_so_far = 0
        self.reward_in_episode = 0.0
        self.returns = {'step': [], 'return': []}

        self.agent = agent

        # Environments: seed the training env first, then clone it for testing.
        self.env = env
        self.env.seed(seed)
        self.env_test = deepcopy(env)
        self.env_test.seed(2**31 - seed)

        # Running statistics; the observation tracker gets a leading axis of 1.
        self.reward_rms = RunningMeanStd()
        first_obs = self.env.reset()
        self.obs_rms = RunningMeanStd(shape=[1, *first_obs.shape])
Esempio n. 3
0
 def __init__(self,
              venv,
              ob=True,
              ret=True,
              clipob=10.,
              cliprew=10.,
              gamma=0.99,
              epsilon=1e-8):
     """Wrap ``venv`` and set up optional running statistics.

     Args:
         venv: wrapped environment; its ``observation_space`` and
             ``action_space`` are cached on this object.
         ob: when truthy, create a ``RunningMeanStd`` shaped like the
             observation space in ``self.ob_rms`` (otherwise ``None``).
         ret: when truthy, create a scalar ``RunningMeanStd`` in
             ``self.ret_rms`` (otherwise ``None``).
         clipob: stored as ``self.clipob``.
         cliprew: stored as ``self.cliprew``.
         gamma: stored as ``self.gamma``.
         epsilon: stored as ``self.epsilon``.
     """
     # venv is assigned first; self.num_envs (read below) is not set here
     # and may be derived from it by the enclosing class.
     self.venv = venv
     self._ob_space = venv.observation_space
     self._ac_space = venv.action_space

     if ob:
         self.ob_rms = RunningMeanStd(shape=self._ob_space.shape)
     else:
         self.ob_rms = None

     if ret:
         self.ret_rms = RunningMeanStd(shape=())
     else:
         self.ret_rms = None

     self.clipob, self.cliprew = clipob, cliprew
     # One accumulator slot per environment, starting at zero.
     self.ret = np.zeros(self.num_envs)
     self.gamma, self.epsilon = gamma, epsilon
Esempio n. 4
0
    def __init__(self, env, policy, rnd, replay_buffer, logger, args):
        """Keep references to the collaborators and read settings from ``args``.

        Args:
            env: environment object, stored as given.
            policy: policy model, stored as given.
            rnd: second model, stored as ``self.rnd``.
            replay_buffer: replay buffer, stored as given.
            logger: logger, stored as given.
            args: mapping that must contain the keys ``'use_encoder'``,
                ``'encoder_train_limit'``, ``'num_random_samples'`` and
                ``'log_rate'``.
        """
        # Environment and models.
        self.env, self.policy, self.rnd = env, policy, rnd
        # Utilities.
        self.replay_buffer, self.logger = replay_buffer, logger

        # Running statistics: observations use a fixed (84, 84, 1) shape,
        # rewards are scalar.
        self.obs_running_mean = RunningMeanStd((84, 84, 1))
        self.rew_running_mean = RunningMeanStd(())

        # Encoder-training bookkeeping starts out cleared.
        self.last_enc_loss = None
        self.train_enc_next_itr = False

        # Settings pulled from the args mapping.
        self.use_encoder = args['use_encoder']
        self.encoder_train_limit = args['encoder_train_limit']
        self.num_random_samples = args['num_random_samples']
        self.log_rate = args['log_rate']
Esempio n. 5
0
    def __init__(self,
                 gamma,
                 tau,
                 num_inputs,
                 action_space,
                 replay_size,
                 normalize_obs=True,
                 normalize_returns=False):
        """Pick the compute device and set up normalisers and replay memory.

        Args:
            gamma: stored as ``self.gamma``.
            tau: stored as ``self.tau``.
            num_inputs: stored, and used as the shape of the observation
                statistics.
            action_space: stored as given.
            replay_size: argument passed to ``ReplayMemory``.
            normalize_obs: when truthy, ``self.obs_rms`` is a
                ``RunningMeanStd``; otherwise it is ``None``.
            normalize_returns: when truthy, ``self.ret_rms`` is a
                ``RunningMeanStd`` and ``self.ret`` / ``self.cliprew`` are
                initialised; otherwise ``self.ret_rms`` is ``None``.
        """
        cuda_available = torch.cuda.is_available()
        self.device = torch.device('cuda' if cuda_available else 'cpu')
        if cuda_available:
            # cuDNN is switched off globally whenever CUDA is used.
            torch.backends.cudnn.enabled = False
            self.Tensor = torch.cuda.FloatTensor
        else:
            self.Tensor = torch.FloatTensor

        self.num_inputs, self.action_space = num_inputs, action_space
        self.gamma, self.tau = gamma, tau

        self.normalize_observations = normalize_obs
        self.normalize_returns = normalize_returns

        self.obs_rms = (RunningMeanStd(shape=num_inputs)
                        if self.normalize_observations else None)

        self.ret_rms = (RunningMeanStd(shape=1)
                        if self.normalize_returns else None)
        if self.normalize_returns:
            # These two attributes only exist when returns are normalised.
            self.ret = 0
            self.cliprew = 10.0

        self.memory = ReplayMemory(replay_size)
        # The networks are left unset by this constructor.
        self.actor = None
        self.actor_perturbed = None
Esempio n. 6
0
class DDPG:
    """DDPG-style agent with an actor, an adversary and a shared critic.

    Executed and trained actions can be the mixture
    ``(1 - alpha) * actor(s) + alpha * adversary(s)``. When ``train_mode`` is
    set, target copies plus ``*_bar`` / ``*_outer`` copies of both policies
    are created together with the optimizers; otherwise only ``actor`` and
    ``adversary`` exist.
    """

    def __init__(self, beta, epsilon, learning_rate, gamma, tau, hidden_size_dim0, hidden_size_dim1, num_inputs, action_space, train_mode, alpha, replay_size,
                 optimizer, two_player, normalize_obs=True, normalize_returns=False, critic_l2_reg=1e-2):
        """Build the networks, optimizers, normalisers and replay memory.

        Args:
            beta: rate passed to ``sgld_update`` by the SGLD averaging steps.
            epsilon: ``noise`` argument of the ``SGLD`` optimizer.
            learning_rate: stored as ``self.learning_rate``.
            gamma: discount used in the critic target.
            tau: rate passed to ``soft_update`` for the target networks.
            hidden_size_dim0: first size argument of ``Actor`` / ``Critic``.
            hidden_size_dim1: second size argument of ``Actor`` / ``Critic``.
            num_inputs: input size of the networks and shape of the
                observation statistics.
            action_space: passed to the network constructors.
            train_mode: when truthy, build targets, critic and optimizers.
            alpha: mixing weight of the adversary's action.
            replay_size: argument passed to ``ReplayMemory``.
            optimizer: ``'SGLD'``, ``'RMSprop'``, or anything else for
                ``ExtraAdam``.
            two_player: together with ``optimizer == 'SGLD'`` selects the
                ``*_outer`` policies and the SGLD averaging steps.
            normalize_obs: keep running observation statistics.
            normalize_returns: keep running return statistics and clip
                normalised rewards to ``[-10, 10]``.
            critic_l2_reg: weight decay of the critic's ``Adam`` optimizer.
        """
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
            # cuDNN is switched off globally whenever CUDA is used.
            torch.backends.cudnn.enabled = False
            self.Tensor = torch.cuda.FloatTensor
        else:
            self.device = torch.device('cpu')
            self.Tensor = torch.FloatTensor

        self.alpha = alpha
        self.train_mode = train_mode

        self.num_inputs = num_inputs
        self.action_space = action_space
        self.critic_l2_reg = critic_l2_reg

        def new_actor():
            return Actor(hidden_size_dim0, hidden_size_dim1,
                         self.num_inputs, self.action_space).to(self.device)

        def new_critic():
            return Critic(hidden_size_dim0, hidden_size_dim1,
                          self.num_inputs, self.action_space).to(self.device)

        # The construction order below is deliberately fixed: it determines
        # how a seeded RNG is consumed during weight initialisation.
        self.actor = new_actor()
        self.adversary = new_actor()
        if self.train_mode:
            self.actor_target = new_actor()
            self.actor_bar = new_actor()
            self.actor_outer = new_actor()
            self.actor_optim = self._make_policy_optimizer(
                self.actor.parameters(), optimizer, epsilon)

            self.critic = new_critic()
            self.critic_target = new_critic()
            self.critic_optim = Adam(self.critic.parameters(), lr=1e-3,
                                     weight_decay=critic_l2_reg)

            self.adversary_target = new_actor()
            self.adversary_bar = new_actor()
            self.adversary_outer = new_actor()
            self.adversary_optim = self._make_policy_optimizer(
                self.adversary.parameters(), optimizer, epsilon)

            # Make sure every target starts with its source's weights.
            hard_update(self.adversary_target, self.adversary)
            hard_update(self.actor_target, self.actor)
            hard_update(self.critic_target, self.critic)

        self.gamma = gamma
        self.tau = tau
        self.beta = beta
        self.epsilon = epsilon
        # NOTE(review): stored only; the optimizers above use fixed rates
        # (1e-4 for the policies, 1e-3 for the critic).
        self.learning_rate = learning_rate
        self.normalize_observations = normalize_obs
        self.normalize_returns = normalize_returns
        self.optimizer = optimizer
        self.two_player = two_player

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd(shape=num_inputs)
        else:
            self.obs_rms = None

        if self.normalize_returns:
            self.ret_rms = RunningMeanStd(shape=1)
            self.ret = 0
            self.cliprew = 10.0
        else:
            self.ret_rms = None

        self.memory = ReplayMemory(replay_size)

    @staticmethod
    def _make_policy_optimizer(params, optimizer, epsilon):
        """Return the optimizer named by ``optimizer`` for a policy network."""
        if optimizer == 'SGLD':
            return SGLD(params, lr=1e-4, noise=epsilon, alpha=0.999)
        if optimizer == 'RMSprop':
            return RMSprop(params, lr=1e-4, alpha=0.999)
        return ExtraAdam(params, lr=1e-4)

    def _use_outer_policies(self):
        """Tell whether the ``*_outer`` policies are the ones to query."""
        return self.optimizer == 'SGLD' and self.two_player

    def eval(self):
        """Put the actor, the adversary and (in train mode) the critic in eval mode."""
        self.actor.eval()
        self.adversary.eval()
        if self.train_mode:
            self.critic.eval()

    def train(self):
        """Put the actor, the adversary and (in train mode) the critic in training mode."""
        self.actor.train()
        self.adversary.train()
        if self.train_mode:
            self.critic.train()

    def select_action(self, state, action_noise=None, param_noise=None, mdp_type='mdp'):
        """Compute the action for ``state``.

        Args:
            state: observation tensor; it is normalised with ``self.obs_rms``.
            action_noise: optional callable whose result is added to the
                actor output before clamping.
            param_noise: unused.
            mdp_type: ``'mdp'`` returns the clamped actor output; any other
                value returns the actor/adversary mixture weighted by
                ``alpha``.

        Returns:
            Detached action tensor.
        """
        state = normalize(Variable(state).to(self.device), self.obs_rms, self.device)

        # NOTE(review): the *_outer networks are only created in train mode.
        use_outer = self._use_outer_policies()

        actor_net = self.actor_outer if use_outer else self.actor
        mu = actor_net(state).data
        if action_noise is not None:
            mu += self.Tensor(action_noise()).to(self.device)

        if mdp_type == 'mdp':
            return mu.clamp(-1, 1)

        # Each player's action is clamped separately before mixing.
        mu = mu.clamp(-1, 1) * (1 - self.alpha)
        adversary_net = self.adversary_outer if use_outer else self.adversary
        mu += adversary_net(state).data.clamp(-1, 1) * self.alpha
        return mu

    def _update_critic(self, state_batch, action_batch, reward_batch, mask_batch,
                       next_state_batch, robust):
        """Run one critic step on the TD error and return the loss as a float.

        With ``robust`` set, the next action is the alpha-mixture of both
        target policies; otherwise it is the target actor's action.
        """
        # The target is a constant of the regression: computing it without
        # autograd avoids building a graph through the target networks and
        # accumulating gradients on them that are never used.
        with torch.no_grad():
            next_action_batch = self.actor_target(next_state_batch)
            if robust:
                next_action_batch = (1 - self.alpha) * next_action_batch \
                    + self.alpha * self.adversary_target(next_state_batch)
            next_state_action_values = self.critic_target(next_state_batch, next_action_batch)
            expected_state_action_batch = reward_batch + self.gamma * mask_batch * next_state_action_values

        self.critic_optim.zero_grad()
        state_action_batch = self.critic(state_batch, action_batch)
        value_loss = F.mse_loss(state_action_batch, expected_state_action_batch)
        value_loss.backward()
        self.critic_optim.step()
        return value_loss.item()

    def _update_adversary(self, state_batch, next_state_batch, actor_net):
        """Run one adversary step against a frozen ``actor_net``; return the loss.

        The adversary's loss is the mean critic value of the mixed action.
        """
        self.adversary_optim.zero_grad()
        with torch.no_grad():
            real_action = actor_net(next_state_batch)
        # NOTE(review): the action comes from next_state_batch but the critic
        # is evaluated at state_batch; kept as found, confirm it is intended.
        action = (1 - self.alpha) * real_action + self.alpha * self.adversary(next_state_batch)
        adversary_loss = self.critic(state_batch, action).mean()
        adversary_loss.backward()
        self.adversary_optim.step()
        return adversary_loss.item()

    def _update_actor(self, state_batch, next_state_batch, adversary_net=None):
        """Run one actor step and return the loss as a float.

        The actor's loss is the negated mean critic value. When
        ``adversary_net`` is given, its frozen action is mixed in with
        weight ``alpha``; otherwise the actor's own action is used.
        """
        self.actor_optim.zero_grad()
        if adversary_net is None:
            action = self.actor(next_state_batch)
        else:
            with torch.no_grad():
                adversary_action = adversary_net(next_state_batch)
            action = (1 - self.alpha) * self.actor(next_state_batch) + self.alpha * adversary_action
        # NOTE(review): same next_state/state pairing as in _update_adversary.
        policy_loss = (-self.critic(state_batch, action)).mean()
        policy_loss.backward()
        self.actor_optim.step()
        return policy_loss.item()

    def update_robust_non_flip(self, state_batch, action_batch, reward_batch, mask_batch, next_state_batch,
                      mdp_type, robust_update_type):
        """Update the critic (if ``'full'``), then the adversary, then the actor.

        Args:
            state_batch, action_batch, reward_batch, mask_batch,
            next_state_batch: sampled transition tensors.
            mdp_type: unused here.
            robust_update_type: the critic is trained only when this is
                ``'full'``; otherwise the value loss is reported as 0.

        Returns:
            Tuple ``(value_loss, policy_loss, adversary_loss)``.
        """
        if robust_update_type == 'full':
            value_loss = self._update_critic(state_batch, action_batch, reward_batch,
                                             mask_batch, next_state_batch, robust=True)
        else:
            value_loss = 0

        # Each player trains against the other's outer or target policy.
        use_outer = self._use_outer_policies()
        adversary_loss = self._update_adversary(
            state_batch, next_state_batch,
            self.actor_outer if use_outer else self.actor_target)
        policy_loss = self._update_actor(
            state_batch, next_state_batch,
            self.adversary_outer if use_outer else self.adversary_target)

        return value_loss, policy_loss, adversary_loss

    def update_robust_flip(self, state_batch, action_batch, reward_batch, mask_batch, next_state_batch, adversary_update,
                      mdp_type, robust_update_type):
        """Update the critic (if ``'full'``) and exactly one of the two players.

        Args:
            state_batch, action_batch, reward_batch, mask_batch,
            next_state_batch: sampled transition tensors.
            adversary_update: truthy trains the adversary, falsy the actor;
                the loss of the player that is not trained is reported as 0.
            mdp_type: unused here.
            robust_update_type: the critic is trained only when this is
                ``'full'``.

        Returns:
            Tuple ``(value_loss, policy_loss, adversary_loss)``.
        """
        if robust_update_type == 'full':
            value_loss = self._update_critic(state_batch, action_batch, reward_batch,
                                             mask_batch, next_state_batch, robust=True)
        else:
            value_loss = 0

        if adversary_update:
            adversary_loss = self._update_adversary(state_batch, next_state_batch, self.actor_target)
            policy_loss = 0
        else:
            policy_loss = self._update_actor(state_batch, next_state_batch, self.adversary_target)
            adversary_loss = 0

        return value_loss, policy_loss, adversary_loss

    def update_non_robust(self, state_batch, action_batch, reward_batch, mask_batch, next_state_batch):
        """Update the critic and the actor without involving the adversary.

        Returns:
            Tuple ``(value_loss, policy_loss, 0)``.
        """
        value_loss = self._update_critic(state_batch, action_batch, reward_batch,
                                         mask_batch, next_state_batch, robust=False)
        policy_loss = self._update_actor(state_batch, next_state_batch)
        adversary_loss = 0

        return value_loss, policy_loss, adversary_loss

    def store_transition(self, state, action, mask, next_state, reward):
        """Push a batch of transitions into memory and refresh the running statistics.

        All arguments are indexed along their first dimension; ``state``
        provides the batch size.
        """
        batch_len = state.shape[0]
        for b in range(batch_len):
            self.memory.push(state[b], action[b], mask[b], next_state[b], reward[b])
            if self.normalize_observations:
                self.obs_rms.update(state[b].cpu().numpy())
            if self.normalize_returns:
                # A single discounted accumulator is shared by the whole batch.
                self.ret = self.ret * self.gamma + reward[b]
                self.ret_rms.update(np.array([self.ret]))
                if mask[b] == 0:  # if terminal is True
                    self.ret = 0

    def update_parameters(self, batch_size, sgld_outer_update, mdp_type='mdp', exploration_method='mdp'):
        """Sample a batch from memory and run the matching update steps.

        Args:
            batch_size: number of transitions to sample.
            sgld_outer_update: when truthy (and SGLD two-player mode is on),
                also run the outer SGLD averaging step.
            mdp_type: anything other than ``'mdp'`` selects the ``'full'``
                robust update.
            exploration_method: with ``mdp_type == 'mdp'``, anything other
                than ``'mdp'`` selects the ``'adversary'`` robust update.

        Returns:
            Tuple ``(value_loss, policy_loss, adversary_loss)``, each summed
            over the update steps that ran.
        """
        transitions = self.memory.sample(batch_size)
        batch = Transition(*zip(*transitions))

        if mdp_type != 'mdp':
            robust_update_type = 'full'
        elif exploration_method != 'mdp':
            robust_update_type = 'adversary'
        else:
            robust_update_type = None

        state_batch = normalize(Variable(torch.stack(batch.state)).to(self.device), self.obs_rms, self.device)
        action_batch = Variable(torch.stack(batch.action)).to(self.device)
        reward_batch = normalize(Variable(torch.stack(batch.reward)).to(self.device).unsqueeze(1), self.ret_rms, self.device)
        mask_batch = Variable(torch.stack(batch.mask)).to(self.device).unsqueeze(1)
        next_state_batch = normalize(Variable(torch.stack(batch.next_state)).to(self.device), self.obs_rms, self.device)

        if self.normalize_returns:
            reward_batch = torch.clamp(reward_batch, -self.cliprew, self.cliprew)

        value_loss = 0
        policy_loss = 0
        adversary_loss = 0

        if robust_update_type is not None:
            _value_loss, _policy_loss, _adversary_loss = self.update_robust_non_flip(
                state_batch, action_batch, reward_batch, mask_batch, next_state_batch,
                mdp_type, robust_update_type)
            value_loss += _value_loss
            policy_loss += _policy_loss
            adversary_loss += _adversary_loss

        # Unless the robust step already trained the critic ('full'), run the
        # plain critic/actor update as well.
        if robust_update_type != 'full':
            _value_loss, _policy_loss, _adversary_loss = self.update_non_robust(
                state_batch, action_batch, reward_batch, mask_batch, next_state_batch)
            value_loss += _value_loss
            policy_loss += _policy_loss
            adversary_loss += _adversary_loss

        if self._use_outer_policies():
            self.sgld_inner_update()
        self.soft_update()
        if sgld_outer_update and self._use_outer_policies():
            self.sgld_outer_update()

        return value_loss, policy_loss, adversary_loss

    def initialize(self):
        """Copy the outer policies into the ``*_bar`` and the live policies."""
        hard_update(self.actor_bar, self.actor_outer)
        hard_update(self.adversary_bar, self.adversary_outer)
        hard_update(self.actor, self.actor_outer)
        hard_update(self.adversary, self.adversary_outer)

    def sgld_inner_update(self):  # target source
        """Apply ``sgld_update`` from the live policies to the ``*_bar`` policies."""
        sgld_update(self.actor_bar, self.actor, self.beta)
        sgld_update(self.adversary_bar, self.adversary, self.beta)

    def sgld_outer_update(self):  # target source
        """Apply ``sgld_update`` from the ``*_bar`` policies to the ``*_outer`` policies."""
        sgld_update(self.actor_outer, self.actor_bar, self.beta)
        sgld_update(self.adversary_outer, self.adversary_bar, self.beta)

    def soft_update(self):
        """Apply ``soft_update`` with rate ``tau`` to the three target networks."""
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.adversary_target, self.adversary, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
Esempio n. 7
0
    def __init__(self,
                 action_dim,
                 state_dim,
                 buffer_size=1000000,
                 action_samples=10,
                 mode='linear',
                 beta=1,
                 tau=5e-3,
                 q_normalization=0.01,
                 gamma=0.99,
                 normalize_obs=False,
                 normalize_rewards=False,
                 batch_size=64,
                 actor='AIQN',
                 *args,
                 **kwargs):
        """
        Agent class to generate a stochastic policy.
        Args:
            action_dim (int): action dimension
            state_dim (int): state dimension
            buffer_size (int): how much memory is allocated to the ReplayMemoryClass
            action_samples (int): originally labelled K in the paper, represents how many
                actions should be sampled from the memory buffer
            mode (string): name of the variant used for the target distribution
            beta (float): value used in boltzmann distribution
            tau (float): update rate parameter
            batch_size (int): batch size
            q_normalization (float): q value normalization rate
            gamma (float): value used in critic training
            normalize_obs (boolean): boolean to indicate that you want to normalize
                observations
            normalize_rewards (boolean): boolean to indicate that you want to normalize
                return values (usually done for numerical stability)
            actor (string): type of actor to use, either 'IQN' or 'AIQN'
            *args, **kwargs: accepted and ignored
        Raises:
            ValueError: if ``actor`` is neither 'IQN' nor 'AIQN'
        """
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.action_samples = action_samples
        self.mode = mode
        self.beta = beta
        self.tau = tau
        self.batch_size = batch_size
        self.step = 0

        # normalization
        self.normalize_observations = normalize_obs
        self.q_normalization = q_normalization
        self.normalize_rewards = normalize_rewards

        # Actor
        # type of actor being used
        if actor == 'IQN':
            self.actor = StochasticActor(self.state_dim, self.action_dim,
                                         'source')
            self.target_actor = StochasticActor(self.state_dim,
                                                self.action_dim, 'target')
        elif actor == 'AIQN':
            self.actor = AutoRegressiveStochasticActor(self.state_dim,
                                                       self.action_dim)
            self.target_actor = AutoRegressiveStochasticActor(
                self.state_dim, self.action_dim)
        else:
            # Without this guard an unknown name left self.actor unset and
            # only failed a few lines below with an opaque AttributeError.
            raise ValueError(
                "actor must be 'IQN' or 'AIQN', got %r" % (actor,))

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd(shape=self.state_dim)
        else:
            self.obs_rms = None

        if self.normalize_rewards:
            self.ret_rms = RunningMeanStd(shape=1)
            self.ret = 0
        else:
            self.ret_rms = None

        # Zero batches used to call every network once before the first
        # parameter copy below.
        zero_states = tf.zeros([self.batch_size, self.state_dim])
        zero_actions = tf.zeros([self.batch_size, self.action_dim])

        # initialize trainable variables
        self.actor(zero_states, zero_actions)
        self.target_actor(zero_states, zero_actions)

        # Critic
        self.critics = Critic(self.state_dim, self.action_dim, 'source')
        self.target_critics = Critic(self.state_dim, self.action_dim, 'target')

        # initialize trainable variables for critics
        self.critics(zero_states, zero_actions)
        self.target_critics(zero_states, zero_actions)

        # Value
        self.value = Value(self.state_dim, 'source')
        self.target_value = Value(self.state_dim, 'target')

        # initialize value training variables
        # The target network must be called too (the source network used to
        # be called twice instead), mirroring the actor and critic pairs.
        self.value(zero_states)
        self.target_value(zero_states)

        # initialize the target networks.
        update(self.target_actor, self.actor, 1.0)
        update(self.target_critics, self.critics, 1.0)
        update(self.target_value, self.value, 1.0)

        self.replay = ReplayBuffer(self.state_dim, self.action_dim,
                                   self.buffer_size)
        self.action_sampler = ActionSampler(self.actor.action_dim)
Esempio n. 8
0
class GACAgent:
    """
    GAC agent.
    Action is always from -1 to 1 in each dimension.
    """
    def __init__(self,
                 action_dim,
                 state_dim,
                 buffer_size=1000000,
                 action_samples=10,
                 mode='linear',
                 beta=1,
                 tau=5e-3,
                 q_normalization=0.01,
                 gamma=0.99,
                 normalize_obs=False,
                 normalize_rewards=False,
                 batch_size=64,
                 actor='AIQN',
                 *args,
                 **kwargs):
        """
        Agent class to generate a stochastic policy.
        Args:
            action_dim (int): action dimension
            state_dim (int): state dimension
            buffer_size (int): how much memory is allocated to the ReplayMemoryClass
            action_samples (int): originally labelled K in the paper, represents how many
                actions should be sampled from the memory buffer
            mode (string): name of the variant used for the target distribution
            beta (float): value used in boltzmann distribution
            tau (float): update rate parameter
            batch_size (int): batch size
            q_normalization (float): q value normalization rate
            gamma (float): value used in critic training
            normalize_obs (boolean): boolean to indicate that you want to normalize
                observations
            normalize_rewards (boolean): boolean to indicate that you want to normalize
                return values (usually done for numerical stability)
            actor (string): type of actor to use, either 'IQN' or 'AIQN'
            *args, **kwargs: accepted and ignored
        Raises:
            ValueError: if ``actor`` is neither 'IQN' nor 'AIQN'
        """
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.action_samples = action_samples
        self.mode = mode
        self.beta = beta
        self.tau = tau
        self.batch_size = batch_size
        self.step = 0

        # normalization
        self.normalize_observations = normalize_obs
        self.q_normalization = q_normalization
        self.normalize_rewards = normalize_rewards

        # Actor
        # type of actor being used
        if actor == 'IQN':
            self.actor = StochasticActor(self.state_dim, self.action_dim,
                                         'source')
            self.target_actor = StochasticActor(self.state_dim,
                                                self.action_dim, 'target')
        elif actor == 'AIQN':
            self.actor = AutoRegressiveStochasticActor(self.state_dim,
                                                       self.action_dim)
            self.target_actor = AutoRegressiveStochasticActor(
                self.state_dim, self.action_dim)
        else:
            # Without this guard an unknown name left self.actor unset and
            # only failed a few lines below with an opaque AttributeError.
            raise ValueError(
                "actor must be 'IQN' or 'AIQN', got %r" % (actor,))

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd(shape=self.state_dim)
        else:
            self.obs_rms = None

        if self.normalize_rewards:
            self.ret_rms = RunningMeanStd(shape=1)
            self.ret = 0
        else:
            self.ret_rms = None

        # Zero batches used to call every network once before the first
        # parameter copy below.
        zero_states = tf.zeros([self.batch_size, self.state_dim])
        zero_actions = tf.zeros([self.batch_size, self.action_dim])

        # initialize trainable variables
        self.actor(zero_states, zero_actions)
        self.target_actor(zero_states, zero_actions)

        # Critic
        self.critics = Critic(self.state_dim, self.action_dim, 'source')
        self.target_critics = Critic(self.state_dim, self.action_dim, 'target')

        # initialize trainable variables for critics
        self.critics(zero_states, zero_actions)
        self.target_critics(zero_states, zero_actions)

        # Value
        self.value = Value(self.state_dim, 'source')
        self.target_value = Value(self.state_dim, 'target')

        # initialize value training variables
        # The target network must be called too (the source network used to
        # be called twice instead), mirroring the actor and critic pairs.
        self.value(zero_states)
        self.target_value(zero_states)

        # initialize the target networks.
        update(self.target_actor, self.actor, 1.0)
        update(self.target_critics, self.critics, 1.0)
        update(self.target_value, self.value, 1.0)

        self.replay = ReplayBuffer(self.state_dim, self.action_dim,
                                   self.buffer_size)
        self.action_sampler = ActionSampler(self.actor.action_dim)

    def train_one_step(self):
        """
        Run a single training iteration over all networks.

        A batch is drawn from the replay buffer, the critic and value
        networks are trained on it, and the actor is trained only if at least
        one positive-advantage sample comes back. Afterwards the three target
        networks are updated with rate ``tau``, one loss scalar per network
        is written to its summary writer, and the step counter advances.
        Args:
            None
        Returns:
            None
        """
        # draw a batch from the replay buffer and normalize where configured
        batch = self.replay.sample_batch(self.batch_size)
        obs = normalize(batch.s, self.obs_rms)
        taken_actions = batch.a
        rewards = normalize(batch.r, self.ret_rms)
        next_obs = normalize(batch.sp, self.obs_rms)
        done_mask = batch.it

        # critic first, then value
        self.critics.train(obs, taken_actions, rewards, next_obs, done_mask,
                           self.target_value, self.gamma,
                           self.q_normalization)
        self.value.train(obs, self.target_actor, self.target_critics,
                         self.action_samples)

        # the actor only learns from samples whose advantage is positive
        good_states, good_actions, good_advantages = (
            self._sample_positive_advantage_actions(obs))
        num_good = good_advantages.shape[0]
        if num_good:
            self.actor.train(good_states, good_actions, good_advantages,
                             self.mode, self.beta)

        # move every target network towards its source network
        target_source_pairs = (
            (self.target_actor, self.actor),
            (self.target_critics, self.critics),
            (self.target_value, self.value),
        )
        for target_net, source_net in target_source_pairs:
            update(target_net, source_net, self.tau)

        # one loss scalar per network, each through that network's own writer
        logged_networks = (
            ('actor loss', self.actor),
            ('critic loss', self.critics),
            ('value loss', self.value),
        )
        for tag, network in logged_networks:
            with network.train_summary_writer.as_default():
                tf.summary.scalar(tag,
                                  network.train_loss.result(),
                                  step=self.step)

        self.step += 1

    def _sample_positive_advantage_actions(self, states):
        """
        Build candidate (state, action) pairs and keep the advantageous ones.

        Every input state is tiled ``self.action_samples`` times. For each tiled
        state two candidate actions are produced: one from the target actor
        (with small Gaussian noise added, then clipped to [-1, 1]) and one drawn
        uniformly between -1 and 1. A candidate is kept when the target critics'
        value for it is greater than the target value network's estimate for
        the state.
        Args:
            states (tf.Variable): states of dimension (batch_size, state_dim)
        Returns:
            good_states: states of the retained candidates
            good_actions: actions of the retained candidates
            advantages: Q - V of the retained candidates
        """
        # Repeat the batch so each state gets self.action_samples candidates.
        repeated_states = tf.tile(states, [self.action_samples, 1])

        # Candidate set 1: target-policy actions, perturbed for regularization.
        policy_actions = self.action_sampler.get_actions(
            self.target_actor, repeated_states)
        policy_actions += tf.random.normal(policy_actions.shape) * 0.01
        policy_actions = tf.clip_by_value(policy_actions, -1, 1)
        policy_q = self.target_critics(repeated_states, policy_actions)

        # Candidate set 2: actions drawn uniformly over the action range.
        uniform_actions = tf.random.uniform(policy_actions.shape,
                                            minval=-1.0,
                                            maxval=1.0)
        uniform_q = self.target_critics(repeated_states, uniform_actions)

        # State-value baseline, shared by both candidate sets.
        state_value = self.target_value(repeated_states)

        # Stack both candidate sets; states and values are duplicated so that
        # every row of the stacked tensors refers to the same candidate.
        candidate_states = tf.concat([repeated_states, repeated_states], 0)
        candidate_actions = tf.concat([policy_actions, uniform_actions], 0)
        q_values = tf.concat([policy_q, uniform_q], 0)
        v_values = tf.concat([state_value, state_value], 0)

        # Row indices of the candidates whose Q exceeds V.
        keep = tf.where(tf.squeeze(q_values) > tf.squeeze(v_values))

        return (tf.gather_nd(candidate_states, keep),
                tf.gather_nd(candidate_actions, keep),
                tf.gather_nd(q_values - v_values, keep))

    def get_action(self, states):
        """
        Sample actions for a batch of states from the current actor.
        Args:
            states (tf.Variable): dimensions (batch_size, state_dim)
        Returns:
            whatever ``self.action_sampler.get_actions`` returns for the actor
            and the given states
        """
        sampler = self.action_sampler
        policy = self.actor
        return sampler.get_actions(policy, states)

    def select_perturbed_action(self, state, action_noise=None):
        """
        Select an action for a raw state, optionally adding action noise.

        The state is converted to a float32 ``tf.Variable``, passed through
        ``normalize`` with ``self.obs_rms`` and fed to the action sampler with
        the current actor. The resulting action is clipped to [-1, 1].
        Args:
            state: raw (un-normalized) state, convertible to a tf.Variable
            action_noise (callable, optional): zero-argument function whose
                return value is added to the sampled action; skipped when None
        Returns:
            the clipped action
        """
        observation = tf.Variable(state, dtype=tf.float32)
        observation = normalize(observation, self.obs_rms)

        action = self.action_sampler.get_actions(self.actor, observation)
        if action_noise is not None:
            noise = tf.Variable(action_noise(), dtype=tf.float32)
            action += noise

        return tf.clip_by_value(action, -1, 1)

    def store_transition(self, state, action, reward, next_state, is_done):
        """
        Record one transition and refresh the running normalization statistics.

        The transition is always written to the replay buffer. The observation
        statistics are updated with ``state`` when ``self.normalize_observations``
        is set. When ``self.normalize_rewards`` is set, the discounted running
        return ``self.ret`` is advanced by ``reward``, fed to ``self.ret_rms``
        and reset to 0 if the transition is terminal.
        Args:
            state: state the action was taken in
            action: action taken
            reward: reward returned by the environment
            next_state: state reached after the action
            is_done: truthy when the transition ended the episode
        """
        self.replay.store(state, action, reward, next_state, is_done)

        if self.normalize_observations:
            self.obs_rms.update(state)

        if not self.normalize_rewards:
            return

        # Discounted return accumulated over the current episode so far.
        discounted_return = self.ret * self.gamma + reward
        self.ret = discounted_return
        self.ret_rms.update(np.array([discounted_return]))
        if is_done:
            self.ret = 0
Esempio n. 9
0
def main():
    """Train an RND agent on the CarIntersect environment.

    The run name and the torch device are read from the ``NAME`` and
    ``DEVICE`` environment variables; every hyper-parameter comes from
    ``default_config``. The function starts ``NumEnv`` ``AtariEnvironment``
    workers connected through pipes, warms up the observation normalizer with
    uniformly random actions, and then loops forever: collect a
    ``NumStep``-step rollout, build extrinsic/intrinsic targets and advantages,
    train the agent, and run an evaluation every 100 updates.

    Raises:
        ValueError: if ``NAME`` or ``DEVICE`` is not set in the environment.
    """
    run_name = os.environ.get('NAME')
    if run_name is None:
        raise ValueError('set NAME via env variable')
    # Validate DEVICE up front so a missing variable fails before a wandb run
    # is registered or any worker is started.
    device = os.environ.get('DEVICE')
    if device is None:
        raise ValueError('set DEVICE via env variable')

    # The environment config may be JSON or YAML: try JSON first, then YAML.
    config_path = default_config['CarIntersectConfigPath']
    with open(config_path, 'r') as config_file:
        raw_settings = config_file.read()
    try:
        env_settings = json.loads(raw_settings)
    except ValueError:
        # json.JSONDecodeError is a ValueError. safe_load is used because
        # yaml.load without an explicit Loader is rejected by recent PyYAML
        # and the config only needs plain data types.
        env_settings = yaml.safe_load(raw_settings)

    if 'home-test' not in run_name:
        wandb.init(
            project='CarRacing_RND',
            reinit=True,
            name=f'rnd_{run_name}',
            config={'env_config': env_settings, 'agent_config': default_config},
        )

    train_method = default_config['TrainMethod']
    env_id = default_config['EnvID']

    # NOTE(review): the seed is only printed below; nothing in this function
    # passes it to numpy, torch or the environments.
    seed = np.random.randint(0, 2 ** 16 - 1)

    print(f'use name : {run_name}')
    print(f"use env config : {config_path}")
    print(f'use seed : {seed}')
    print(f"use device : {device}")

    # The environment is built from the parent directory; the working
    # directory is switched back to 'rnd_continues' once workers are started.
    os.chdir('..')
    env = makeCarIntersect(env_settings)
    eval_env = create_eval_env(makeCarIntersect(env_settings))

    input_size = env.observation_space.shape
    assert isinstance(env.action_space, gym.spaces.Box)
    action_size = env.action_space.shape[0]  # 2

    # This instance was only needed to read the space definitions.
    env.close()

    is_load_model = True
    is_render = False
    # Model persistence is currently disabled:
    # model_path = 'models/{}.model'.format(NAME)
    # predictor_path = 'models/{}.pred'.format(NAME)
    # target_path = 'models/{}.target'.format(NAME)

    use_cuda = default_config.getboolean('UseGPU')
    use_gae = default_config.getboolean('UseGAE')
    use_noisy_net = default_config.getboolean('UseNoisyNet')

    lam = float(default_config['Lambda'])
    num_worker = int(default_config['NumEnv'])

    num_step = int(default_config['NumStep'])

    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    int_gamma = float(default_config['IntGamma'])
    clip_grad_norm = float(default_config['ClipGradNorm'])
    ext_coef = float(default_config['ExtCoef'])
    int_coef = float(default_config['IntCoef'])

    sticky_action = default_config.getboolean('StickyAction')
    action_prob = float(default_config['ActionProb'])
    life_done = default_config.getboolean('LifeDone')

    # Running statistics: reward_rms scales intrinsic rewards, obs_rms
    # standardizes the single-frame observations fed to the RND networks.
    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    pre_obs_norm_step = int(default_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(int_gamma)

    agent = RNDAgent(
        input_size,
        action_size,
        num_worker,
        num_step,
        gamma,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=batch_size,
        ppo_eps=ppo_eps,
        use_cuda=use_cuda,
        use_gae=use_gae,
        use_noisy_net=use_noisy_net,
        device=device,
    )

    # if is_load_model:
    #     print('load model...')
    #     if use_cuda:
    #         agent.model.load_state_dict(torch.load(model_path))
    #         agent.rnd.predictor.load_state_dict(torch.load(predictor_path))
    #         agent.rnd.target.load_state_dict(torch.load(target_path))
    #     else:
    #         agent.model.load_state_dict(torch.load(model_path, map_location='cpu'))
    #         agent.rnd.predictor.load_state_dict(torch.load(predictor_path, map_location='cpu'))
    #         agent.rnd.target.load_state_dict(torch.load(target_path, map_location='cpu'))
    #     print('load finished!')

    def eval_action(eval_state):
        """Return the agent's action for one evaluation observation."""
        # The observation is tiled 4x along axis 1 to match the 4-channel
        # input the agent is fed during training, then scaled by 1/255.
        stacked = np.tile(np.float32(eval_state), (1, 4, 1, 1)) / 255.
        return agent.get_action(stacked)[0][0].cpu().numpy()

    # One worker per environment, each talking to this process over a pipe.
    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(env_id, is_render, idx, child_conn, sticky_action=sticky_action, p=action_prob,
                                life_done=life_done, settings=env_settings)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    os.chdir('rnd_continues')

    states = np.zeros([num_worker, 4, 84, 84])

    # Episode statistics are tracked for a single worker (sample_env_idx).
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    logger = Logger(None, use_console=True, use_wandb=True, log_interval=1)

    # Smoke-test the evaluation path before spending time on training.
    print('Test evaluater:')
    evaluate_and_log(
        eval_env=eval_env,
        action_get_method=eval_action,
        logger=logger,
        log_animation=False,
        exp_class='RND',
        exp_name=run_name,
        debug=True,
    )
    print('end evaluater test.')

    # Warm up the observation normalizer with uniformly random actions.
    print('Start to initailize observation normalization parameter.....')

    next_obs = []
    for _ in range(num_step * pre_obs_norm_step):
        actions = np.random.uniform(-1, 1, size=(num_worker, action_size))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            # Only slice 3 of the returned state is used for normalization
            # (presumably the newest of the stacked frames -- TODO confirm).
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))

        # Flush to the running statistics once a rollout's worth is collected.
        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    print('End to initalize...')

    while True:
        total_state = []
        total_reward = []
        total_done = []
        total_action = []
        total_int_reward = []
        total_next_obs = []
        total_ext_values = []
        total_int_values = []
        total_policy_log_prob = []

        # Step 1. n-step rollout
        for _ in range(num_step):
            global_step += num_worker
            actions, value_ext, value_int, policy_log_prob = agent.get_action(np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action.cpu().numpy())

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs.append(s[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # Intrinsic reward is computed on the standardized, clipped
            # next observation.
            intrinsic_reward = agent.compute_intrinsic_reward(
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions.cpu().numpy())
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)

            total_policy_log_prob.extend(policy_log_prob.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]

            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                logger.log_it({
                    'reward_per_episode': sample_rall,
                    'intrinsic_reward': sample_i_rall,
                    'episode_steps': sample_step,
                    'global_step_cnt': global_step,
                    'updates_cnt': global_update,
                })
                logger.publish_logs(step=global_step)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # Value estimates for the state after the last step, used to
        # bootstrap the targets.
        _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        # Reorder from (step, worker, ...) to (worker, step, ...) and flatten.
        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)

        # NOTE(review): total_state / total_next_obs are flattened in
        # worker-major order (see the transposes), while total_action here and
        # total_policy_log_prob (extended step by step above) stay in
        # step-major order. Unless train_model compensates, their rows only
        # line up when num_worker == 1 or num_step == 1 -- confirm against
        # train_model before changing. A previous attempt is kept for context:
        # total_action = np.stack(total_action).transpose().reshape([-1, action_size])
        total_action = np.array(total_action).reshape((-1, action_size))

        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()

        # Step 2. calculate intrinsic reward
        # The running variance is fed with the moments of the filtered
        # per-step intrinsic rewards.
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in
                                         total_int_reward.T])
        mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        # -------------------------------------------------------------------------------------------

        # Step 3. make target and advantage
        # extrinsic reward calculate
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              gamma,
                                              num_step,
                                              num_worker)

        # intrinsic reward calculate
        # Non-episodic: the done flags are replaced with zeros.
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              int_gamma,
                                              num_step,
                                              num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        global_update += 1
        # Step 5. Training!
        agent.train_model(np.float32(total_state) / 255., ext_target, int_target, total_action,
                          total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                          total_policy_log_prob)

        # Model persistence is currently disabled:
        # if global_step % (num_worker * num_step * 100) == 0:
        #     print('Now Global Step :{}'.format(global_step))
        #     torch.save(agent.model.state_dict(), model_path)
        #     torch.save(agent.rnd.predictor.state_dict(), predictor_path)
        #     torch.save(agent.rnd.target.state_dict(), target_path)

        if global_update % 100 == 0:
            evaluate_and_log(
                eval_env=eval_env,
                action_get_method=eval_action,
                logger=logger,
                log_animation=True,
                exp_class='RND',
                exp_name=run_name,
            )
            logger.publish_logs(step=global_step)
Esempio n. 10
0
class VecEnvNorm(BaseVecEnv):
    """Vectorized-environment wrapper that normalizes observations and rewards.

    Observations are standardized with a running mean/variance and clipped to
    ``[-clipob, clipob]``. Rewards are divided by the running standard
    deviation of the per-environment discounted return and clipped to
    ``[-cliprew, cliprew]``.

    Args:
        venv: the wrapped vectorized environment.
        ob: when true, observations are normalized.
        ret: when true, rewards are normalized.
        clipob: clipping bound for normalized observations.
        cliprew: clipping bound for normalized rewards.
        gamma: discount used for the running return.
        epsilon: added to the variance before taking the square root.
    """

    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        # venv must be set first: the num_envs property reads it.
        self.venv = venv
        self._ob_space = venv.observation_space
        self._ac_space = venv.action_space
        self.ob_rms = RunningMeanStd(
            shape=self._ob_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        # Discounted return accumulated per environment.
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step(self, vac):
        """Step the wrapped environments and normalize the results.

        Args:
            vac: the actions, passed through to ``venv.step``.

        Returns:
            ``(obs, rews, news, infos)`` with ``obs`` and ``rews`` normalized
            when the corresponding option is enabled.
        """
        obs, rews, news, infos = self.venv.step(vac)
        self.ret = self.ret * self.gamma + rews
        # normalize observations
        obs = self._norm_ob(obs)
        # normalize rewards
        if self.ret_rms is not None:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        # Restart the accumulator for environments whose episode just ended,
        # so one episode's return does not leak into the next.
        self.ret[np.asarray(news, dtype=bool)] = 0.
        return obs, rews, news, infos

    def _norm_ob(self, obs):
        """Update the observation statistics with ``obs`` and standardize it.

        Returns ``obs`` unchanged when observation normalization is disabled.
        """
        if self.ob_rms is None:
            return obs
        self.ob_rms.update(obs)
        return np.clip((obs - self.ob_rms.mean) /
                       np.sqrt(self.ob_rms.var + self.epsilon),
                       -self.clipob, self.clipob)

    def reset(self):
        """Reset the wrapped environments and return normalized observations."""
        # Every episode starts over, so the running returns do as well.
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._norm_ob(obs)

    def set_random_seed(self, seeds):
        """Seed each wrapped environment with the matching entry of ``seeds``."""
        for env, seed in zip(self.venv.envs, seeds):
            env.seed(int(seed))

    @property
    def action_space(self):
        return self._ac_space

    @property
    def observation_space(self):
        return self._ob_space

    def close(self):
        self.venv.close()

    def render(self):
        self.venv.render()

    @property
    def num_envs(self):
        return self.venv.num_envs
class RolloutStorage(object):
    """Fixed-length rollout buffer for several parallel environments.

    All tensors are laid out as ``(time, process, ...)``. ``obs``,
    ``recurrent_hidden_states``, ``value_preds``, ``returns`` and ``masks``
    hold ``num_steps + 1`` time slots; ``rewards``, ``actions`` and
    ``action_log_probs`` hold ``num_steps``.

    Args:
        num_steps: number of transitions stored per process.
        num_processes: number of parallel environments.
        obs_shape: shape of a single observation.
        action_space: action space; a class named ``Discrete`` gets one long
            action entry per step, anything else ``action_space.shape[0]``
            float entries.
        recurrent_hidden_state_size: size of the recurrent hidden state.
        norm_rew: when true, ``compute_returns`` rescales the stored rewards
            by a running estimate of the return standard deviation.
    """

    def __init__(self,
                 num_steps,
                 num_processes,
                 obs_shape,
                 action_space,
                 recurrent_hidden_state_size,
                 norm_rew=False):
        self.obs = torch.zeros(num_steps + 1, num_processes, *obs_shape)
        self.recurrent_hidden_states = torch.zeros(
            num_steps + 1, num_processes, recurrent_hidden_state_size)
        self.rewards = torch.zeros(num_steps, num_processes, 1)
        self.value_preds = torch.zeros(num_steps + 1, num_processes, 1)
        self.returns = torch.zeros(num_steps + 1, num_processes, 1)
        self.action_log_probs = torch.zeros(num_steps, num_processes, 1)
        self.norm_rew = norm_rew

        if self.norm_rew:
            self.ret_running_mean_std = RunningMeanStd()

        is_discrete = action_space.__class__.__name__ == 'Discrete'
        if is_discrete:
            action_shape = 1
            self.n_actions = action_space.n
        else:
            action_shape = action_space.shape[0]
            self.n_actions = None
        self.actions = torch.zeros(num_steps, num_processes, action_shape)
        if is_discrete:
            self.actions = self.actions.long()
        self.masks = torch.ones(num_steps + 1, num_processes, 1)

        self.num_steps = num_steps
        # Index of the next time slot written by insert().
        self.step = 0

    def to(self, device):
        """Move every stored tensor to ``device``."""
        for attr in ('obs', 'recurrent_hidden_states', 'rewards',
                     'value_preds', 'returns', 'action_log_probs', 'actions',
                     'masks'):
            setattr(self, attr, getattr(self, attr).to(device))

    def insert(self, obs, recurrent_hidden_states, actions, action_log_probs,
               value_preds, rewards, masks):
        """Store one transition for every process and advance the write index.

        ``obs``, ``recurrent_hidden_states`` and ``masks`` are written to slot
        ``step + 1``; the remaining values are written to slot ``step``.
        """
        self.obs[self.step + 1].copy_(obs)
        self.recurrent_hidden_states[self.step +
                                     1].copy_(recurrent_hidden_states)
        self.actions[self.step].copy_(actions)
        self.action_log_probs[self.step].copy_(action_log_probs)
        self.value_preds[self.step].copy_(value_preds)
        self.rewards[self.step].copy_(rewards)
        self.masks[self.step + 1].copy_(masks)

        # Wrap around so the buffer can be reused for the next rollout.
        self.step = (self.step + 1) % self.num_steps

    def after_update(self):
        """Carry the last time slot over to slot 0 for the next rollout."""
        self.obs[0].copy_(self.obs[-1])
        self.recurrent_hidden_states[0].copy_(self.recurrent_hidden_states[-1])
        self.masks[0].copy_(self.masks[-1])

    def compute_returns(self, next_value, use_gae, gamma, tau):
        """Fill ``self.returns`` from the stored rewards and value predictions.

        Args:
            next_value: value estimate for the state after the last step.
            use_gae: use generalized advantage estimation when true, plain
                discounted returns otherwise.
            gamma: discount factor.
            tau: GAE decay parameter (only used when ``use_gae`` is true).

        Note:
            With ``norm_rew`` enabled the stored rewards are rescaled in
            place on every call.
        """
        if self.norm_rew:
            # NOTE: Not adding the estimated value after last time step here
            r_gamma_sum = torch.zeros(self.returns.size()).to(
                self.returns.device)
            for step in reversed(range(self.rewards.size(0))):
                r_gamma_sum[step] = r_gamma_sum[step + 1] * \
                    gamma * self.masks[step + 1] + self.rewards[step]
            r_gamma_sum_flat = r_gamma_sum.view(-1)
            ret_mean = torch.mean(r_gamma_sum_flat).detach()
            ret_std = torch.std(r_gamma_sum_flat).detach()
            ret_count = r_gamma_sum_flat.shape[0]
            self.ret_running_mean_std.update_from_moments(
                ret_mean, ret_std**2, ret_count)
            self.rewards /= torch.sqrt(self.ret_running_mean_std.var)

        if use_gae:
            self.value_preds[-1] = next_value
            gae = 0
            for step in reversed(range(self.rewards.size(0))):
                # masks[step + 1] multiplies the bootstrap term and the
                # accumulated advantage, so a zero mask cuts both off.
                delta = self.rewards[step] + gamma * self.value_preds[
                    step + 1] * self.masks[step + 1] - self.value_preds[step]
                gae = delta + gamma * tau * self.masks[step + 1] * gae
                self.returns[step] = gae + self.value_preds[step]
        else:
            self.returns[-1] = next_value
            for step in reversed(range(self.rewards.size(0))):
                self.returns[step] = self.returns[step + 1] * \
                    gamma * self.masks[step + 1] + self.rewards[step]

    def feed_forward_generator(self, advantages, num_mini_batch):
        """Yield shuffled mini-batches of individual transitions.

        Args:
            advantages: tensor viewable as ``(num_steps * num_processes, 1)``.
            num_mini_batch: number of mini-batches the buffer is split into.

        Yields:
            ``(obs, recurrent_hidden_states, actions, returns, masks,
            old_action_log_probs, adv_targ, None, None)``.
        """
        num_steps, num_processes = self.rewards.size()[0:2]
        batch_size = num_processes * num_steps
        assert batch_size >= num_mini_batch, (
            "PPO requires the number of processes ({}) "
            "* number of steps ({}) = {} "
            "to be greater than or equal to the number of PPO mini batches ({})."
            "".format(num_processes, num_steps, num_processes * num_steps,
                      num_mini_batch))
        mini_batch_size = batch_size // num_mini_batch
        sampler = BatchSampler(SubsetRandomSampler(range(batch_size)),
                               mini_batch_size,
                               drop_last=False)

        # Flattened (time * process, ...) views are the same for every batch.
        flat_obs = self.obs[:-1].view(-1, *self.obs.size()[2:])
        flat_hidden = self.recurrent_hidden_states[:-1].view(
            -1, self.recurrent_hidden_states.size(-1))
        flat_actions = self.actions.view(-1, self.actions.size(-1))
        flat_returns = self.returns[:-1].view(-1, 1)
        flat_masks = self.masks[:-1].view(-1, 1)
        flat_log_probs = self.action_log_probs.view(-1, 1)
        flat_adv = advantages.view(-1, 1)

        for indices in sampler:
            yield flat_obs[indices], flat_hidden[indices], \
                flat_actions[indices], flat_returns[indices], \
                flat_masks[indices], flat_log_probs[indices], \
                flat_adv[indices], None, None

    def recurrent_generator(self, advantages, num_mini_batch):
        """Yield mini-batches made of whole per-process trajectories.

        The processes are shuffled and split into groups of
        ``num_processes // num_mini_batch``. When ``num_processes`` is not a
        multiple of ``num_mini_batch`` the final group is smaller, and the
        yielded ``N`` reflects its actual size.

        Args:
            advantages: tensor indexed as ``advantages[:, process]``.
            num_mini_batch: number of mini-batches to aim for.

        Yields:
            ``(obs, recurrent_hidden_states, actions, returns, masks,
            old_action_log_probs, adv_targ, T, N)`` where the time-major
            tensors have been flattened with ``_flatten_helper``.
        """
        num_processes = self.rewards.size(1)
        assert num_processes >= num_mini_batch, (
            "PPO requires the number of processes ({}) "
            "to be greater than or equal to the number of "
            "PPO mini batches ({}).".format(num_processes, num_mini_batch))
        num_envs_per_batch = num_processes // num_mini_batch
        perm = torch.randperm(num_processes)
        T = self.num_steps
        for start_ind in range(0, num_processes, num_envs_per_batch):
            # Slicing never runs past the end of the permutation, so a
            # trailing partial group is simply shorter.
            env_inds = perm[start_ind:start_ind + num_envs_per_batch]
            N = env_inds.size(0)

            def gather(tensor):
                """Stack the selected processes into a (T, N, ...) tensor."""
                return torch.stack([tensor[:, ind] for ind in env_inds], 1)

            # These are all tensors of size (T, N, -1)
            obs_batch = gather(self.obs[:-1])
            actions_batch = gather(self.actions)
            return_batch = gather(self.returns[:-1])
            masks_batch = gather(self.masks[:-1])
            old_action_log_probs_batch = gather(self.action_log_probs)
            adv_targ = gather(advantages)

            # Only the hidden state at the first time slot is needed:
            # a (N, -1) tensor.
            recurrent_hidden_states_batch = torch.stack(
                [self.recurrent_hidden_states[0:1, ind] for ind in env_inds],
                1).view(N, -1)

            # Flatten the (T, N, ...) tensors to (T * N, ...)
            obs_batch = _flatten_helper(T, N, obs_batch)
            actions_batch = _flatten_helper(T, N, actions_batch)
            return_batch = _flatten_helper(T, N, return_batch)
            masks_batch = _flatten_helper(T, N, masks_batch)
            old_action_log_probs_batch = _flatten_helper(
                T, N, old_action_log_probs_batch)
            adv_targ = _flatten_helper(T, N, adv_targ)

            yield obs_batch, recurrent_hidden_states_batch, actions_batch, \
                return_batch, masks_batch, old_action_log_probs_batch, adv_targ, T, N
Esempio n. 12
0
    def __init__(self,
                 gamma,
                 tau,
                 hidden_size,
                 num_inputs,
                 action_space,
                 train_mode,
                 alpha,
                 replay_size,
                 normalize_obs=True,
                 normalize_returns=False,
                 critic_l2_reg=1e-2):
        """Build the actor/adversary networks and, in train mode, everything
        needed to optimize them.

        Args:
            gamma: stored as ``self.gamma``.
            tau: stored as ``self.tau``.
            hidden_size: hidden size passed to every Actor/Critic.
            num_inputs: input size passed to every Actor/Critic; also the
                shape of the observation statistics.
            action_space: action space passed to every Actor/Critic.
            train_mode: when true, target/perturbed networks, the critic and
                the optimizers are created as well.
            alpha: stored as ``self.alpha``.
            replay_size: capacity handed to ``ReplayMemory``.
            normalize_obs: keep running observation statistics when true.
            normalize_returns: keep running return statistics when true.
            critic_l2_reg: weight decay of the critic optimizer.
        """
        if torch.cuda.is_available():
            # cuDNN is switched off globally whenever CUDA is used.
            torch.backends.cudnn.enabled = False
            self.device = torch.device('cuda')
            self.Tensor = torch.cuda.FloatTensor
        else:
            self.device = torch.device('cpu')
            self.Tensor = torch.FloatTensor

        self.alpha = alpha
        self.train_mode = train_mode
        self.num_inputs = num_inputs
        self.action_space = action_space
        self.critic_l2_reg = critic_l2_reg

        def build(network_cls):
            """Create a network of the given class on the selected device."""
            net = network_cls(hidden_size, self.num_inputs, self.action_space)
            return net.to(self.device)

        # Networks are created in a fixed order so that parameter
        # initialization draws from the RNG in the same sequence every time.
        self.actor = build(Actor)
        self.adversary = build(Actor)
        if self.train_mode:
            self.actor_target = build(Actor)
            self.actor_perturbed = build(Actor)
            self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

            self.critic = build(Critic)
            self.critic_target = build(Critic)
            self.critic_optim = Adam(self.critic.parameters(),
                                     lr=1e-3,
                                     weight_decay=critic_l2_reg)

            self.adversary_target = build(Actor)
            self.adversary_perturbed = build(Actor)
            self.adversary_optim = Adam(self.adversary.parameters(), lr=1e-4)

            # Start every target network from its online network's weights.
            for target_net, online_net in (
                    (self.adversary_target, self.adversary),
                    (self.actor_target, self.actor),
                    (self.critic_target, self.critic)):
                hard_update(target_net, online_net)

        self.gamma = gamma
        self.tau = tau
        self.normalize_observations = normalize_obs
        self.normalize_returns = normalize_returns

        self.obs_rms = (RunningMeanStd(shape=num_inputs)
                        if self.normalize_observations else None)

        if not self.normalize_returns:
            self.ret_rms = None
        else:
            self.ret_rms = RunningMeanStd(shape=1)
            self.ret = 0
            self.cliprew = 10.0

        self.memory = ReplayMemory(replay_size)
Esempio n. 13
0
class DDPG:
    def __init__(self,
                 gamma,
                 tau,
                 hidden_size,
                 num_inputs,
                 action_space,
                 train_mode,
                 alpha,
                 replay_size,
                 normalize_obs=True,
                 normalize_returns=False,
                 critic_l2_reg=1e-2):
        """Build the actor/adversary networks and, in train mode, everything
        needed to optimise them.

        Args:
            gamma: Discount factor stored on the agent.
            tau: Coefficient stored for the soft target updates.
            hidden_size: Forwarded to the ``Actor``/``Critic`` constructors.
            num_inputs: Forwarded to the networks and used as the shape of
                the observation running statistics.
            action_space: Forwarded to the network constructors.
            train_mode: When true, target/perturbed networks, the critic and
                all optimisers are created as well.
            alpha: Actor/adversary mixing weight stored on the agent.
            replay_size: Forwarded to ``ReplayMemory``.
            normalize_obs: Keep running statistics of observations.
            normalize_returns: Keep running statistics of discounted returns.
            critic_l2_reg: ``weight_decay`` of the critic optimiser.
        """
        cuda_available = torch.cuda.is_available()
        self.device = torch.device('cuda' if cuda_available else 'cpu')
        if cuda_available:
            # Side effect: cuDNN is disabled process-wide when CUDA is used.
            torch.backends.cudnn.enabled = False
            self.Tensor = torch.cuda.FloatTensor
        else:
            self.Tensor = torch.FloatTensor

        self.alpha = alpha
        self.train_mode = train_mode

        self.num_inputs = num_inputs
        self.action_space = action_space
        self.critic_l2_reg = critic_l2_reg

        def build(network_cls):
            # Every network shares the same constructor arguments and device.
            return network_cls(hidden_size, self.num_inputs,
                               self.action_space).to(self.device)

        # The construction order below is kept fixed on purpose: it decides
        # how the random initial weights are drawn.
        self.actor = build(Actor)
        self.adversary = build(Actor)
        if self.train_mode:
            self.actor_target = build(Actor)
            self.actor_perturbed = build(Actor)
            self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

            self.critic = build(Critic)
            self.critic_target = build(Critic)
            self.critic_optim = Adam(self.critic.parameters(),
                                     lr=1e-3,
                                     weight_decay=critic_l2_reg)

            self.adversary_target = build(Actor)
            self.adversary_perturbed = build(Actor)
            self.adversary_optim = Adam(self.adversary.parameters(), lr=1e-4)

            # Make sure every target starts with the same weights as its
            # online network.
            for target, online in ((self.adversary_target, self.adversary),
                                   (self.actor_target, self.actor),
                                   (self.critic_target, self.critic)):
                hard_update(target, online)

        self.gamma = gamma
        self.tau = tau
        self.normalize_observations = normalize_obs
        self.normalize_returns = normalize_returns

        self.obs_rms = (RunningMeanStd(shape=num_inputs)
                        if self.normalize_observations else None)

        if self.normalize_returns:
            self.ret_rms = RunningMeanStd(shape=1)
            self.ret = 0
            self.cliprew = 10.0
        else:
            self.ret_rms = None

        self.memory = ReplayMemory(replay_size)

    def eval(self):
        self.actor.eval()
        self.adversary.eval()
        if self.train_mode:
            self.critic.eval()

    def train(self):
        self.actor.train()
        self.adversary.train()
        if self.train_mode:
            self.critic.train()

    def select_action(self,
                      state,
                      action_noise=None,
                      param_noise=None,
                      mdp_type='mdp'):
        """Compute the action to execute for ``state``.

        Args:
            state: Observation tensor; it is moved to ``self.device`` and
                passed through ``normalize`` with ``self.obs_rms``.
            action_noise: Optional callable; its output is added to the
                actor's action before clamping.
            param_noise: When not ``None`` the perturbed copies of the
                networks are queried instead of the regular ones.
            mdp_type: ``'mdp'`` uses the actor only. ``'nr_mdp'`` returns
                ``(1 - alpha) * actor + alpha * adversary``. Any other value
                is treated as ``'pr_mdp'``: the actor acts with probability
                ``1 - alpha``, otherwise the adversary acts.

        Returns:
            The action tensor; each player's contribution is clamped to
            ``[-1, 1]``.
        """
        state = normalize(
            Variable(state).to(self.device), self.obs_rms, self.device)
        use_perturbed = param_noise is not None

        def actor_action():
            # Actor output with optional additive exploration noise.
            network = self.actor_perturbed if use_perturbed else self.actor
            out = network(state).data
            if action_noise is not None:
                out += self.Tensor(action_noise()).to(self.device)
            return out.clamp(-1, 1)

        def adversary_action():
            # The adversary never receives additive action noise.
            network = (self.adversary_perturbed
                       if use_perturbed else self.adversary)
            return network(state).data.clamp(-1, 1)

        if mdp_type == 'mdp':
            return actor_action()

        if mdp_type == 'nr_mdp':
            mu = actor_action() * (1 - self.alpha)
            mu += adversary_action() * self.alpha
            return mu

        # mdp_type == 'pr_mdp'
        if np.random.rand() < (1 - self.alpha):
            return actor_action()
        return adversary_action()

    def update_robust(self, state_batch, action_batch, reward_batch,
                      mask_batch, next_state_batch, adversary_update, mdp_type,
                      robust_update_type):
        # TRAIN CRITIC
        if robust_update_type == 'full':
            if mdp_type == 'nr_mdp':
                next_action_batch = (1 - self.alpha) * self.actor_target(next_state_batch) \
                                    + self.alpha * self.adversary_target(next_state_batch)

                next_state_action_values = self.critic_target(
                    next_state_batch, next_action_batch)
            else:  # mdp_type == 'pr_mdp':
                next_action_actor_batch = self.actor_target(next_state_batch)
                next_action_adversary_batch = self.adversary_target(
                    next_state_batch)

                next_state_action_values = self.critic_target(next_state_batch, next_action_actor_batch) * (
                            1 - self.alpha) \
                                           + self.critic_target(next_state_batch,
                                                                       next_action_adversary_batch) * self.alpha

            expected_state_action_batch = reward_batch + self.gamma * mask_batch * next_state_action_values

            self.critic_optim.zero_grad()

            state_action_batch = self.critic(state_batch, action_batch)

            value_loss = F.mse_loss(state_action_batch,
                                    expected_state_action_batch)
            value_loss.backward()
            self.critic_optim.step()
            value_loss = value_loss.item()
        else:
            value_loss = 0

        if adversary_update:
            # TRAIN ADVERSARY
            self.adversary_optim.zero_grad()

            if mdp_type == 'nr_mdp':
                with torch.no_grad():
                    real_action = self.actor_target(next_state_batch)
                action = (
                    1 - self.alpha
                ) * real_action + self.alpha * self.adversary(next_state_batch)
                adversary_loss = self.critic(state_batch, action)
            else:  # mdp_type == 'pr_mdp'
                action = self.adversary(next_state_batch)
                adversary_loss = self.critic(state_batch, action) * self.alpha

            adversary_loss = adversary_loss.mean()
            adversary_loss.backward()
            self.adversary_optim.step()
            adversary_loss = adversary_loss.item()
            policy_loss = 0
        else:
            if robust_update_type == 'full':
                # TRAIN ACTOR
                self.actor_optim.zero_grad()

                if mdp_type == 'nr_mdp':
                    with torch.no_grad():
                        adversary_action = self.adversary_target(
                            next_state_batch)
                    action = (1 - self.alpha) * self.actor(
                        next_state_batch) + self.alpha * adversary_action
                    policy_loss = -self.critic(state_batch, action)
                else:  # mdp_type == 'pr_mdp':
                    action = self.actor(next_state_batch)
                    policy_loss = -self.critic(state_batch, action) * (
                        1 - self.alpha)

                policy_loss = policy_loss.mean()
                policy_loss.backward()
                self.actor_optim.step()

                policy_loss = policy_loss.item()
                adversary_loss = 0
            else:
                policy_loss = 0
                adversary_loss = 0

        return value_loss, policy_loss, adversary_loss

    def update_non_robust(self, state_batch, action_batch, reward_batch,
                          mask_batch, next_state_batch):
        # TRAIN CRITIC

        next_action_batch = self.actor_target(next_state_batch)
        next_state_action_values = self.critic_target(next_state_batch,
                                                      next_action_batch)

        expected_state_action_batch = reward_batch + self.gamma * mask_batch * next_state_action_values

        self.critic_optim.zero_grad()

        state_action_batch = self.critic(state_batch, action_batch)

        value_loss = F.mse_loss(state_action_batch,
                                expected_state_action_batch)
        value_loss.backward()
        self.critic_optim.step()

        # TRAIN ACTOR

        self.actor_optim.zero_grad()

        action = self.actor(next_state_batch)

        policy_loss = -self.critic(state_batch, action)

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        policy_loss = policy_loss.item()
        adversary_loss = 0

        return value_loss.item(), policy_loss, adversary_loss

    def store_transition(self, state, action, mask, next_state, reward):
        B = state.shape[0]
        for b in range(B):
            self.memory.push(state[b], action[b], mask[b], next_state[b],
                             reward[b])
            if self.normalize_observations:
                self.obs_rms.update(state[b].cpu().numpy())
            if self.normalize_returns:
                self.ret = self.ret * self.gamma + reward[b]
                self.ret_rms.update(np.array([self.ret]))
                if mask[b] == 0:  # if terminal is True
                    self.ret = 0

    def update_parameters(self,
                          batch_size,
                          mdp_type='mdp',
                          adversary_update=False,
                          exploration_method='mdp'):
        """Sample a batch from the replay memory and run one training step.

        The robust update runs whenever ``mdp_type`` or
        ``exploration_method`` differs from ``'mdp'``; the non-robust update
        runs whenever ``mdp_type`` is ``'mdp'``. Both run when only the
        exploration method is robust. Target networks are soft-updated last.

        Returns:
            ``(value_loss, policy_loss, adversary_loss)``, each summed over
            the updates that were executed.
        """
        transitions = self.memory.sample(batch_size)
        batch = Transition(*zip(*transitions))

        robust_update_type = None
        if mdp_type != 'mdp':
            robust_update_type = 'full'
        elif exploration_method != 'mdp':
            robust_update_type = 'adversary'

        def stacked(items):
            # Stack the per-transition tensors and move them to the device.
            return Variable(torch.stack(items)).to(self.device)

        state_batch = normalize(stacked(batch.state), self.obs_rms,
                                self.device)
        action_batch = stacked(batch.action)
        reward_batch = normalize(
            stacked(batch.reward).unsqueeze(1), self.ret_rms, self.device)
        mask_batch = stacked(batch.mask).unsqueeze(1)
        next_state_batch = normalize(stacked(batch.next_state), self.obs_rms,
                                     self.device)

        if self.normalize_returns:
            reward_batch = torch.clamp(reward_batch, -self.cliprew,
                                       self.cliprew)

        # [value_loss, policy_loss, adversary_loss]
        totals = [0, 0, 0]

        def accumulate(step_losses):
            for slot, loss in enumerate(step_losses):
                totals[slot] += loss

        if robust_update_type is not None:
            accumulate(
                self.update_robust(state_batch, action_batch, reward_batch,
                                   mask_batch, next_state_batch,
                                   adversary_update, mdp_type,
                                   robust_update_type))
        if robust_update_type != 'full':
            accumulate(
                self.update_non_robust(state_batch, action_batch,
                                       reward_batch, mask_batch,
                                       next_state_batch))

        self.soft_update()

        return tuple(totals)

    def soft_update(self):
        """Soft-update the actor, adversary and critic target networks from
        their online counterparts with coefficient ``self.tau``."""
        pairs = (
            (self.actor_target, self.actor),
            (self.adversary_target, self.adversary),
            (self.critic_target, self.critic),
        )
        for target, online in pairs:
            # Resolves to the module-level helper, not to this method.
            soft_update(target, online, self.tau)

    def perturb_actor_parameters(self, param_noise):
        """Apply parameter noise to the actor and adversary, for exploration.

        Each perturbed network is first reset to its online counterpart, then
        Gaussian noise with standard deviation ``param_noise.current_stddev``
        is added in place to every ``state_dict`` entry whose name does not
        contain ``'ln'``.

        Args:
            param_noise: Object exposing ``current_stddev``.
        """
        pairs = (
            (self.actor_perturbed, self.actor),
            (self.adversary_perturbed, self.adversary),
        )
        for perturbed, online in pairs:
            hard_update(perturbed, online)
            for name, param in perturbed.state_dict().items():
                if 'ln' in name:
                    # The original `pass` here was a no-op, so entries named
                    # '*ln*' (presumably the layer-norm parameters) were
                    # perturbed anyway; skipping them is what the check
                    # expresses.
                    continue
                param += torch.randn(param.shape).to(
                    self.device) * param_noise.current_stddev
Esempio n. 14
0
class Policy:
    def __init__(self,
                 gamma,
                 tau,
                 num_inputs,
                 action_space,
                 replay_size,
                 normalize_obs=True,
                 normalize_returns=False):
        """Set up the device, normalisation statistics and replay memory.

        ``self.actor`` and ``self.actor_perturbed`` are initialised to
        ``None`` here; the networks themselves are not created by this class.

        Args:
            gamma: Discount factor stored on the agent.
            tau: Soft-update coefficient stored on the agent.
            num_inputs: Shape of the observation running statistics.
            action_space: Stored on the agent.
            replay_size: Forwarded to ``ReplayMemory``.
            normalize_obs: Keep running statistics of observations.
            normalize_returns: Keep running statistics of discounted returns.
        """
        cuda_available = torch.cuda.is_available()
        self.device = torch.device('cuda' if cuda_available else 'cpu')
        if cuda_available:
            # Side effect: cuDNN is disabled process-wide when CUDA is used.
            torch.backends.cudnn.enabled = False
            self.Tensor = torch.cuda.FloatTensor
        else:
            self.Tensor = torch.FloatTensor

        self.num_inputs = num_inputs
        self.action_space = action_space

        self.gamma = gamma
        self.tau = tau
        self.normalize_observations = normalize_obs
        self.normalize_returns = normalize_returns

        self.obs_rms = (RunningMeanStd(shape=num_inputs)
                        if self.normalize_observations else None)

        if self.normalize_returns:
            self.ret_rms = RunningMeanStd(shape=1)
            self.ret = 0
            self.cliprew = 10.0
        else:
            self.ret_rms = None

        self.memory = ReplayMemory(replay_size)
        self.actor = None
        self.actor_perturbed = None

    def eval(self):
        raise NotImplementedError

    def train(self):
        raise NotImplementedError

    def select_action(self, state, action_noise=None, param_noise=None):
        """Compute the action to execute for ``state``.

        Args:
            state: Observation tensor; it is moved to ``self.device`` and
                passed through ``normalize`` with ``self.obs_rms``.
            action_noise: Optional callable; its output is added to the
                action before clamping.
            param_noise: When not ``None`` the perturbed actor is used.

        Returns:
            The first element of ``self.policy(actor, state)``, with optional
            noise added, clamped to ``[-1, 1]``.
        """
        state = normalize(
            Variable(state).to(self.device), self.obs_rms, self.device)

        chosen_actor = (self.actor_perturbed
                        if param_noise is not None else self.actor)
        action = self.policy(chosen_actor, state)[0].data

        if action_noise is not None:
            action += self.Tensor(action_noise()).to(self.device)

        return action.clamp(-1, 1)

    def policy(self, actor, state):
        raise NotImplementedError

    def store_transition(self, state, action, mask, next_state, reward):
        B = state.shape[0]
        for b in range(B):
            self.memory.push(state[b], action[b], mask[b], next_state[b],
                             reward[b])
            if self.normalize_observations:
                self.obs_rms.update(state[b].cpu().numpy())
            if self.normalize_returns:
                self.ret = self.ret * self.gamma + reward[b]
                self.ret_rms.update(np.array([self.ret]))
                if mask[b] == 0:  # if terminal is True
                    self.ret = 0

    def update_critic(self, state_batch, action_batch, reward_batch,
                      mask_batch, next_state_batch):
        raise NotImplementedError

    def update_actor(self, state_batch):
        raise NotImplementedError

    def update_parameters(self, batch_size):
        """Sample a batch from the replay memory and run one training step.

        The batch is stacked, moved to ``self.device`` and normalised, then
        passed to ``update_critic`` and ``update_actor``; ``soft_update`` is
        called last.

        Returns:
            ``(value_loss, policy_loss)`` as returned by the two update hooks.
        """
        transitions = self.memory.sample(batch_size)
        batch = Transition(*zip(*transitions))

        def stacked(items):
            # Stack the per-transition tensors and move them to the device.
            return Variable(torch.stack(items)).to(self.device)

        state_batch = normalize(stacked(batch.state), self.obs_rms,
                                self.device)
        action_batch = stacked(batch.action)
        reward_batch = normalize(
            stacked(batch.reward).unsqueeze(1), self.ret_rms, self.device)
        mask_batch = stacked(batch.mask).unsqueeze(1)
        next_state_batch = normalize(stacked(batch.next_state), self.obs_rms,
                                     self.device)

        if self.normalize_returns:
            reward_batch = torch.clamp(reward_batch, -self.cliprew,
                                       self.cliprew)

        value_loss = self.update_critic(state_batch, action_batch,
                                        reward_batch, mask_batch,
                                        next_state_batch)
        policy_loss = self.update_actor(state_batch)

        self.soft_update()

        return value_loss, policy_loss

    def soft_update(self):
        raise NotImplementedError

    def perturb_actor_parameters(self, param_noise):
        """Apply parameter noise to the actor model, for exploration.

        The perturbed actor is first reset to the current actor, then
        Gaussian noise with standard deviation ``param_noise.current_stddev``
        is added in place to every ``state_dict`` entry whose name does not
        contain ``'ln'``.

        Args:
            param_noise: Object exposing ``current_stddev``.
        """
        hard_update(self.actor_perturbed, self.actor)
        for name, param in self.actor_perturbed.state_dict().items():
            if 'ln' in name:
                # The original `pass` here was a no-op, so entries named
                # '*ln*' (presumably the layer-norm parameters) were perturbed
                # anyway; skipping them is what the check expresses.
                continue
            param += torch.randn(param.shape).to(
                self.device) * param_noise.current_stddev

    def _tile(self, a, dim, n_tile):
        init_dim = a.size(dim)
        repeat_idx = [1] * a.dim()
        repeat_idx[dim] = n_tile
        a = a.repeat(*(repeat_idx))
        order_index = torch.LongTensor(
            np.concatenate([
                init_dim * np.arange(n_tile) + i for i in range(init_dim)
            ])).to(self.device)
        return torch.index_select(a, dim, order_index)
Esempio n. 15
0
    def __init__(self, beta, epsilon, learning_rate, gamma, tau, hidden_size_dim0, hidden_size_dim1, num_inputs, action_space, train_mode, alpha, replay_size,
                 optimizer, two_player, normalize_obs=True, normalize_returns=False, critic_l2_reg=1e-2):
        """Build the actor/adversary networks and, in train mode, the
        auxiliary networks, the critic and the optimisers.

        Args:
            beta, epsilon, learning_rate, gamma, tau, alpha: Hyperparameters
                stored on the agent. ``epsilon`` is also passed as ``noise``
                to the SGLD optimiser.
            hidden_size_dim0, hidden_size_dim1: Forwarded to the network
                constructors.
            num_inputs: Forwarded to the networks and used as the shape of
                the observation running statistics.
            action_space: Forwarded to the network constructors.
            train_mode: When true, target/bar/outer networks, the critic and
                all optimisers are created as well.
            replay_size: Forwarded to ``ReplayMemory``.
            optimizer: ``'SGLD'`` or ``'RMSprop'``; any other value selects
                ``ExtraAdam`` for the actor and adversary.
            two_player: Stored on the agent.
            normalize_obs: Keep running statistics of observations.
            normalize_returns: Keep running statistics of discounted returns.
            critic_l2_reg: ``weight_decay`` of the critic optimiser.
        """
        cuda_available = torch.cuda.is_available()
        self.device = torch.device('cuda' if cuda_available else 'cpu')
        if cuda_available:
            # Side effect: cuDNN is disabled process-wide when CUDA is used.
            torch.backends.cudnn.enabled = False
            self.Tensor = torch.cuda.FloatTensor
        else:
            self.Tensor = torch.FloatTensor

        self.alpha = alpha
        self.train_mode = train_mode

        self.num_inputs = num_inputs
        self.action_space = action_space
        self.critic_l2_reg = critic_l2_reg

        def build(network_cls):
            # Every network shares the same constructor arguments and device.
            return network_cls(hidden_size_dim0, hidden_size_dim1,
                               self.num_inputs,
                               self.action_space).to(self.device)

        def player_optimizer(network):
            # NOTE(review): lr is fixed at 1e-4 here; the `learning_rate`
            # argument is only stored on the instance - confirm intent.
            if optimizer == 'SGLD':
                return SGLD(network.parameters(), lr=1e-4, noise=epsilon, alpha=0.999)
            if optimizer == 'RMSprop':
                return RMSprop(network.parameters(), lr=1e-4, alpha=0.999)
            return ExtraAdam(network.parameters(), lr=1e-4)

        # The construction order below is kept fixed on purpose: it decides
        # how the random initial weights are drawn.
        self.actor = build(Actor)
        self.adversary = build(Actor)
        if self.train_mode:
            self.actor_target = build(Actor)
            self.actor_bar = build(Actor)
            self.actor_outer = build(Actor)
            self.actor_optim = player_optimizer(self.actor)

            self.critic = build(Critic)
            self.critic_target = build(Critic)
            self.critic_optim = Adam(self.critic.parameters(), lr=1e-3, weight_decay=critic_l2_reg)

            self.adversary_target = build(Actor)
            self.adversary_bar = build(Actor)
            self.adversary_outer = build(Actor)
            self.adversary_optim = player_optimizer(self.adversary)

            # Make sure every target starts with the same weights as its
            # online network.
            for target, online in ((self.adversary_target, self.adversary),
                                   (self.actor_target, self.actor),
                                   (self.critic_target, self.critic)):
                hard_update(target, online)

        self.gamma = gamma
        self.tau = tau
        self.beta = beta
        self.epsilon = epsilon
        self.learning_rate = learning_rate
        self.normalize_observations = normalize_obs
        self.normalize_returns = normalize_returns
        self.optimizer = optimizer
        self.two_player = two_player

        self.obs_rms = (RunningMeanStd(shape=num_inputs)
                        if self.normalize_observations else None)

        if self.normalize_returns:
            self.ret_rms = RunningMeanStd(shape=1)
            self.ret = 0
            self.cliprew = 10.0
        else:
            self.ret_rms = None

        self.memory = ReplayMemory(replay_size)
Esempio n. 16
0
class Runner:
    def __init__(self,
                 env: Any,
                 agent: Any,
                 save_interval: int = 1000,
                 train_episode: int = 10**9,
                 num_eval_episode: int = 3,
                 episode_len: int = 3000,
                 pre_step: int = 10000,
                 gamma: float = 0.995,
                 int_gamma: float = 0.995,
                 lam: float = 0.97,
                 device=torch.device('cpu'),
                 int_coef: float = 1,
                 ext_coef: float = 0.3,
                 eval_interval: int = 10**4,
                 seed: int = 0):
        """Store the training configuration and prepare the environments.

        The training environment is seeded with ``seed``; a deep copy seeded
        with ``2**31 - seed`` is kept for evaluation. The training
        environment is reset once to size the observation statistics.

        Args:
            env: Environment exposing ``seed``, ``reset`` and ``step``.
            agent: Agent driven by the runner.
            save_interval: Stored on the runner.
            train_episode: Number of rollouts executed by ``start``.
            num_eval_episode: Episodes per evaluation.
            episode_len: Number of steps per rollout.
            pre_step: Random steps used to measure the variance of states.
            gamma: Discount for extrinsic rewards.
            int_gamma: Discount for intrinsic rewards.
            lam: GAE lambda.
            device: Stored on the runner.
            int_coef: Weight of the intrinsic advantage.
            ext_coef: Weight of the extrinsic advantage.
            eval_interval: Evaluation period used by ``start``.
            seed: Base random seed for the environments.
        """
        # schedule
        self.save_interval = save_interval
        self.eval_interval = eval_interval

        # environments (seed the original before copying it)
        self.env = env
        self.env.seed(seed)
        self.env_test = deepcopy(env)
        self.env_test.seed(2**31 - seed)
        self.agent = agent

        # counters
        self.global_step = 0
        self.step_in_episode = 0
        self.episode_so_far = 0

        # rollout configuration
        self.episode_len = episode_len  # length of an episode
        self.num_eval_episode = num_eval_episode
        self.train_episode = train_episode
        self.pre_step = pre_step  # number of steps used to measure variance of states

        # running statistics; the observation stats carry a leading axis of 1
        self.reward_rms = RunningMeanStd()
        first_obs = self.env.reset()
        self.obs_rms = RunningMeanStd(shape=[1, *first_obs.shape])

        # discounting and reward mixing
        self.device = device
        self.lam = lam
        self.gamma = gamma
        self.int_gamma = int_gamma  # gamma for intrinsic reward
        self.int_coef = int_coef  # ratio of intrinsic and extrinsic rewards
        self.ext_coef = ext_coef

        # bookkeeping
        self.reward_in_episode = 0.0
        self.returns = {'step': [], 'return': []}

    def run_episode(self):
        total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_policy = \
            [], [], [], [], [], [], [], [], [], []
        self.step_in_episode = 0
        self.reward_in_episode = 0
        obs = self.env.reset()
        done = False

        for _ in range(self.episode_len):
            action, policy, value_ext, value_int = self.agent.get_action(obs)
            obs_next, reward, done, info = env.step(2 * action)
            self.reward_in_episode += reward
            self.global_step += 1
            self.step_in_episode += 1
            int_reward = agent.calc_intrinsic_reward(
                (obs_next - self.obs_rms.mean) /
                np.sqrt(self.obs_rms.var).clip(-5, 5))

            total_next_obs.append(obs_next)
            total_int_reward.append(int_reward)
            total_state.append(obs)
            total_reward.append(reward)
            total_done.append(done)
            total_action.append(action)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_policy.append(policy)
            obs = obs_next

        _, _, value_ext, value_int = agent.get_action(obs)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)

        total_state = np.stack(total_state)  # (num_episode, state_shape)
        total_action = np.stack(total_action)  # (num_episode)
        total_done = np.stack(total_done)  # (num_episode, )
        total_next_obs = np.stack(total_next_obs)  # (num_episode, state_shape)
        total_int_reward = np.stack(total_int_reward)

        # normalize intrinsic reward
        mean, std, count = np.mean(total_reward), np.std(total_reward), len(
            total_reward)
        self.reward_rms.update_from_moments(mean, std**2, count)
        total_int_reward /= self.reward_rms.var

        ext_target, ext_adv = self.gae(reward=total_reward,
                                       done=total_done,
                                       value=total_ext_values,
                                       gamma=self.gamma,
                                       num_step=self.episode_len)
        int_target, int_adv = self.gae(reward=total_int_reward,
                                       done=[0] * self.episode_len,
                                       value=total_int_values,
                                       gamma=self.int_gamma,
                                       num_step=self.episode_len)
        total_adv = int_adv * self.int_coef + ext_adv * self.ext_coef
        self.obs_rms.update(total_next_obs)
        agent.train_model(
            states=np.float32(total_state),
            target_ext=ext_target,
            target_int=int_target,
            actions=total_action,
            advs=total_adv,
            next_states=((total_next_obs - self.obs_rms.mean) /
                         np.sqrt(self.obs_rms.var)).clip(-5, 5),
            log_pi_old=total_policy,  # TODO: fix this
            num_step=self.episode_len)

    def evaluate(self, steps):
        """ 複数エピソード環境を動かし,平均収益を記録する. """

        returns = []
        for _ in range(self.num_eval_episode):
            state = self.env_test.reset()
            done = False
            episode_return = 0.0
            step = 0
            while (not done):
                step += 1
                action = self.agent.exploit(state)
                state, reward, done, _ = self.env_test.step(2 * action)
                episode_return += reward

            returns.append(episode_return)

        mean_return = np.mean(returns)
        self.returns['step'].append(steps)
        self.returns['return'].append(mean_return)

        print(f'Num steps: {steps:<6}   '
              f'Num episode: {self.episode_so_far}   '
              f'Return: {mean_return:<5.1f}   '
              f'Time: {self.time}')

    def plot(self):
        """Draw the mean-return curve and save it to ``figure.png``.

        The figure is closed after saving so that repeated calls during
        training do not accumulate open matplotlib figures.
        """
        fig = plt.figure(figsize=(8, 6))
        try:
            plt.plot(self.returns['step'], self.returns['return'])
            plt.xlabel('Steps', fontsize=24)
            plt.ylabel('Return', fontsize=24)
            plt.tick_params(labelsize=18)
            plt.title(f'{self.env.unwrapped.spec.id}', fontsize=24)
            plt.tight_layout()
            plt.savefig('figure.png')
        finally:
            # pyplot keeps every figure alive until it is closed explicitly.
            plt.close(fig)

    def start(self):
        self.start_time = time()
        self.prepare_normalization_coeff()
        print('Start Training')
        for episode in range(self.train_episode):
            self.episode_so_far = episode
            self.run_episode()
            if episode % self.eval_interval:
                self.evaluate(steps=self.global_step)
            if episode % (self.eval_interval * 10):
                self.plot()
        print('Finished')

    @property
    def time(self):
        return str(timedelta(seconds=int(time() - self.start_time)))

    def prepare_normalization_coeff(self):
        states = []
        for _ in range(self.pre_step):
            action = self.env.action_space.sample()
            state, reward, done, info = self.env.step(action)
            states.append(state)
        states = np.array(states)
        self.obs_rms.update(states)

    def gae(self, reward: Sequence, done: Sequence, value: Sequence,
            gamma: float, num_step: int):
        """Returns (discounted_return, advantage)"""
        adv_tmp = 0
        discounted_return = [None] * num_step
        for t in range(num_step - 1, -1, -1):
            delta = reward[t] + gamma * value[t + 1] * (1 - done[t]) - value[t]
            adv_tmp = delta + gamma * self.lam * (1 - done[t]) * adv_tmp
            discounted_return[t] = adv_tmp + value[t]
        discounted_return = np.array(discounted_return, dtype='float32')
        adv = discounted_return - np.array(value[:-1], dtype='float32')
        return discounted_return, adv
Esempio n. 17
0
def main():
    """Train a CNN actor-critic agent with an RND intrinsic-reward bonus.

    Parses arguments with ``get_args``, starts ``args.num_worker``
    ``AtariEnvironment`` workers that are driven through pipes, warms up the
    observation normalizer with random actions, and then loops forever:

    1. collect an ``args.num_step`` rollout from every worker,
    2. compute and normalize the intrinsic rewards,
    3. build extrinsic/intrinsic targets and the combined advantage,
    4. update the observation normalizer,
    5. call ``train_model``.

    Model, RND predictor and RND target weights are saved every
    ``args.save_interval`` updates.  The training loop has no exit condition,
    so this function does not return on its own.
    """
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # Throwaway env, created only to read the observation/action dimensions.
    env = gym.make(args.env_name)

    input_size = env.observation_space.shape  # observation shape reported by the env
    output_size = env.action_space.n  # number of discrete actions

    # NOTE(review): Breakout is trained with one action fewer than the env
    # reports; the reason is not visible here — confirm it is intended.
    if 'Breakout' in args.env_name:
        output_size -= 1

    env.close()

    is_render = False
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    # Checkpoint files: policy, RND predictor and RND target network.
    model_path = os.path.join(args.save_dir, args.env_name + '.model')
    predictor_path = os.path.join(args.save_dir, args.env_name + '.pred')
    target_path = os.path.join(args.save_dir, args.env_name + '.target')

    writer = SummaryWriter(log_dir=args.log_dir)

    # Running statistics used to normalize intrinsic rewards and observations.
    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    discounted_reward = RewardForwardFilter(args.ext_gamma)

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    rnd = RNDModel(input_size, output_size)
    model = model.to(device)
    rnd = rnd.to(device)
    # Only the policy and the RND predictor are optimized; the RND target
    # network's parameters are not passed to the optimizer.
    optimizer = optim.Adam(list(model.parameters()) +
                           list(rnd.predictor.parameters()),
                           lr=args.lr)

    # Only the policy weights are restored here; the RND networks are not.
    if args.load_model:
        if args.cuda:
            model.load_state_dict(torch.load(model_path))
        else:
            model.load_state_dict(torch.load(model_path, map_location='cpu'))

    # One worker per environment, each talking to this process over a Pipe.
    works = []
    parent_conns = []
    child_conns = []
    for idx in range(args.num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(args.env_name,
                                is_render,
                                idx,
                                child_conn,
                                sticky_action=args.sticky_action,
                                p=args.sticky_action_prob,
                                max_episode_steps=args.max_episode_steps)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    # Current state of every worker; starts as all zeros.
    states = np.zeros([args.num_worker, 4, 84, 84])

    sample_env_index = 0  # Sample Environment index to log
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize observation
    # Drive the workers with uniformly random actions and feed the observed
    # frames to obs_rms before any training happens.
    print('Initializes observation normalization...')
    next_obs = []
    for step in range(args.num_step * args.pre_obs_norm_steps):
        actions = np.random.randint(0, output_size, size=(args.num_worker, ))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            next_state, reward, done, realdone, log_reward = parent_conn.recv()
            # Keep only the slice at index 3 of the state's first axis.
            next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

        # Flush to the running statistics once a full rollout's worth of
        # frames (num_step * num_worker) has been collected.
        if len(next_obs) % (args.num_step * args.num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []

    print('Training...')
    while True:
        # Per-rollout buffers; each gets one entry per step.
        total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_action_probs = [], [], [], [], [], [], [], [], [], []
        global_step += (args.num_worker * args.num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(args.num_step):
            # States are scaled by 1/255 before being given to the model.
            actions, value_ext, value_int, action_probs = get_action(
                model, device,
                np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                next_state, reward, done, real_done, log_reward = parent_conn.recv(
                )
                next_states.append(next_state)
                rewards.append(reward)
                dones.append(done)
                real_dones.append(real_done)
                log_rewards.append(log_reward)
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # Intrinsic reward from the RND model on next observations that
            # are normalized with obs_rms and clipped to [-5, 5].
            intrinsic_reward = compute_intrinsic_reward(
                rnd, device,
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_index]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_action_probs.append(action_probs)

            states = next_states[:, :, :, :]

            # Episode statistics are logged for the sampled worker only.
            sample_rall += log_rewards[sample_env_index]

            sample_step += 1
            if real_dones[sample_env_index]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall,
                                  sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall,
                                  global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        # The value lists end up with num_step + 1 entries: one extra value
        # for the state reached after the final step of the rollout.
        _, value_ext, value_int, _ = get_action(model, device,
                                                np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        # Stack the per-step buffers and move the worker axis in front of the
        # step axis; image buffers are then flattened to one batch dimension.
        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape(
            [-1, 4, 84, 84])
        # Extrinsic rewards are clipped to [-1, 1].
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose(
            [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_action_probs = np.vstack(total_action_probs)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        # Feed the forward filter one step at a time (the transposed view
        # iterates over steps) and update reward_rms from the moments of the
        # filtered values.
        total_reward_per_env = np.array([
            discounted_reward.update(reward_per_step)
            for reward_per_step in total_int_reward.T
        ])
        mean, std, count = np.mean(total_reward_per_env), np.std(
            total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std**2, count)

        # normalize intrinsic reward
        # Divides by the running standard deviation only; no mean is removed.
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi',
                          np.sum(total_int_reward) / args.num_worker,
                          sample_episode)
        writer.add_scalar('data/int_reward_per_rollout',
                          np.sum(total_int_reward) / args.num_worker,
                          global_update)
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        writer.add_scalar('data/max_prob',
                          total_logging_action_probs.max(1).mean(),
                          sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculate
        ext_target, ext_adv = make_train_data(total_reward, total_done,
                                              total_ext_values, args.ext_gamma,
                                              args.gae_lambda, args.num_step,
                                              args.num_worker, args.use_gae)

        # intrinsic reward calculate
        # None Episodic
        # An all-zero "done" array is passed, so the intrinsic stream is never
        # cut at episode boundaries.
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values, args.int_gamma,
                                              args.gae_lambda, args.num_step,
                                              args.num_worker, args.use_gae)

        # add ext adv and int adv
        total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        # The next observations are normalized with the statistics that were
        # just updated in step 4.
        train_model(args, device, output_size, model, rnd, optimizer,
                    np.float32(total_state) / 255., ext_target, int_target,
                    total_action, total_adv,
                    ((total_next_obs - obs_rms.mean) /
                     np.sqrt(obs_rms.var)).clip(-5, 5), total_action_probs)

        # global_step grows by num_worker * num_step per update, so this
        # fires once every args.save_interval updates.
        if global_step % (args.num_worker * args.num_step *
                          args.save_interval) == 0:
            print('Now Global Step :{}'.format(global_step))
            torch.save(model.state_dict(), model_path)
            torch.save(rnd.predictor.state_dict(), predictor_path)
            torch.save(rnd.target.state_dict(), target_path)
Esempio n. 18
0
def main():
    """Train a CNN actor-critic agent with an RND intrinsic-reward bonus.

    Parses arguments with ``get_args``, opens an ``ObstacleTowerEnv`` once to
    read the observation/action dimensions, starts ``args.num_worker``
    ``AtariEnvironment`` workers driven through pipes, and prepares the
    reward/observation running statistics:

    * with ``args.load_rms`` they are loaded from ``reward_rms.pkl`` and
      ``obs_rms.pkl`` in the working directory;
    * otherwise fresh statistics are created, the observation statistics are
      warmed up with random actions, and both are dumped to those files.

    The training loop then repeats: collect an ``args.num_step`` rollout,
    compute and normalize intrinsic rewards, build targets/advantages, update
    the observation statistics and call ``train_model``.  Weights are saved
    every ``args.save_interval`` updates.  When ``args.terminate`` is set, the
    loop stops at the first save point whose ``global_step`` exceeds
    ``args.terminate_steps``, after dumping the running statistics.
    """
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')
    seed = np.random.randint(0, 100)

    # Throwaway env, opened only to read the observation/action dimensions.
    env = ObstacleTowerEnv('../ObstacleTower/obstacletower', worker_id=seed,
                           retro=True, config={'total-floors': 12}, greyscale=True, timeout_wait=300)
    env._flattener = ActionFlattener([2, 3, 2, 1])
    env._action_space = env._flattener.action_space
    input_size = env.observation_space.shape
    output_size = env.action_space.n

    env.close()

    is_render = False
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args.save_dir, exist_ok=True)
    model_path = os.path.join(args.save_dir, 'main.model')
    predictor_path = os.path.join(args.save_dir, 'main.pred')
    target_path = os.path.join(args.save_dir, 'main.target')

    writer = SummaryWriter()  # log_dir=args.log_dir)

    discounted_reward = RewardForwardFilter(args.ext_gamma)

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    rnd = RNDModel(input_size, output_size)
    model = model.to(device)
    rnd = rnd.to(device)
    # Only the policy and the RND predictor are optimized; the RND target
    # network's parameters are not passed to the optimizer.
    optimizer = optim.Adam(list(model.parameters()) + list(rnd.predictor.parameters()), lr=args.lr)

    # Only the policy weights are restored here; the RND networks are not.
    if args.load_model:
        print("Loading model...")
        if args.cuda:
            model.load_state_dict(torch.load(model_path))
        else:
            model.load_state_dict(torch.load(model_path, map_location='cpu'))

    # One worker per environment, each talking to this process over a Pipe.
    works = []
    parent_conns = []
    child_conns = []
    for idx in range(args.num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(
            args.env_name,
            is_render,
            idx,
            child_conn,
            sticky_action=args.sticky_action,
            p=args.sticky_action_prob,
            max_episode_steps=args.max_episode_steps)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    # Current state of every worker; starts as all zeros.
    states = np.zeros([args.num_worker, 4, 84, 84])

    sample_env_index = 0   # Sample Environment index to log
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    print("Load RMS =", args.load_rms)
    if args.load_rms:
        print("Loading RMS values for observation and reward normalization")
        # NOTE(security): dill.load executes arbitrary code from the file;
        # only load statistics files that this program wrote itself.
        with open('reward_rms.pkl', 'rb') as f:
            reward_rms = dill.load(f)
        with open('obs_rms.pkl', 'rb') as f:
            obs_rms = dill.load(f)
    else:
        reward_rms = RunningMeanStd()
        obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))

        # normalize observation
        # Drive the workers with uniformly random actions and feed the
        # observed frames to obs_rms before any training happens.
        print('Initializing observation normalization...')
        next_obs = []
        for step in range(args.num_step * args.pre_obs_norm_steps):
            actions = np.random.randint(0, output_size, size=(args.num_worker,))

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            for parent_conn in parent_conns:
                next_state, reward, done, realdone, log_reward = parent_conn.recv()
                # Keep only the slice at index 3 of the state's first axis.
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            # Flush once a full rollout's worth of frames has been collected.
            if len(next_obs) % (args.num_step * args.num_worker) == 0:
                next_obs = np.stack(next_obs)
                obs_rms.update(next_obs)
                next_obs = []
        with open('reward_rms.pkl', 'wb') as f:
            dill.dump(reward_rms, f)
        with open('obs_rms.pkl', 'wb') as f:
            dill.dump(obs_rms, f)

    print('Training...')
    while True:
        # Per-rollout buffers; each gets one entry per step.
        total_state = []
        total_reward = []
        total_done = []
        total_action = []
        total_int_reward = []
        total_next_obs = []
        total_ext_values = []
        total_int_values = []
        total_action_probs = []
        global_step += (args.num_worker * args.num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(args.num_step):
            # States are scaled by 1/255 before being given to the model.
            actions, value_ext, value_int, action_probs = get_action(model, device, np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                next_state, reward, done, real_done, log_reward = parent_conn.recv()
                next_states.append(next_state)
                rewards.append(reward)
                dones.append(done)
                real_dones.append(real_done)
                log_rewards.append(log_reward)
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # Intrinsic reward from the RND model on next observations that
            # are normalized with obs_rms and clipped to [-5, 5].
            intrinsic_reward = compute_intrinsic_reward(
                rnd, device,
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_index]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_action_probs.append(action_probs)

            states = next_states[:, :, :, :]

            # Episode statistics are logged for the sampled worker only.
            sample_rall += log_rewards[sample_env_index]

            sample_step += 1
            if real_dones[sample_env_index]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        # The value lists end up with num_step + 1 entries: one extra value
        # for the state reached after the final step of the rollout.
        _, value_ext, value_int, _ = get_action(model, device, np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        # Stack the per-step buffers and move the worker axis in front of the
        # step axis; image buffers are then flattened to one batch dimension.
        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        # Extrinsic rewards are clipped to [-1, 1].
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_action_probs = np.vstack(total_action_probs)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        # Feed the forward filter one step at a time (the transposed view
        # iterates over steps) and update reward_rms from the moments of the
        # filtered values.
        total_reward_per_env = np.array([discounted_reward.update(reward_per_step)
                                         for reward_per_step in total_int_reward.T])
        mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        # Divides by the running standard deviation only; no mean is removed.
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / args.num_worker, sample_episode)
        writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / args.num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        writer.add_scalar('data/max_prob', total_logging_action_probs.max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculate
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              args.ext_gamma,
                                              args.gae_lambda,
                                              args.num_step,
                                              args.num_worker,
                                              args.use_gae)

        # intrinsic reward calculate
        # None Episodic
        # An all-zero "done" array is passed, so the intrinsic stream is never
        # cut at episode boundaries.
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              args.int_gamma,
                                              args.gae_lambda,
                                              args.num_step,
                                              args.num_worker,
                                              args.use_gae)

        # add ext adv and int adv
        total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        # The next observations are normalized with the statistics that were
        # just updated in step 4.
        train_model(args, device, output_size, model, rnd, optimizer,
                    np.float32(total_state) / 255., ext_target, int_target, total_action,
                    total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                    total_action_probs)

        # global_step grows by num_worker * num_step per update, so this
        # fires once every args.save_interval updates.
        if global_step % (args.num_worker * args.num_step * args.save_interval) == 0:
            print('Now Global Step :{}'.format(global_step))
            torch.save(model.state_dict(), model_path)
            torch.save(rnd.predictor.state_dict(), predictor_path)
            torch.save(rnd.target.state_dict(), target_path)

            # NOTE: a disabled draft of numbered, incremental checkpoints
            # (<env_name><N>.model/.pred/.target) used to live here as a
            # string literal; it was never executed and has been removed.

            # Termination is only checked at save points, so the weights on
            # disk are always current when the loop exits.
            if args.terminate and (global_step > args.terminate_steps):
                with open('reward_rms.pkl', 'wb') as f:
                    dill.dump(reward_rms, f)
                with open('obs_rms.pkl', 'wb') as f:
                    dill.dump(obs_rms, f)
                break
Esempio n. 19
0
def main():
    """Train an RND agent on an Atari environment until a frame budget is hit.

    Parses arguments with ``parse_arguments``, builds an ``RNDAgent``, starts
    ``args.num_worker`` ``AtariEnvironment`` workers driven through pipes,
    warms up the observation normalizer with random actions, and then
    repeats: collect a ``num_step`` rollout, compute and normalize intrinsic
    rewards, build targets/advantages, update the observation statistics and
    call ``agent.train_model``.

    Only ``env_type == 'atari'`` and ``train_method == 'RND'`` are supported.
    With ``args.save_models`` the weights are saved every 1000 updates.  The
    loop ends once ``global_step`` reaches ``args.total_frames``.

    Raises:
        NotImplementedError: for any other environment type or train method.
    """

    args = parse_arguments()

    train_method = args.train_method
    env_id = args.env_id
    env_type = args.env_type

    if env_type == 'atari':
        # Throwaway env, created only to read observation/action dimensions.
        env = gym.make(env_id)
        input_size = env.observation_space.shape 
        output_size = env.action_space.n 
        env.close()
    else:
        raise NotImplementedError

    is_load_model = False
    is_render = False
    os.makedirs('models', exist_ok=True)
    # Checkpoint files: policy, RND predictor and RND target network.
    model_path = 'models/{}.model'.format(env_id)
    predictor_path = 'models/{}.pred'.format(env_id)
    target_path = 'models/{}.target'.format(env_id)

    results_dir = os.path.join('outputs', args.env_id)
    os.makedirs(results_dir, exist_ok=True)
    logger = Logger(results_dir)
    writer = SummaryWriter(os.path.join(results_dir, 'tensorboard', args.env_id))   

    # Copy the hyperparameters out of args into locals.
    use_cuda = args.use_gpu
    use_gae = args.use_gae
    use_noisy_net = args.use_noisynet
    lam = args.lam
    num_worker = args.num_worker
    num_step = args.num_step
    ppo_eps = args.ppo_eps
    epoch = args.epoch
    mini_batch = args.minibatch 
    # Rollout size (num_step * num_worker) divided by the minibatch count.
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = args.learning_rate
    entropy_coef = args.entropy
    gamma = args.gamma
    int_gamma = args.int_gamma
    clip_grad_norm = args.clip_grad_norm
    ext_coef = args.ext_coef
    int_coef = args.int_coef
    sticky_action = args.sticky_action
    action_prob = args.action_prob
    life_done = args.life_done
    pre_obs_norm_step = args.obs_norm_step

    # Running statistics used to normalize intrinsic rewards and observations.
    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    discounted_reward = RewardForwardFilter(int_gamma)

    # Pick the agent class; it is instantiated further below.
    if args.train_method == 'RND':
        agent = RNDAgent
    else:
        raise NotImplementedError

    # From here on env_type holds the worker class, not the string from args.
    if args.env_type == 'atari':
        env_type = AtariEnvironment
    else:
        raise NotImplementedError

    agent = agent(
        input_size,
        output_size,
        num_worker,
        num_step,
        gamma,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=batch_size,
        ppo_eps=ppo_eps,
        use_cuda=use_cuda,
        use_gae=use_gae,
        use_noisy_net=use_noisy_net
    )

    # One worker per environment, each talking to this process over a Pipe.
    logger.info('Start to initialize workers')
    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(env_id, is_render, idx, child_conn, 
            sticky_action=sticky_action, p=action_prob, life_done=life_done, 
            max_step_per_episode=args.max_step_per_episode)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    # Current state of every worker; starts as all zeros.
    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0  # index of the worker whose episodes are logged
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize obs
    # Drive the workers with uniformly random actions and feed the observed
    # frames to obs_rms before any training happens.
    logger.info('Start to initailize observation normalization parameter.....')
    next_obs = []
    for step in range(num_step * pre_obs_norm_step):
        actions = np.random.randint(0, output_size, size=(num_worker,))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            # Worker reply: state, reward, done, real_done, log_reward
            # (names taken from how the rollout loop below stores them).
            s, r, d, rd, lr = parent_conn.recv()
            # Keep only the slice at index 3 of the state's first axis.
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))

        # Flush once a full rollout's worth of frames has been collected.
        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    logger.info('End to initalize...')

    pbar = tqdm.tqdm(total=args.total_frames)
    while True:
        logger.info('Iteration: {}'.format(global_update))
        # Per-rollout buffers; each gets one entry per step.
        total_state, total_reward, total_done, total_next_state, \
            total_action, total_int_reward, total_next_obs, total_ext_values, \
            total_int_values, total_policy, total_policy_np = \
            [], [], [], [], [], [], [], [], [], [], []
        global_step += (num_worker * num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(num_step):
            # States are scaled by 1/255 before being given to the agent.
            actions, value_ext, value_int, policy = agent.get_action(np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = \
                [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs.append(s[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # Intrinsic reward from the agent on next observations that are
            # normalized with obs_rms and clipped to [-5, 5].
            intrinsic_reward = agent.compute_intrinsic_reward(
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            # The policy output is kept twice: as returned by the agent (for
            # training) and as a numpy copy (for logging).
            total_policy.append(policy)
            total_policy_np.append(policy.cpu().numpy())

            states = next_states[:, :, :, :]

            # Episode statistics are logged for the sampled worker only.
            sample_rall += log_rewards[sample_env_idx]

            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                writer.add_scalar('data/returns_vs_frames', sample_rall, global_step)
                writer.add_scalar('data/lengths_vs_frames', sample_step, global_step)
                writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        # The value lists end up with num_step + 1 entries: one extra value
        # for the state reached after the final step of the rollout.
        _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)

        # Stack the per-step buffers and move the worker axis in front of the
        # step axis; image buffers are then flattened to one batch dimension.
        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        # Extrinsic rewards are clipped to [-1, 1].
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_policy = np.vstack(total_policy_np)
        
        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        # Feed the forward filter one step at a time (the transposed view
        # iterates over steps) and update reward_rms from the moments of the
        # filtered values.
        total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in
                                         total_int_reward.T])
        mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        # Divides by the running standard deviation only; no mean is removed.
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
        writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)

        # logging Max action probability
        # NOTE(review): assumes softmax() normalizes each row of the 2-D
        # array — confirm against the softmax implementation in scope.
        writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculate
        ext_target, ext_adv = make_train_data(total_reward, total_done, 
            total_ext_values, gamma, num_step, num_worker)

        # intrinsic reward calculate
        # None Episodic
        # An all-zero "done" array is passed, so the intrinsic stream is never
        # cut at episode boundaries.
        int_target, int_adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward),
            total_int_values, int_gamma, num_step, num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)

        # Step 5. Training!
        # The next observations are normalized with the statistics that were
        # just updated in step 4.
        agent.train_model(np.float32(total_state) / 255., ext_target, int_target, total_action,
                          total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                          total_policy)

        # Every 1000 updates: write a numbered snapshot of the policy and
        # overwrite the "latest" policy / predictor / target files.
        if args.save_models and global_update % 1000 == 0:
            torch.save(agent.model.state_dict(), 'models/{}-{}.model'.format(env_id, global_update))
            logger.info('Now Global Step :{}'.format(global_step))
            torch.save(agent.model.state_dict(), model_path)
            torch.save(agent.rnd.predictor.state_dict(), predictor_path)
            torch.save(agent.rnd.target.state_dict(), target_path)

        # Advance the progress bar by the frames of this rollout and stop
        # once the frame budget is reached.
        pbar.update(num_worker * num_step)
        if global_step >= args.total_frames:
            break

    pbar.close()
Esempio n. 20
0
class Agent(object):
    """Agent that trains an actor/critic policy with an RND intrinsic reward.

    The agent owns the environment interaction (``sample_env`` / ``record``),
    the running statistics used to normalise observations and intrinsic
    rewards, and the optimisation loop (``train`` / ``init_encoder``).

    The environment is expected to report ``info['ale.lives']`` on every step;
    the steps that directly follow a lost life are not recorded.
    """

    # Number of lives the bookkeeping assumes at the start of an episode.
    _START_LIVES = 6
    # Number of environment steps that are dropped after a life is lost.
    _SKIP_STEPS_AFTER_LIFE_LOSS = 18
    # Number of reshuffled passes made over each collected rollout.
    _EPOCHS_PER_ROLLOUT = 4

    def __init__(self, env, policy, rnd, replay_buffer, logger, args):
        """Store collaborators and read the hyper-parameters from ``args``.

        Args:
            env: environment exposing ``reset``, ``step`` and ``action_space``.
            policy: actor/critic model (``sample``, ``train_actor``, ...).
            rnd: intrinsic-reward model (``get_rewards``, ``train``).
            replay_buffer: stored on the instance; not used by this class.
            logger: object exposing ``log(group, names, values)``.
            args: mapping with the keys ``use_encoder``,
                ``encoder_train_limit``, ``num_random_samples``, ``log_rate``.
        """
        self.env = env
        # Models
        self.policy = policy
        self.rnd = rnd
        # Utils
        self.replay_buffer = replay_buffer
        self.logger = logger

        self.obs_running_mean = RunningMeanStd((84, 84, 1))
        self.rew_running_mean = RunningMeanStd(())

        self.last_enc_loss = None
        self.train_enc_next_itr = False

        # Args
        self.use_encoder = args['use_encoder']
        self.encoder_train_limit = args['encoder_train_limit']

        self.num_random_samples = args['num_random_samples']
        self.log_rate = args['log_rate']

    def set_session(self, sess):
        """Attach ``sess`` to the agent and forward it to both models."""
        self.sess = sess
        self.policy.set_session(sess)
        self.rnd.set_sess(sess)

    def _normalize_obs(self, obs):
        """Whiten ``obs`` with the running observation stats, clipped to [-5, 5].

        Shared by rollout collection and RND training so that the predictor is
        trained on inputs scaled exactly like the ones used to compute the
        intrinsic reward (division by the standard deviation, not the
        variance).
        """
        whitened = ((obs - self.obs_running_mean.mean) /
                    np.sqrt(self.obs_running_mean.var))
        return np.clip(whitened, -5, 5)

    @staticmethod
    def _stack_batches(chunks):
        """Pack a list of batch arrays into one array that iterates per batch.

        Equal-length batches are stacked into a regular ndarray. When the last
        batch is shorter, a 1-D object array holding the individual batches is
        returned instead, because ``np.array`` refuses ragged input.
        """
        if len({len(chunk) for chunk in chunks}) <= 1:
            return np.array(chunks)
        ragged = np.empty(len(chunks), dtype=object)
        # Element-wise assignment keeps NumPy from trying to broadcast.
        for k, chunk in enumerate(chunks):
            ragged[k] = chunk
        return ragged

    def batch(self, eo, a, er, ir, en, d, batch_size, shuffle=True):
        """Split six aligned datasets into mini-batches.

        Args:
            eo, a, er, ir, en, d: equally long sequences (observations,
                actions, extrinsic rewards, intrinsic rewards, next
                observations, done flags).
            batch_size: maximum number of samples per batch.
            shuffle: when true, apply one shared random permutation to all
                six datasets before splitting, so samples stay aligned.

        Returns:
            A 6-tuple of arrays; iterating over each yields its batches. The
            final batch is shorter when the data length is not a multiple of
            ``batch_size``.
        """
        dsets = [eo, a, er, ir, en, d]
        if shuffle:
            indxs = np.arange(len(eo))
            np.random.shuffle(indxs)
            dsets = [np.array(dset)[indxs] for dset in dsets]

        # batch up data
        return tuple(
            self._stack_batches([
                np.array(dset[i:i + batch_size])
                for i in range(0, len(dset), batch_size)
            ]) for dset in dsets)

    def record(self, num_samples):
        """Roll out the policy for one episode and return the raw rewards.

        Unlike ``sample_env`` this neither updates the running statistics nor
        normalises the rewards. The rollout stops when the episode ends or
        after ``num_samples`` recorded steps.

        Returns:
            ``(int_rew_n, ext_rew_n, obs_n)`` as Python lists.
        """
        done, i = False, 0
        n_lives, ignore = self._START_LIVES, 0
        obs_n, ext_rew_n, int_rew_n = [], [], []

        obs = self.env.reset()
        while not done and i < num_samples:
            act = self.policy.sample([obs])
            n_obs, rew, done, info = self.env.step(act)

            int_rew = self.rnd.get_rewards([self._normalize_obs(n_obs)])[0]

            # dont record when agent dies
            if info['ale.lives'] != n_lives:
                ignore = self._SKIP_STEPS_AFTER_LIFE_LOSS
                n_lives -= 1
            if not ignore:
                i += 1
                obs_n.append(obs)
                ext_rew_n.append(rew)
                int_rew_n.append(int_rew)
                if done:
                    # Leave the environment freshly reset for the next user.
                    self.env.reset()
            else:
                ignore -= 1

            # Advance to the next observation; without this the policy would
            # be queried with the initial observation on every step.
            obs = n_obs

        self.logger.log('env', ['int_rewards', 'ext_rewards'],
                        [int_rew_n, ext_rew_n])
        return int_rew_n, ext_rew_n, obs_n

    def sample_env(self,
                   batch_size,
                   num_samples,
                   shuffle,
                   algorithm='algorithm'):
        """Collect one rollout and return normalised training data.

        Args:
            batch_size: unused; kept for interface compatibility.
            num_samples: maximum number of recorded steps.
            shuffle: unused; kept for interface compatibility.
            algorithm: ``'algorithm'`` to act with the policy, anything else
                (conventionally ``'random'``) to sample random actions.

        Returns:
            ``(obs_n, act_n, ext_rew_n, int_rew_n, n_obs_n, dones_n)`` where
            extrinsic rewards are clipped to [-1, 1] and intrinsic rewards are
            standardised with the running reward statistics.
        """
        done, i = False, 0
        n_lives, ignore = self._START_LIVES, 0
        obs_n, act_n, ext_rew_n, int_rew_n, n_obs_n, dones_n = [], [], [], [], [], []

        # policy rollout
        obs = self.env.reset()
        while not done and i < num_samples:
            # ``ignore`` counts down from the skip length to 0 and is never
            # negative, so the policy must be used whenever it is <= 0; the
            # skipped steps after a lost life fall back to random actions.
            if algorithm == 'algorithm' and ignore <= 0:
                act = self.policy.sample([obs])
            else:  # algorithm == 'random'
                act = self.env.action_space.sample()

            n_obs, rew, done, info = self.env.step(act)

            # format obs
            int_rew = self.rnd.get_rewards([self._normalize_obs(n_obs)])[0]

            # dont record when agent dies
            if info['ale.lives'] != n_lives:
                ignore = self._SKIP_STEPS_AFTER_LIFE_LOSS
                n_lives -= 1
            if not ignore:
                i += 1
                self.rew_running_mean.update(np.array([int_rew]))

                obs_n.append(obs)
                ext_rew_n.append(rew)
                n_obs_n.append(n_obs)
                act_n.append(act)
                dones_n.append(done)
                int_rew_n.append(int_rew)
                if done:
                    # Leave the environment freshly reset for the next user.
                    self.env.reset()
            else:
                ignore -= 1

            obs = n_obs

        # log before normalization
        self.logger.log('env', ['int_rewards', 'ext_rewards'],
                        [int_rew_n, ext_rew_n])

        # normalize
        int_rew_n = (np.asarray(int_rew_n) - self.rew_running_mean.mean
                     ) / np.sqrt(self.rew_running_mean.var)
        ext_rew_n = np.clip(ext_rew_n, -1, 1)

        # Nothing was recorded: keep the statistics untouched rather than
        # feeding them an empty batch.
        if obs_n:
            self.obs_running_mean.update(np.array(obs_n))

        return obs_n, act_n, ext_rew_n, int_rew_n, n_obs_n, dones_n

    def get_data(self, batch_size, num_samples, itr):
        """Collect a rollout: random for the first iterations, policy after."""
        algorithm = 'random' if itr < self.num_random_samples else 'algorithm'
        return self.sample_env(batch_size,
                               num_samples,
                               shuffle=True,
                               algorithm=algorithm)

    def init_obsmean(self):
        """Seed the observation statistics with one random-action episode."""
        obs, done = self.env.reset(), False
        while not done:
            act = self.env.action_space.sample()
            obs, _, done, _ = self.env.step(act)
            # Every other call site hands ``update`` an array with a leading
            # sample axis, so wrap the single observation as a batch of one.
            # NOTE(review): RunningMeanStd is defined elsewhere -- confirm
            # that ``update`` reduces over axis 0.
            self.obs_running_mean.update(np.array([obs]))

    def init_encoder(self, batch_size, num_samples, loss_threshold):
        """Pre-train the policy's action head on random rollouts.

        Training continues until the mean loss of one pass over a rollout
        falls below ``loss_threshold`` or at least
        ``self.encoder_train_limit`` gradient steps have been taken (the limit
        is only checked between rollouts).
        """
        threshold_met, i = False, 0

        while not threshold_met and i < self.encoder_train_limit:
            rollout = self.sample_env(batch_size,
                                      num_samples,
                                      shuffle=True,
                                      algorithm='random')
            for _ in range(self._EPOCHS_PER_ROLLOUT):
                enc_obs, act_n, _, _, enc_n_obs, _ = self.batch(
                    *rollout, batch_size, shuffle=True)

                losses = []
                for b_eobs, b_acts, b_enobs in zip(enc_obs, act_n, enc_n_obs):
                    enc_loss = np.mean(
                        self.policy.train_acthead(b_eobs, b_enobs, b_acts))
                    losses.append(enc_loss)
                    self.logger.log('encoder', ['loss'], [enc_loss])
                    i += 1

                # An empty pass has no mean loss and cannot meet the threshold.
                if losses and np.mean(losses) < loss_threshold:
                    threshold_met = True

        if threshold_met: print('Encoder init threshold was met...')
        else: print('Encoder init threshold was NOT met...')

    def train(self, batch_size, num_samples, encoder_loss_thresh, itr, writer):
        """Collect one rollout and run the RND / critic / actor updates on it.

        Args:
            batch_size: mini-batch size.
            num_samples: maximum number of recorded environment steps.
            encoder_loss_thresh: when the action-head loss on the last batch
                exceeds this value, the encoder is fine-tuned during the next
                call.
            itr: current training iteration (selects random vs. policy
                rollouts and gates logging).
            writer: summary writer exposing ``add_summary(summary, step)``.
        """
        rollout = self.get_data(batch_size, num_samples, itr)

        last_batch = None
        for _ in range(self._EPOCHS_PER_ROLLOUT):
            # reshuffle and batch
            enc_obs, act_n, ext_rew_n, int_rew, enc_n_obs, dones_n = self.batch(
                *rollout, batch_size, shuffle=True)
            for b_eobs, b_acts, b_erew, b_irew, b_enobs, b_dones in zip(
                    enc_obs, act_n, ext_rew_n, int_rew, enc_n_obs, dones_n):

                # norm and clip for rnd (same scaling as reward computation)
                rnd_loss = self.rnd.train(self._normalize_obs(b_eobs))

                total_r = b_erew + b_irew

                # norm for policy
                ac_obs, ac_n_obs = b_eobs / 255., b_enobs / 255.

                critic_loss = self.policy.train_critic(ac_obs, ac_n_obs,
                                                       total_r, b_dones)
                adv = self.policy.estimate_adv(ac_obs, total_r, ac_n_obs,
                                               b_dones)
                actor_loss, summ = self.policy.train_actor(ac_obs, b_acts, adv)
                writer.add_summary(summ, itr)

                # fine tune the encoder when the previous call requested it
                if self.use_encoder and self.train_enc_next_itr:
                    enc_loss = self.policy.train_acthead(
                        ac_obs, ac_n_obs, b_acts)
                    self.logger.log('encoder', ['loss'], [enc_loss])

                # log data
                if itr % self.log_rate == 0:
                    self.logger.log('density', ['loss'], [rnd_loss])
                    self.logger.log('policy', ['actor_loss', 'critic_loss'],
                                    [actor_loss, critic_loss])

                last_batch = (b_eobs, b_enobs, b_acts)

        self.train_enc_next_itr = False
        # if encoder becomes inaccurate then fine tune next training itr
        # (skipped when the rollout produced no batch at all)
        if self.use_encoder and last_batch is not None:
            # NOTE(review): this evaluates on raw observations while the
            # fine-tuning step above feeds observations scaled by 1/255 --
            # confirm which scaling the action head expects.
            enc_loss = self.policy.actnn_loss(*last_batch)
            if np.mean(enc_loss) > encoder_loss_thresh:
                self.train_enc_next_itr = True
                print('Updating Encoder....')
Esempio n. 21
0
    model = MLPBase(args.num_obs, args.num_actions, args.hidden_dim)
elif args.model == "d2rl":
    model = D2RLNet(args.num_obs, args.num_actions, args.hidden_dim,
                    args.num_layers)
else:
    raise ValueError('Model Not Supported')
optim = torch.optim.Adam(model.parameters(), lr=args.lr)

# Optionally warm-start from a saved checkpoint directory.
# NOTE(review): torch.load unpickles the file; only point load_model_dir at
# checkpoints from a trusted source.
if args.load_model_dir is not None:
    model.load_state_dict(
        torch.load(f'{args.load_model_dir}/model.h5',
                   map_location=torch.device(args.device)))

model.to(device)

# Scalar running statistics for rewards; per-feature statistics for
# observations unless observations are one-hot encoded.
reward_normalizer = RunningMeanStd(shape=())
if not args.one_hot:
    obs_normalizer = RunningMeanStd(shape=(args.num_obs, ),
                                    path=args.load_model_dir)
else:
    obs_normalizer = None

# Main loop
i = 0
for i in range(args.num_iterations):
    if i != 0 and i % 10 == 0:
        game_player.reset(
            args, shared_obs,
            shared_legals)  #Attempt at hacky workaround to C memory leak
    # Run num_steps of the game in each worker and accumulate results in
    # the data arrays