def __init__(self, env: Any, agent: Any, save_interval: int = 1000, train_episode: int = 10**9,
             num_eval_episode: int = 3, episode_len: int = 3000, pre_step: int = 10000,
             gamma: float = 0.995, int_gamma: float = 0.995, lam: float = 0.97,
             device=torch.device('cpu'), int_coef: float = 1, ext_coef: float = 0.3,
             eval_interval: int = 10**4, seed: int = 0):
    self.save_interval = save_interval
    self.eval_interval = eval_interval

    # prepare envs
    self.env = env
    self.env.seed(seed)
    self.env_test = deepcopy(env)
    self.env_test.seed(2**31 - seed)
    self.agent = agent

    # prepare step counters
    self.global_step = 0
    self.step_in_episode = 0
    self.episode_so_far = 0
    self.episode_len = episode_len  # length of an episode
    self.num_eval_episode = num_eval_episode
    self.train_episode = train_episode
    self.pre_step = pre_step  # number of steps used to measure variance of states

    self.reward_rms = RunningMeanStd()
    obs_sampled = self.env.reset()
    self.obs_rms = RunningMeanStd(shape=[1] + list(obs_sampled.shape))

    self.device = device
    self.lam = lam
    self.gamma = gamma
    self.int_gamma = int_gamma  # gamma for intrinsic reward

    # ratio of intrinsic and extrinsic rewards
    self.int_coef = int_coef
    self.ext_coef = ext_coef

    self.reward_in_episode = 0.0
    self.returns = {'step': [], 'return': []}
def __init__(self, env, policy, rnd, replay_buffer, logger, args):
    self.env = env

    # Models
    self.policy = policy
    self.rnd = rnd

    # Utils
    self.replay_buffer = replay_buffer
    self.logger = logger
    self.obs_running_mean = RunningMeanStd((84, 84, 1))
    self.rew_running_mean = RunningMeanStd(())
    self.last_enc_loss = None
    self.train_enc_next_itr = False

    # Args
    self.use_encoder = args['use_encoder']
    self.encoder_train_limit = args['encoder_train_limit']
    self.num_random_samples = args['num_random_samples']
    self.log_rate = args['log_rate']
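# NOTE: every snippet in this file relies on a RunningMeanStd tracker that is defined
# elsewhere. Below is a minimal sketch, assuming the parallel-moments update used by the
# OpenAI Baselines implementation and the update / update_from_moments / mean / var
# interface called throughout. The constructor signature varies across the snippets (some
# pass the shape positionally, some by keyword, one feeds torch tensors), so treat this as
# an illustration of the numpy variant rather than the exact class used here.
import numpy as np


class RunningMeanStd:
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        # x is a batch of samples with shape (batch, *shape)
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        self.update_from_moments(batch_mean, batch_var, x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        # Chan et al. parallel-variance combination of the running and batch moments
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m2 / tot_count, tot_count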
class DDPG:
    def __init__(self, beta, epsilon, learning_rate, gamma, tau, hidden_size_dim0,
                 hidden_size_dim1, num_inputs, action_space, train_mode, alpha, replay_size,
                 optimizer, two_player, normalize_obs=True, normalize_returns=False,
                 critic_l2_reg=1e-2):
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
            torch.backends.cudnn.enabled = False
            self.Tensor = torch.cuda.FloatTensor
        else:
            self.device = torch.device('cpu')
            self.Tensor = torch.FloatTensor

        self.alpha = alpha
        self.train_mode = train_mode
        self.num_inputs = num_inputs
        self.action_space = action_space
        self.critic_l2_reg = critic_l2_reg

        self.actor = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs,
                           self.action_space).to(self.device)
        self.adversary = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs,
                               self.action_space).to(self.device)

        if self.train_mode:
            self.actor_target = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs,
                                      self.action_space).to(self.device)
            self.actor_bar = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs,
                                   self.action_space).to(self.device)
            self.actor_outer = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs,
                                     self.action_space).to(self.device)

            if optimizer == 'SGLD':
                self.actor_optim = SGLD(self.actor.parameters(), lr=1e-4, noise=epsilon, alpha=0.999)
            elif optimizer == 'RMSprop':
                self.actor_optim = RMSprop(self.actor.parameters(), lr=1e-4, alpha=0.999)
            else:
                self.actor_optim = ExtraAdam(self.actor.parameters(), lr=1e-4)

            self.critic = Critic(hidden_size_dim0, hidden_size_dim1, self.num_inputs,
                                 self.action_space).to(self.device)
            self.critic_target = Critic(hidden_size_dim0, hidden_size_dim1, self.num_inputs,
                                        self.action_space).to(self.device)
            self.critic_optim = Adam(self.critic.parameters(), lr=1e-3, weight_decay=critic_l2_reg)

            self.adversary_target = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs,
                                          self.action_space).to(self.device)
            self.adversary_bar = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs,
                                       self.action_space).to(self.device)
            self.adversary_outer = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs,
                                         self.action_space).to(self.device)

            if optimizer == 'SGLD':
                self.adversary_optim = SGLD(self.adversary.parameters(), lr=1e-4, noise=epsilon, alpha=0.999)
            elif optimizer == 'RMSprop':
                self.adversary_optim = RMSprop(self.adversary.parameters(), lr=1e-4, alpha=0.999)
            else:
                self.adversary_optim = ExtraAdam(self.adversary.parameters(), lr=1e-4)

            hard_update(self.adversary_target, self.adversary)  # make sure targets start with the same weights
            hard_update(self.actor_target, self.actor)
            hard_update(self.critic_target, self.critic)

        self.gamma = gamma
        self.tau = tau
        self.beta = beta
        self.epsilon = epsilon
        self.learning_rate = learning_rate
        self.normalize_observations = normalize_obs
        self.normalize_returns = normalize_returns
        self.optimizer = optimizer
        self.two_player = two_player

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd(shape=num_inputs)
        else:
            self.obs_rms = None

        if self.normalize_returns:
            self.ret_rms = RunningMeanStd(shape=1)
            self.ret = 0
            self.cliprew = 10.0
        else:
            self.ret_rms = None

        self.memory = ReplayMemory(replay_size)

    def eval(self):
        self.actor.eval()
        self.adversary.eval()
        if self.train_mode:
            self.critic.eval()

    def train(self):
        self.actor.train()
        self.adversary.train()
        if self.train_mode:
            self.critic.train()

    def select_action(self, state, action_noise=None, param_noise=None, mdp_type='mdp'):
        state = normalize(Variable(state).to(self.device), self.obs_rms, self.device)

        if mdp_type != 'mdp':
            if self.optimizer == 'SGLD' and self.two_player:
                mu = self.actor_outer(state)
            else:
                mu = self.actor(state)
            mu = mu.data

            if action_noise is not None:
                mu += self.Tensor(action_noise()).to(self.device)
            mu = mu.clamp(-1, 1) * (1 - self.alpha)

            if self.optimizer == 'SGLD' and self.two_player:
                adv_mu = self.adversary_outer(state)
            else:
                adv_mu = self.adversary(state)
            adv_mu = adv_mu.data.clamp(-1, 1) * self.alpha

            mu += adv_mu
        else:
            if self.optimizer == 'SGLD' and self.two_player:
                mu = self.actor_outer(state)
            else:
                mu = self.actor(state)
            mu = mu.data

            if action_noise is not None:
                mu += self.Tensor(action_noise()).to(self.device)
            mu = mu.clamp(-1, 1)

        return mu

    def update_robust_non_flip(self, state_batch, action_batch, reward_batch, mask_batch,
                               next_state_batch, mdp_type, robust_update_type):
        # TRAIN CRITIC
        if robust_update_type == 'full':
            next_action_batch = (1 - self.alpha) * self.actor_target(next_state_batch) \
                + self.alpha * self.adversary_target(next_state_batch)
            next_state_action_values = self.critic_target(next_state_batch, next_action_batch)
            expected_state_action_batch = reward_batch + self.gamma * mask_batch * next_state_action_values

            self.critic_optim.zero_grad()
            state_action_batch = self.critic(state_batch, action_batch)
            value_loss = F.mse_loss(state_action_batch, expected_state_action_batch)
            value_loss.backward()
            self.critic_optim.step()
            value_loss = value_loss.item()
        else:
            value_loss = 0

        # TRAIN ADVERSARY
        self.adversary_optim.zero_grad()
        with torch.no_grad():
            if self.optimizer == 'SGLD' and self.two_player:
                real_action = self.actor_outer(next_state_batch)
            else:
                real_action = self.actor_target(next_state_batch)
        action = (1 - self.alpha) * real_action + self.alpha * self.adversary(next_state_batch)
        adversary_loss = self.critic(state_batch, action)
        adversary_loss = adversary_loss.mean()
        adversary_loss.backward()
        self.adversary_optim.step()
        adversary_loss = adversary_loss.item()

        # TRAIN ACTOR
        self.actor_optim.zero_grad()
        with torch.no_grad():
            if self.optimizer == 'SGLD' and self.two_player:
                adversary_action = self.adversary_outer(next_state_batch)
            else:
                adversary_action = self.adversary_target(next_state_batch)
        action = (1 - self.alpha) * self.actor(next_state_batch) + self.alpha * adversary_action
        policy_loss = -self.critic(state_batch, action)
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()
        policy_loss = policy_loss.item()

        return value_loss, policy_loss, adversary_loss

    def update_robust_flip(self, state_batch, action_batch, reward_batch, mask_batch,
                           next_state_batch, adversary_update, mdp_type, robust_update_type):
        # TRAIN CRITIC
        if robust_update_type == 'full':
            next_action_batch = (1 - self.alpha) * self.actor_target(next_state_batch) \
                + self.alpha * self.adversary_target(next_state_batch)
            next_state_action_values = self.critic_target(next_state_batch, next_action_batch)
            expected_state_action_batch = reward_batch + self.gamma * mask_batch * next_state_action_values

            self.critic_optim.zero_grad()
            state_action_batch = self.critic(state_batch, action_batch)
            value_loss = F.mse_loss(state_action_batch, expected_state_action_batch)
            value_loss.backward()
            self.critic_optim.step()
            value_loss = value_loss.item()
        else:
            value_loss = 0

        if adversary_update:
            # TRAIN ADVERSARY
            self.adversary_optim.zero_grad()
            with torch.no_grad():
                real_action = self.actor_target(next_state_batch)
            action = (1 - self.alpha) * real_action + self.alpha * self.adversary(next_state_batch)
            adversary_loss = self.critic(state_batch, action)
            adversary_loss = adversary_loss.mean()
            adversary_loss.backward()
            self.adversary_optim.step()
            adversary_loss = adversary_loss.item()
            policy_loss = 0
        else:
            # TRAIN ACTOR
            self.actor_optim.zero_grad()
            with torch.no_grad():
                adversary_action = self.adversary_target(next_state_batch)
            action = (1 - self.alpha) * self.actor(next_state_batch) + self.alpha * adversary_action
            policy_loss = -self.critic(state_batch, action)
            policy_loss = policy_loss.mean()
            policy_loss.backward()
            self.actor_optim.step()
            policy_loss = policy_loss.item()
            adversary_loss = 0

        return value_loss, policy_loss, adversary_loss

    def update_non_robust(self, state_batch, action_batch, reward_batch, mask_batch,
                          next_state_batch):
        # TRAIN CRITIC
        next_action_batch = self.actor_target(next_state_batch)
        next_state_action_values = self.critic_target(next_state_batch, next_action_batch)
        expected_state_action_batch = reward_batch + self.gamma * mask_batch * next_state_action_values

        self.critic_optim.zero_grad()
        state_action_batch = self.critic(state_batch, action_batch)
        value_loss = F.mse_loss(state_action_batch, expected_state_action_batch)
        value_loss.backward()
        self.critic_optim.step()

        # TRAIN ACTOR
        self.actor_optim.zero_grad()
        action = self.actor(next_state_batch)
        policy_loss = -self.critic(state_batch, action)
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()
        policy_loss = policy_loss.item()

        adversary_loss = 0
        return value_loss.item(), policy_loss, adversary_loss

    def store_transition(self, state, action, mask, next_state, reward):
        B = state.shape[0]
        for b in range(B):
            self.memory.push(state[b], action[b], mask[b], next_state[b], reward[b])
            if self.normalize_observations:
                self.obs_rms.update(state[b].cpu().numpy())
            if self.normalize_returns:
                self.ret = self.ret * self.gamma + reward[b]
                self.ret_rms.update(np.array([self.ret]))
                if mask[b] == 0:  # if terminal is True
                    self.ret = 0

    def update_parameters(self, batch_size, sgld_outer_update, mdp_type='mdp',
                          exploration_method='mdp'):
        transitions = self.memory.sample(batch_size)
        batch = Transition(*zip(*transitions))

        if mdp_type != 'mdp':
            robust_update_type = 'full'
        elif exploration_method != 'mdp':
            robust_update_type = 'adversary'
        else:
            robust_update_type = None

        state_batch = normalize(Variable(torch.stack(batch.state)).to(self.device),
                                self.obs_rms, self.device)
        action_batch = Variable(torch.stack(batch.action)).to(self.device)
        reward_batch = normalize(Variable(torch.stack(batch.reward)).to(self.device).unsqueeze(1),
                                 self.ret_rms, self.device)
        mask_batch = Variable(torch.stack(batch.mask)).to(self.device).unsqueeze(1)
        next_state_batch = normalize(Variable(torch.stack(batch.next_state)).to(self.device),
                                     self.obs_rms, self.device)

        if self.normalize_returns:
            reward_batch = torch.clamp(reward_batch, -self.cliprew, self.cliprew)

        value_loss = 0
        policy_loss = 0
        adversary_loss = 0
        if robust_update_type is not None:
            _value_loss, _policy_loss, _adversary_loss = self.update_robust_non_flip(
                state_batch, action_batch, reward_batch, mask_batch, next_state_batch,
                mdp_type, robust_update_type)
            value_loss += _value_loss
            policy_loss += _policy_loss
            adversary_loss += _adversary_loss
        if robust_update_type != 'full':
            _value_loss, _policy_loss, _adversary_loss = self.update_non_robust(
                state_batch, action_batch, reward_batch, mask_batch, next_state_batch)
            value_loss += _value_loss
            policy_loss += _policy_loss
            adversary_loss += _adversary_loss

        if self.optimizer == 'SGLD' and self.two_player:
            self.sgld_inner_update()
        self.soft_update()
        if sgld_outer_update and self.optimizer == 'SGLD' and self.two_player:
            self.sgld_outer_update()

        return value_loss, policy_loss, adversary_loss

    def initialize(self):
        hard_update(self.actor_bar, self.actor_outer)
        hard_update(self.adversary_bar, self.adversary_outer)
        hard_update(self.actor, self.actor_outer)
        hard_update(self.adversary, self.adversary_outer)

    def sgld_inner_update(self):
        # arguments are (target, source)
        sgld_update(self.actor_bar, self.actor, self.beta)
        sgld_update(self.adversary_bar, self.adversary, self.beta)

    def sgld_outer_update(self):
        # arguments are (target, source)
        sgld_update(self.actor_outer, self.actor_bar, self.beta)
        sgld_update(self.adversary_outer, self.adversary_bar, self.beta)

    def soft_update(self):
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.adversary_target, self.adversary, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
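# The DDPG classes in this file call several small helpers (normalize, hard_update,
# soft_update, sgld_update) that are defined elsewhere in the original repo. Below is a
# rough sketch, assuming whitening by RunningMeanStd statistics and Polyak-style target
# updates; the sgld_update averaging rule in particular is an assumption and may differ
# from the actual implementation.
import torch


def normalize(x, stats, device):
    # whiten x with running statistics; pass-through when normalization is disabled
    if stats is None:
        return x
    mean = torch.tensor(stats.mean, dtype=x.dtype, device=device)
    var = torch.tensor(stats.var, dtype=x.dtype, device=device)
    return (x - mean) / torch.sqrt(var + 1e-8)


def hard_update(target, source):
    # copy source parameters into target
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(t.data * (1.0 - tau) + s.data * tau)


def sgld_update(target, source, beta):
    # running average of SGLD iterates (assumed form)
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(t.data * (1.0 - beta) + s.data * beta)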
class GACAgent:
    """
    GAC agent. Action is always from -1 to 1 in each dimension.
    """

    def __init__(self, action_dim, state_dim, buffer_size=1000000, action_samples=10,
                 mode='linear', beta=1, tau=5e-3, q_normalization=0.01, gamma=0.99,
                 normalize_obs=False, normalize_rewards=False, batch_size=64, actor='AIQN',
                 *args, **kwargs):
        """
        Agent class to generate a stochastic policy.

        Args:
            action_dim (int): action dimension
            state_dim (int): state dimension
            buffer_size (int): how much memory is allocated to the ReplayMemoryClass
            action_samples (int): originally labelled K in the paper; how many actions
                should be sampled from the memory buffer
            mode (string): which distribution is used to weight positive-advantage actions
            beta (float): value used in the Boltzmann distribution
            tau (float): update rate parameter
            batch_size (int): batch size
            q_normalization (float): Q-value normalization rate
            gamma (float): discount used in critic training
            normalize_obs (boolean): whether to normalize observations
            normalize_rewards (boolean): whether to normalize return values (usually done
                for numerical stability)
            actor (string): string indicating the type of actor to use
        """
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.action_samples = action_samples
        self.mode = mode
        self.beta = beta
        self.tau = tau
        self.batch_size = batch_size
        self.step = 0

        # normalization
        self.normalize_observations = normalize_obs
        self.q_normalization = q_normalization
        self.normalize_rewards = normalize_rewards

        # Actor: type of actor being used
        if actor == 'IQN':
            self.actor = StochasticActor(self.state_dim, self.action_dim, 'source')
            self.target_actor = StochasticActor(self.state_dim, self.action_dim, 'target')
        elif actor == 'AIQN':
            self.actor = AutoRegressiveStochasticActor(self.state_dim, self.action_dim)
            self.target_actor = AutoRegressiveStochasticActor(self.state_dim, self.action_dim)

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd(shape=self.state_dim)
        else:
            self.obs_rms = None

        if self.normalize_rewards:
            self.ret_rms = RunningMeanStd(shape=1)
            self.ret = 0
        else:
            self.ret_rms = None

        # initialize trainable variables
        self.actor(tf.zeros([self.batch_size, self.state_dim]),
                   tf.zeros([self.batch_size, self.action_dim]))
        self.target_actor(tf.zeros([self.batch_size, self.state_dim]),
                          tf.zeros([self.batch_size, self.action_dim]))

        # Critic
        self.critics = Critic(self.state_dim, self.action_dim, 'source')
        self.target_critics = Critic(self.state_dim, self.action_dim, 'target')

        # initialize trainable variables for critics
        self.critics(tf.zeros([self.batch_size, self.state_dim]),
                     tf.zeros([self.batch_size, self.action_dim]))
        self.target_critics(tf.zeros([self.batch_size, self.state_dim]),
                            tf.zeros([self.batch_size, self.action_dim]))

        # Value
        self.value = Value(self.state_dim, 'source')
        self.target_value = Value(self.state_dim, 'target')

        # initialize value training variables (build both source and target before copying weights)
        self.value(tf.zeros([self.batch_size, self.state_dim]))
        self.target_value(tf.zeros([self.batch_size, self.state_dim]))

        # initialize the target networks
        update(self.target_actor, self.actor, 1.0)
        update(self.target_critics, self.critics, 1.0)
        update(self.target_value, self.value, 1.0)

        self.replay = ReplayBuffer(self.state_dim, self.action_dim, self.buffer_size)
        self.action_sampler = ActionSampler(self.actor.action_dim)

    def train_one_step(self):
        """
        Execute one update for each of the networks. Note that if no positive-advantage
        elements are returned, the algorithm doesn't update the actor parameters.

        Args:
            None
        Returns:
            None
        """
        # transitions is sampled from the replay buffer
        transitions = self.replay.sample_batch(self.batch_size)
        state_batch = normalize(transitions.s, self.obs_rms)
        action_batch = transitions.a
        reward_batch = normalize(transitions.r, self.ret_rms)
        next_state_batch = normalize(transitions.sp, self.obs_rms)
        terminal_mask = transitions.it

        # train critic and value
        self.critics.train(state_batch, action_batch, reward_batch, next_state_batch,
                           terminal_mask, self.target_value, self.gamma, self.q_normalization)
        self.value.train(state_batch, self.target_actor, self.target_critics,
                         self.action_samples)

        # note that transitions.s represents the sampled states from the memory buffer
        states, actions, advantages = self._sample_positive_advantage_actions(state_batch)
        if advantages.shape[0]:
            self.actor.train(states, actions, advantages, self.mode, self.beta)

        update(self.target_actor, self.actor, self.tau)
        update(self.target_critics, self.critics, self.tau)
        update(self.target_value, self.value, self.tau)

        with self.actor.train_summary_writer.as_default():
            tf.summary.scalar('actor loss', self.actor.train_loss.result(), step=self.step)
        with self.critics.train_summary_writer.as_default():
            tf.summary.scalar('critic loss', self.critics.train_loss.result(), step=self.step)
        with self.value.train_summary_writer.as_default():
            tf.summary.scalar('value loss', self.value.train_loss.result(), step=self.step)
        self.step += 1

    def _sample_positive_advantage_actions(self, states):
        """
        Sample from the target network and a uniform distribution, then keep only the
        actions with positive advantage. Returns one action per state; if more are needed,
        make `states` contain the same state multiple times.

        Args:
            states (tf.Variable): states of dimension (batch_size, state_dim)
        Returns:
            good_states (list): set of positive-advantage states (batch_size, state_dim)
            good_actions (list): set of positive-advantage actions
            advantages (list[float]): set of positive advantage values (Q - V)
        """
        # tile states to be of dimension (batch_size * K, state_dim)
        tiled_states = tf.tile(states, [self.action_samples, 1])

        # sample actions with noise for regularization
        target_actions = self.action_sampler.get_actions(self.target_actor, tiled_states)
        target_actions += tf.random.normal(target_actions.shape) * 0.01
        target_actions = tf.clip_by_value(target_actions, -1, 1)
        target_q = self.target_critics(tiled_states, target_actions)

        # Sample multiple actions both from the target policy and from a uniform distribution
        # over the action space. These will be used to determine the target distribution.
        random_actions = tf.random.uniform(target_actions.shape, minval=-1.0, maxval=1.0)
        random_q = self.target_critics(tiled_states, random_actions)

        # create target actions vector, consisting of purely random actions and noisy actions
        # for the sake of exploration
        target_actions = tf.concat([target_actions, random_actions], 0)

        # compute Q and V values with dimensions (2 * batch_size * K, 1)
        q = tf.concat([target_q, random_q], 0)

        # determine the estimated value of a given state
        v = self.target_value(tiled_states)
        v = tf.concat([v, v], 0)

        # expand tiled states to allow for indexing later on
        tiled_states = tf.concat([tiled_states, tiled_states], 0)

        # remove unused dimensions
        q_squeezed = tf.squeeze(q)
        v_squeezed = tf.squeeze(v)

        # select s, a with positive advantage
        squeezed_indicies = tf.where(q_squeezed > v_squeezed)

        # collect all advantageous states and actions
        good_states = tf.gather_nd(tiled_states, squeezed_indicies)
        good_actions = tf.gather_nd(target_actions, squeezed_indicies)

        # retrieve advantage values
        advantages = tf.gather_nd(q - v, squeezed_indicies)
        return good_states, good_actions, advantages

    def get_action(self, states):
        """
        Get a set of actions for a batch of states.

        Args:
            states (tf.Variable): dimensions (batch_size, state_dim)
        Returns:
            sampled actions for the given states with dimension (batch_size, action_dim)
        """
        return self.action_sampler.get_actions(self.actor, states)

    def select_perturbed_action(self, state, action_noise=None):
        """
        Select actions from the actor, optionally perturbed by action noise.

        Args:
            state (tf.Variable): tf variable containing the state vector
            action_noise (function): action noise function which will construct noise from
                some distribution
        Returns:
            action vector of dimension (batch_size, action_dim). Note that without action
            noise this function is the same as get_action.
        """
        state = normalize(tf.Variable(state, dtype=tf.float32), self.obs_rms)
        action = self.action_sampler.get_actions(self.actor, state)
        if action_noise is not None:
            action += tf.Variable(action_noise(), dtype=tf.float32)
        action = tf.clip_by_value(action, -1, 1)
        return action

    def store_transition(self, state, action, reward, next_state, is_done):
        """
        Store the transition in the replay buffer, normalizing should it be specified.

        Args:
            state (tf.Variable): (batch_size, state_size) state vector
            action (tf.Variable): (batch_size, action_size) action vector
            reward (float): reward value determined by the environment (batch_size, 1)
            next_state (tf.Variable): (batch_size, state_size) next state vector
            is_done (boolean): value to indicate that the state is terminal
        """
        self.replay.store(state, action, reward, next_state, is_done)
        if self.normalize_observations:
            self.obs_rms.update(state)
        if self.normalize_rewards:
            self.ret = self.ret * self.gamma + reward
            self.ret_rms.update(np.array([self.ret]))
            if is_done:
                self.ret = 0
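# A minimal usage sketch for GACAgent, assuming a Gym-style continuous-control environment
# with the classic 4-tuple step API. The env name, step budget, and the rescaling of the
# agent's [-1, 1] actions to the env's bounds are illustrative choices, not part of the
# class above.
import gym
import numpy as np

env = gym.make('Pendulum-v1')  # illustrative; any Box-action env works
agent = GACAgent(action_dim=env.action_space.shape[0],
                 state_dim=env.observation_space.shape[0],
                 normalize_obs=True)

state = env.reset()
for step in range(10000):  # illustrative step budget
    action = agent.get_action(np.asarray(state, dtype=np.float32)[None, :])
    # NOTE: rescaling from [-1, 1] to env.action_space bounds is omitted here
    next_state, reward, done, _ = env.step(np.asarray(action[0]))
    agent.store_transition(state, action[0], reward, next_state, done)
    if step > agent.batch_size:
        agent.train_one_step()
    state = env.reset() if done else next_state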
def main():
    if 'NAME' in os.environ.keys():
        NAME = os.environ['NAME']
    else:
        raise ValueError('set NAME via env variable')

    try:
        env_settings = json.load(open(default_config['CarIntersectConfigPath'], 'r'))
    except:
        env_settings = yaml.load(open(default_config['CarIntersectConfigPath'], 'r'))

    if 'home-test' not in NAME:
        wandb.init(
            project='CarRacing_RND',
            reinit=True,
            name=f'rnd_{NAME}',
            config={'env_config': env_settings, 'agent_config': default_config},
        )

    # print({section: dict(config[section]) for section in config.sections()})
    train_method = default_config['TrainMethod']

    env_id = default_config['EnvID']
    # env_type = default_config['EnvType']

    # if env_type == 'mario':
    #     env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)
    # elif env_type == 'atari':
    #     env = gym.make(env_id)
    # else:
    #     raise NotImplementedError

    seed = np.random.randint(0, 2 ** 16 - 1)

    print(f'use name : {NAME}')
    print(f"use env config : {default_config['CarIntersectConfigPath']}")
    print(f'use seed : {seed}')
    print(f"use device : {os.environ['DEVICE']}")

    os.chdir('..')
    env = makeCarIntersect(env_settings)
    eval_env = create_eval_env(makeCarIntersect(env_settings))

    # input_size = env.observation_space.shape  # 4
    input_size = env.observation_space.shape
    assert isinstance(env.action_space, gym.spaces.Box)
    action_size = env.action_space.shape[0]  # 2

    env.close()

    is_load_model = True
    is_render = False
    # model_path = 'models/{}.model'.format(NAME)
    # predictor_path = 'models/{}.pred'.format(NAME)
    # target_path = 'models/{}.target'.format(NAME)

    # writer = SummaryWriter()

    use_cuda = default_config.getboolean('UseGPU')
    use_gae = default_config.getboolean('UseGAE')
    use_noisy_net = default_config.getboolean('UseNoisyNet')

    lam = float(default_config['Lambda'])
    num_worker = int(default_config['NumEnv'])
    num_step = int(default_config['NumStep'])

    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    int_gamma = float(default_config['IntGamma'])
    clip_grad_norm = float(default_config['ClipGradNorm'])
    ext_coef = float(default_config['ExtCoef'])
    int_coef = float(default_config['IntCoef'])

    sticky_action = default_config.getboolean('StickyAction')
    action_prob = float(default_config['ActionProb'])
    life_done = default_config.getboolean('LifeDone')

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    pre_obs_norm_step = int(default_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(int_gamma)

    agent = RNDAgent(
        input_size,
        action_size,
        num_worker,
        num_step,
        gamma,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=batch_size,
        ppo_eps=ppo_eps,
        use_cuda=use_cuda,
        use_gae=use_gae,
        use_noisy_net=use_noisy_net,
        device=os.environ['DEVICE'],
    )

    # if is_load_model:
    #     print('load model...')
    #     if use_cuda:
    #         agent.model.load_state_dict(torch.load(model_path))
    #         agent.rnd.predictor.load_state_dict(torch.load(predictor_path))
    #         agent.rnd.target.load_state_dict(torch.load(target_path))
    #     else:
    #         agent.model.load_state_dict(torch.load(model_path, map_location='cpu'))
    #         agent.rnd.predictor.load_state_dict(torch.load(predictor_path, map_location='cpu'))
    #         agent.rnd.target.load_state_dict(torch.load(target_path, map_location='cpu'))
    #     print('load finished!')

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(env_id, is_render, idx, child_conn, sticky_action=sticky_action,
                                p=action_prob, life_done=life_done, settings=env_settings)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    os.chdir('rnd_continues')

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    logger = Logger(None, use_console=True, use_wandb=True, log_interval=1)

    print('Test evaluater:')
    evaluate_and_log(
        eval_env=eval_env,
        action_get_method=lambda eval_state: agent.get_action(
            np.tile(np.float32(eval_state), (1, 4, 1, 1)) / 255.
        )[0][0].cpu().numpy(),
        logger=logger,
        log_animation=False,
        exp_class='RND',
        exp_name=NAME,
        debug=True,
    )
    print('end evaluater test.')

    # normalize obs
    print('Start to initialize observation normalization parameter.....')
    # print('ALERT! pass section')
    # assert 'home-test' in NAME
    next_obs = []
    for step in range(num_step * pre_obs_norm_step):
        actions = np.random.uniform(-1, 1, size=(num_worker, action_size))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))

        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    print('End to initialize...')

    while True:
        total_state, total_reward, total_done, total_next_state, total_action, \
            total_int_reward, total_next_obs, total_ext_values, total_int_values, \
            total_policy_log_prob, total_policy_log_prob_np = \
            [], [], [], [], [], [], [], [], [], [], []

        # Step 1. n-step rollout
        for _ in range(num_step):
            global_step += num_worker

            # actions, value_ext, value_int, policy = agent.get_action(np.float32(states) / 255.)
            actions, value_ext, value_int, policy_log_prob = agent.get_action(np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action.cpu().numpy())

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs.append(s[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = int reward + ext reward
            intrinsic_reward = agent.compute_intrinsic_reward(
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions.cpu().numpy())
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            # total_policy.append(policy)
            # total_policy_np.append(policy.cpu().numpy())
            total_policy_log_prob.extend(policy_log_prob.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]
            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                # writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                # writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                # writer.add_scalar('data/step', sample_step, sample_episode)
                logger.log_it({
                    'reward_per_episode': sample_rall,
                    'intrinsic_reward': sample_i_rall,
                    'episode_steps': sample_step,
                    'global_step_cnt': global_step,
                    'updates_cnt': global_update,
                })
                logger.publish_logs(step=global_step)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        # total_action = np.stack(total_action).transpose().reshape([-1, action_size])
        total_action = np.array(total_action).reshape((-1, action_size))
        # total_log_prob_old = np.array(total_policy_log_prob).reshape((-1))
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        # total_logging_policy = np.vstack(total_policy_np)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([discounted_reward.update(reward_per_step)
                                         for reward_per_step in total_int_reward.T])
        mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        # writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
        # writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging max action probability
        # writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculation
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              gamma,
                                              num_step,
                                              num_worker)

        # intrinsic reward calculation (non-episodic)
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              int_gamma,
                                              num_step,
                                              num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        global_update += 1

        # Step 5. Training!
        agent.train_model(np.float32(total_state) / 255.,
                          ext_target,
                          int_target,
                          total_action,
                          total_adv,
                          ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                          total_policy_log_prob)

        # if global_step % (num_worker * num_step * 100) == 0:
        #     print('Now Global Step :{}'.format(global_step))
        #     torch.save(agent.model.state_dict(), model_path)
        #     torch.save(agent.rnd.predictor.state_dict(), predictor_path)
        #     torch.save(agent.rnd.target.state_dict(), target_path)

        if global_update % 100 == 0:
            evaluate_and_log(
                eval_env=eval_env,
                action_get_method=lambda eval_state: agent.get_action(
                    np.tile(np.float32(eval_state), (1, 4, 1, 1)) / 255.
                )[0][0].cpu().numpy(),
                logger=logger,
                log_animation=True,
                exp_class='RND',
                exp_name=NAME,
            )
            logger.publish_logs(step=global_step)
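# `RewardForwardFilter` above keeps a per-environment running discounted sum of intrinsic
# rewards, whose statistics are fed to reward_rms. A sketch of the standard RND utility,
# assuming update() is called once per rollout step with a vector of per-env rewards, as in
# the loop above.
class RewardForwardFilter:
    def __init__(self, gamma):
        self.rewems = None
        self.gamma = gamma

    def update(self, rews):
        # running discounted sum: rewems <- rewems * gamma + rews
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems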
class VecEnvNorm(BaseVecEnv):

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        self.venv = venv
        self._ob_space = venv.observation_space
        self._ac_space = venv.action_space
        self.ob_rms = RunningMeanStd(shape=self._ob_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step(self, vac):
        obs, rews, news, infos = self.venv.step(vac)
        self.ret = self.ret * self.gamma + rews

        # normalize observations
        obs = self._norm_ob(obs)

        # normalize rewards
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)

        return obs, rews, news, infos

    def _norm_ob(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        obs = self.venv.reset()
        return self._norm_ob(obs)

    def set_random_seed(self, seeds):
        for env, seed in zip(self.venv.envs, seeds):
            env.seed(int(seed))

    @property
    def action_space(self):
        return self._ac_space

    @property
    def observation_space(self):
        return self._ob_space

    def close(self):
        self.venv.close()

    def render(self):
        self.venv.render()

    @property
    def num_envs(self):
        return self.venv.num_envs
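# Typical wiring for VecEnvNorm, assuming a BaseVecEnv-compatible vectorized env; the
# `make_vec_env` helper below is hypothetical and only stands in for however the project
# builds its vec env. The agent then only ever sees clipped, whitened observations and
# scaled rewards.
venv = make_vec_env(num_envs=8)  # hypothetical factory returning a BaseVecEnv
venv = VecEnvNorm(venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99)

obs = venv.reset()
for _ in range(100):
    # illustrative random policy; a real agent would act on the normalized obs
    actions = [venv.action_space.sample() for _ in range(venv.num_envs)]
    obs, rews, dones, infos = venv.step(actions)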
class RolloutStorage(object):
    def __init__(self, num_steps, num_processes, obs_shape, action_space,
                 recurrent_hidden_state_size, norm_rew=False):
        self.obs = torch.zeros(num_steps + 1, num_processes, *obs_shape)
        self.recurrent_hidden_states = torch.zeros(
            num_steps + 1, num_processes, recurrent_hidden_state_size)
        self.rewards = torch.zeros(num_steps, num_processes, 1)
        self.value_preds = torch.zeros(num_steps + 1, num_processes, 1)
        self.returns = torch.zeros(num_steps + 1, num_processes, 1)
        self.action_log_probs = torch.zeros(num_steps, num_processes, 1)
        self.norm_rew = norm_rew
        if self.norm_rew:
            self.ret_running_mean_std = RunningMeanStd()

        if action_space.__class__.__name__ == 'Discrete':
            action_shape = 1
            self.n_actions = action_space.n
        else:
            action_shape = action_space.shape[0]
            self.n_actions = None
        self.actions = torch.zeros(num_steps, num_processes, action_shape)
        if action_space.__class__.__name__ == 'Discrete':
            self.actions = self.actions.long()
        self.masks = torch.ones(num_steps + 1, num_processes, 1)

        self.num_steps = num_steps
        self.step = 0

    def to(self, device):
        self.obs = self.obs.to(device)
        self.recurrent_hidden_states = self.recurrent_hidden_states.to(device)
        self.rewards = self.rewards.to(device)
        self.value_preds = self.value_preds.to(device)
        self.returns = self.returns.to(device)
        self.action_log_probs = self.action_log_probs.to(device)
        self.actions = self.actions.to(device)
        self.masks = self.masks.to(device)

    def insert(self, obs, recurrent_hidden_states, actions, action_log_probs,
               value_preds, rewards, masks):
        self.obs[self.step + 1].copy_(obs)
        self.recurrent_hidden_states[self.step + 1].copy_(recurrent_hidden_states)
        self.actions[self.step].copy_(actions)
        self.action_log_probs[self.step].copy_(action_log_probs)
        self.value_preds[self.step].copy_(value_preds)
        self.rewards[self.step].copy_(rewards)
        self.masks[self.step + 1].copy_(masks)

        self.step = (self.step + 1) % self.num_steps

    def after_update(self):
        self.obs[0].copy_(self.obs[-1])
        self.recurrent_hidden_states[0].copy_(self.recurrent_hidden_states[-1])
        self.masks[0].copy_(self.masks[-1])

    def compute_returns(self, next_value, use_gae, gamma, tau):
        if self.norm_rew:
            # NOTE: Not adding the estimated value after last time step here
            r_gamma_sum = torch.zeros(self.returns.size()).to(self.returns.device)
            for step in reversed(range(self.rewards.size(0))):
                r_gamma_sum[step] = r_gamma_sum[step + 1] * \
                    gamma * self.masks[step + 1] + self.rewards[step]
            r_gamma_sum_flat = r_gamma_sum.view(-1)
            ret_mean = torch.mean(r_gamma_sum_flat).detach()
            ret_std = torch.std(r_gamma_sum_flat).detach()
            ret_count = r_gamma_sum_flat.shape[0]
            self.ret_running_mean_std.update_from_moments(ret_mean, ret_std**2, ret_count)
            self.rewards /= torch.sqrt(self.ret_running_mean_std.var)

        if use_gae:
            self.value_preds[-1] = next_value
            gae = 0
            for step in reversed(range(self.rewards.size(0))):
                delta = self.rewards[step] + gamma * self.value_preds[
                    step + 1] * self.masks[step + 1] - self.value_preds[step]
                gae = delta + gamma * tau * self.masks[step + 1] * gae
                self.returns[step] = gae + self.value_preds[step]
        else:
            self.returns[-1] = next_value
            for step in reversed(range(self.rewards.size(0))):
                self.returns[step] = self.returns[step + 1] * \
                    gamma * self.masks[step + 1] + self.rewards[step]

    def feed_forward_generator(self, advantages, num_mini_batch):
        num_steps, num_processes = self.rewards.size()[0:2]
        batch_size = num_processes * num_steps
        assert batch_size >= num_mini_batch, (
            "PPO requires the number of processes ({}) "
            "* number of steps ({}) = {} "
            "to be greater than or equal to the number of PPO mini batches ({})."
            "".format(num_processes, num_steps, num_processes * num_steps, num_mini_batch))
        mini_batch_size = batch_size // num_mini_batch
        sampler = BatchSampler(SubsetRandomSampler(range(batch_size)),
                               mini_batch_size, drop_last=False)
        for indices in sampler:
            obs_batch = self.obs[:-1].view(-1, *self.obs.size()[2:])[indices]
            recurrent_hidden_states_batch = self.recurrent_hidden_states[:-1].view(
                -1, self.recurrent_hidden_states.size(-1))[indices]
            actions_batch = self.actions.view(-1, self.actions.size(-1))[indices]
            return_batch = self.returns[:-1].view(-1, 1)[indices]
            masks_batch = self.masks[:-1].view(-1, 1)[indices]
            old_action_log_probs_batch = self.action_log_probs.view(-1, 1)[indices]
            adv_targ = advantages.view(-1, 1)[indices]

            yield obs_batch, recurrent_hidden_states_batch, actions_batch, \
                return_batch, masks_batch, old_action_log_probs_batch, adv_targ, None, None

    def recurrent_generator(self, advantages, num_mini_batch):
        num_processes = self.rewards.size(1)
        assert num_processes >= num_mini_batch, (
            "PPO requires the number of processes ({}) "
            "to be greater than or equal to the number of "
            "PPO mini batches ({}).".format(num_processes, num_mini_batch))
        num_envs_per_batch = num_processes // num_mini_batch
        perm = torch.randperm(num_processes)
        for start_ind in range(0, num_processes, num_envs_per_batch):
            obs_batch = []
            recurrent_hidden_states_batch = []
            actions_batch = []
            return_batch = []
            masks_batch = []
            old_action_log_probs_batch = []
            adv_targ = []

            for offset in range(num_envs_per_batch):
                ind = perm[start_ind + offset]
                obs_batch.append(self.obs[:-1, ind])
                recurrent_hidden_states_batch.append(
                    self.recurrent_hidden_states[0:1, ind])
                actions_batch.append(self.actions[:, ind])
                return_batch.append(self.returns[:-1, ind])
                masks_batch.append(self.masks[:-1, ind])
                old_action_log_probs_batch.append(self.action_log_probs[:, ind])
                adv_targ.append(advantages[:, ind])

            T, N = self.num_steps, num_envs_per_batch
            # These are all tensors of size (T, N, -1)
            obs_batch = torch.stack(obs_batch, 1)
            actions_batch = torch.stack(actions_batch, 1)
            return_batch = torch.stack(return_batch, 1)
            masks_batch = torch.stack(masks_batch, 1)
            old_action_log_probs_batch = torch.stack(old_action_log_probs_batch, 1)
            adv_targ = torch.stack(adv_targ, 1)

            # States is just a (N, -1) tensor
            recurrent_hidden_states_batch = torch.stack(
                recurrent_hidden_states_batch, 1).view(N, -1)

            # Flatten the (T, N, ...) tensors to (T * N, ...)
            obs_batch = _flatten_helper(T, N, obs_batch)
            actions_batch = _flatten_helper(T, N, actions_batch)
            return_batch = _flatten_helper(T, N, return_batch)
            masks_batch = _flatten_helper(T, N, masks_batch)
            old_action_log_probs_batch = _flatten_helper(T, N, old_action_log_probs_batch)
            adv_targ = _flatten_helper(T, N, adv_targ)

            yield obs_batch, recurrent_hidden_states_batch, actions_batch, \
                return_batch, masks_batch, old_action_log_probs_batch, adv_targ, T, N
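# A minimal PPO-style rollout loop around RolloutStorage, assuming a vectorized env that
# returns torch tensors and an `actor_critic` whose act/get_value methods return
# (value, action, log_prob, rnn_hxs); those names and shapes are illustrative, not part of
# this file. Rewards passed to insert() are expected with shape (num_processes, 1).
rollouts = RolloutStorage(num_steps=128, num_processes=8,
                          obs_shape=envs.observation_space.shape,
                          action_space=envs.action_space,
                          recurrent_hidden_state_size=64, norm_rew=True)
rollouts.obs[0].copy_(envs.reset())

for step in range(128):
    with torch.no_grad():
        value, action, log_prob, rnn_hxs = actor_critic.act(
            rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step])
    obs, reward, done, infos = envs.step(action)
    masks = torch.tensor([[0.0] if d else [1.0] for d in done])
    rollouts.insert(obs, rnn_hxs, action, log_prob, value, reward, masks)

with torch.no_grad():
    next_value = actor_critic.get_value(
        rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1])
rollouts.compute_returns(next_value, use_gae=True, gamma=0.99, tau=0.95)

advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
for batch in rollouts.feed_forward_generator(advantages, num_mini_batch=4):
    pass  # a PPO update would consume each mini-batch here
rollouts.after_update()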
class DDPG:
    def __init__(self, gamma, tau, hidden_size, num_inputs, action_space, train_mode, alpha,
                 replay_size, normalize_obs=True, normalize_returns=False, critic_l2_reg=1e-2):
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
            torch.backends.cudnn.enabled = False
            self.Tensor = torch.cuda.FloatTensor
        else:
            self.device = torch.device('cpu')
            self.Tensor = torch.FloatTensor

        self.alpha = alpha
        self.train_mode = train_mode
        self.num_inputs = num_inputs
        self.action_space = action_space
        self.critic_l2_reg = critic_l2_reg

        self.actor = Actor(hidden_size, self.num_inputs, self.action_space).to(self.device)
        self.adversary = Actor(hidden_size, self.num_inputs, self.action_space).to(self.device)

        if self.train_mode:
            self.actor_target = Actor(hidden_size, self.num_inputs, self.action_space).to(self.device)
            self.actor_perturbed = Actor(hidden_size, self.num_inputs, self.action_space).to(self.device)
            self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

            self.critic = Critic(hidden_size, self.num_inputs, self.action_space).to(self.device)
            self.critic_target = Critic(hidden_size, self.num_inputs, self.action_space).to(self.device)
            self.critic_optim = Adam(self.critic.parameters(), lr=1e-3, weight_decay=critic_l2_reg)

            self.adversary_target = Actor(hidden_size, self.num_inputs, self.action_space).to(self.device)
            self.adversary_perturbed = Actor(hidden_size, self.num_inputs, self.action_space).to(self.device)
            self.adversary_optim = Adam(self.adversary.parameters(), lr=1e-4)

            hard_update(self.adversary_target, self.adversary)  # make sure targets start with the same weights
            hard_update(self.actor_target, self.actor)
            hard_update(self.critic_target, self.critic)

        self.gamma = gamma
        self.tau = tau
        self.normalize_observations = normalize_obs
        self.normalize_returns = normalize_returns

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd(shape=num_inputs)
        else:
            self.obs_rms = None

        if self.normalize_returns:
            self.ret_rms = RunningMeanStd(shape=1)
            self.ret = 0
            self.cliprew = 10.0
        else:
            self.ret_rms = None

        self.memory = ReplayMemory(replay_size)

    def eval(self):
        self.actor.eval()
        self.adversary.eval()
        if self.train_mode:
            self.critic.eval()

    def train(self):
        self.actor.train()
        self.adversary.train()
        if self.train_mode:
            self.critic.train()

    def select_action(self, state, action_noise=None, param_noise=None, mdp_type='mdp'):
        state = normalize(Variable(state).to(self.device), self.obs_rms, self.device)

        if mdp_type != 'mdp':
            if mdp_type == 'nr_mdp':
                if param_noise is not None:
                    mu = self.actor_perturbed(state)
                else:
                    mu = self.actor(state)
                mu = mu.data

                if action_noise is not None:
                    mu += self.Tensor(action_noise()).to(self.device)
                mu = mu.clamp(-1, 1) * (1 - self.alpha)

                if param_noise is not None:
                    adv_mu = self.adversary_perturbed(state)
                else:
                    adv_mu = self.adversary(state)
                adv_mu = adv_mu.data.clamp(-1, 1) * self.alpha

                mu += adv_mu
            else:  # mdp_type == 'pr_mdp'
                if np.random.rand() < (1 - self.alpha):
                    if param_noise is not None:
                        mu = self.actor_perturbed(state)
                    else:
                        mu = self.actor(state)
                    mu = mu.data

                    if action_noise is not None:
                        mu += self.Tensor(action_noise()).to(self.device)
                    mu = mu.clamp(-1, 1)
                else:
                    if param_noise is not None:
                        mu = self.adversary_perturbed(state)
                    else:
                        mu = self.adversary(state)
                    mu = mu.data.clamp(-1, 1)
        else:
            if param_noise is not None:
                mu = self.actor_perturbed(state)
            else:
                mu = self.actor(state)
            mu = mu.data

            if action_noise is not None:
                mu += self.Tensor(action_noise()).to(self.device)
            mu = mu.clamp(-1, 1)

        return mu

    def update_robust(self, state_batch, action_batch, reward_batch, mask_batch,
                      next_state_batch, adversary_update, mdp_type, robust_update_type):
        # TRAIN CRITIC
        if robust_update_type == 'full':
            if mdp_type == 'nr_mdp':
                next_action_batch = (1 - self.alpha) * self.actor_target(next_state_batch) \
                    + self.alpha * self.adversary_target(next_state_batch)
                next_state_action_values = self.critic_target(next_state_batch, next_action_batch)
            else:  # mdp_type == 'pr_mdp'
                next_action_actor_batch = self.actor_target(next_state_batch)
                next_action_adversary_batch = self.adversary_target(next_state_batch)
                next_state_action_values = \
                    self.critic_target(next_state_batch, next_action_actor_batch) * (1 - self.alpha) \
                    + self.critic_target(next_state_batch, next_action_adversary_batch) * self.alpha

            expected_state_action_batch = reward_batch + self.gamma * mask_batch * next_state_action_values

            self.critic_optim.zero_grad()
            state_action_batch = self.critic(state_batch, action_batch)
            value_loss = F.mse_loss(state_action_batch, expected_state_action_batch)
            value_loss.backward()
            self.critic_optim.step()
            value_loss = value_loss.item()
        else:
            value_loss = 0

        if adversary_update:
            # TRAIN ADVERSARY
            self.adversary_optim.zero_grad()
            if mdp_type == 'nr_mdp':
                with torch.no_grad():
                    real_action = self.actor_target(next_state_batch)
                action = (1 - self.alpha) * real_action + self.alpha * self.adversary(next_state_batch)
                adversary_loss = self.critic(state_batch, action)
            else:  # mdp_type == 'pr_mdp'
                action = self.adversary(next_state_batch)
                adversary_loss = self.critic(state_batch, action) * self.alpha

            adversary_loss = adversary_loss.mean()
            adversary_loss.backward()
            self.adversary_optim.step()
            adversary_loss = adversary_loss.item()
            policy_loss = 0
        else:
            if robust_update_type == 'full':
                # TRAIN ACTOR
                self.actor_optim.zero_grad()
                if mdp_type == 'nr_mdp':
                    with torch.no_grad():
                        adversary_action = self.adversary_target(next_state_batch)
                    action = (1 - self.alpha) * self.actor(next_state_batch) + self.alpha * adversary_action
                    policy_loss = -self.critic(state_batch, action)
                else:  # mdp_type == 'pr_mdp'
                    action = self.actor(next_state_batch)
                    policy_loss = -self.critic(state_batch, action) * (1 - self.alpha)

                policy_loss = policy_loss.mean()
                policy_loss.backward()
                self.actor_optim.step()
                policy_loss = policy_loss.item()
                adversary_loss = 0
            else:
                policy_loss = 0
                adversary_loss = 0

        return value_loss, policy_loss, adversary_loss

    def update_non_robust(self, state_batch, action_batch, reward_batch, mask_batch,
                          next_state_batch):
        # TRAIN CRITIC
        next_action_batch = self.actor_target(next_state_batch)
        next_state_action_values = self.critic_target(next_state_batch, next_action_batch)
        expected_state_action_batch = reward_batch + self.gamma * mask_batch * next_state_action_values

        self.critic_optim.zero_grad()
        state_action_batch = self.critic(state_batch, action_batch)
        value_loss = F.mse_loss(state_action_batch, expected_state_action_batch)
        value_loss.backward()
        self.critic_optim.step()

        # TRAIN ACTOR
        self.actor_optim.zero_grad()
        action = self.actor(next_state_batch)
        policy_loss = -self.critic(state_batch, action)
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()
        policy_loss = policy_loss.item()

        adversary_loss = 0
        return value_loss.item(), policy_loss, adversary_loss

    def store_transition(self, state, action, mask, next_state, reward):
        B = state.shape[0]
        for b in range(B):
            self.memory.push(state[b], action[b], mask[b], next_state[b], reward[b])
            if self.normalize_observations:
                self.obs_rms.update(state[b].cpu().numpy())
            if self.normalize_returns:
                self.ret = self.ret * self.gamma + reward[b]
                self.ret_rms.update(np.array([self.ret]))
                if mask[b] == 0:  # if terminal is True
                    self.ret = 0

    def update_parameters(self, batch_size, mdp_type='mdp', adversary_update=False,
                          exploration_method='mdp'):
        transitions = self.memory.sample(batch_size)
        batch = Transition(*zip(*transitions))

        if mdp_type != 'mdp':
            robust_update_type = 'full'
        elif exploration_method != 'mdp':
            robust_update_type = 'adversary'
        else:
            robust_update_type = None

        state_batch = normalize(Variable(torch.stack(batch.state)).to(self.device),
                                self.obs_rms, self.device)
        action_batch = Variable(torch.stack(batch.action)).to(self.device)
        reward_batch = normalize(Variable(torch.stack(batch.reward)).to(self.device).unsqueeze(1),
                                 self.ret_rms, self.device)
        mask_batch = Variable(torch.stack(batch.mask)).to(self.device).unsqueeze(1)
        next_state_batch = normalize(Variable(torch.stack(batch.next_state)).to(self.device),
                                     self.obs_rms, self.device)

        if self.normalize_returns:
            reward_batch = torch.clamp(reward_batch, -self.cliprew, self.cliprew)

        value_loss = 0
        policy_loss = 0
        adversary_loss = 0
        if robust_update_type is not None:
            _value_loss, _policy_loss, _adversary_loss = self.update_robust(
                state_batch, action_batch, reward_batch, mask_batch, next_state_batch,
                adversary_update, mdp_type, robust_update_type)
            value_loss += _value_loss
            policy_loss += _policy_loss
            adversary_loss += _adversary_loss
        if robust_update_type != 'full':
            _value_loss, _policy_loss, _adversary_loss = self.update_non_robust(
                state_batch, action_batch, reward_batch, mask_batch, next_state_batch)
            value_loss += _value_loss
            policy_loss += _policy_loss
            adversary_loss += _adversary_loss

        self.soft_update()

        return value_loss, policy_loss, adversary_loss

    def soft_update(self):
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.adversary_target, self.adversary, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def perturb_actor_parameters(self, param_noise):
        """Apply parameter noise to actor model, for exploration."""
        hard_update(self.actor_perturbed, self.actor)
        params = self.actor_perturbed.state_dict()
        for name in params:
            if 'ln' in name:
                continue  # skip layer-norm parameters
            param = params[name]
            param += torch.randn(param.shape).to(self.device) * param_noise.current_stddev

        # Apply parameter noise to adversary model, for exploration
        hard_update(self.adversary_perturbed, self.adversary)
        params = self.adversary_perturbed.state_dict()
        for name in params:
            if 'ln' in name:
                continue  # skip layer-norm parameters
            param = params[name]
            param += torch.randn(param.shape).to(self.device) * param_noise.current_stddev
class Policy: def __init__(self, gamma, tau, num_inputs, action_space, replay_size, normalize_obs=True, normalize_returns=False): if torch.cuda.is_available(): self.device = torch.device('cuda') torch.backends.cudnn.enabled = False self.Tensor = torch.cuda.FloatTensor else: self.device = torch.device('cpu') self.Tensor = torch.FloatTensor self.num_inputs = num_inputs self.action_space = action_space self.gamma = gamma self.tau = tau self.normalize_observations = normalize_obs self.normalize_returns = normalize_returns if self.normalize_observations: self.obs_rms = RunningMeanStd(shape=num_inputs) else: self.obs_rms = None if self.normalize_returns: self.ret_rms = RunningMeanStd(shape=1) self.ret = 0 self.cliprew = 10.0 else: self.ret_rms = None self.memory = ReplayMemory(replay_size) self.actor = None self.actor_perturbed = None def eval(self): raise NotImplementedError def train(self): raise NotImplementedError def select_action(self, state, action_noise=None, param_noise=None): state = normalize( Variable(state).to(self.device), self.obs_rms, self.device) if param_noise is not None: action = self.policy(self.actor_perturbed, state)[0] else: action = self.policy(self.actor, state)[0] action = action.data if action_noise is not None: action += self.Tensor(action_noise()).to(self.device) action = action.clamp(-1, 1) return action def policy(self, actor, state): raise NotImplementedError def store_transition(self, state, action, mask, next_state, reward): B = state.shape[0] for b in range(B): self.memory.push(state[b], action[b], mask[b], next_state[b], reward[b]) if self.normalize_observations: self.obs_rms.update(state[b].cpu().numpy()) if self.normalize_returns: self.ret = self.ret * self.gamma + reward[b] self.ret_rms.update(np.array([self.ret])) if mask[b] == 0: # if terminal is True self.ret = 0 def update_critic(self, state_batch, action_batch, reward_batch, mask_batch, next_state_batch): raise NotImplementedError def update_actor(self, state_batch): raise NotImplementedError def update_parameters(self, batch_size): transitions = self.memory.sample(batch_size) batch = Transition(*zip(*transitions)) state_batch = normalize( Variable(torch.stack(batch.state)).to(self.device), self.obs_rms, self.device) action_batch = Variable(torch.stack(batch.action)).to(self.device) reward_batch = normalize( Variable(torch.stack(batch.reward)).to(self.device).unsqueeze(1), self.ret_rms, self.device) mask_batch = Variable(torch.stack(batch.mask)).to( self.device).unsqueeze(1) next_state_batch = normalize( Variable(torch.stack(batch.next_state)).to(self.device), self.obs_rms, self.device) if self.normalize_returns: reward_batch = torch.clamp(reward_batch, -self.cliprew, self.cliprew) value_loss = self.update_critic(state_batch, action_batch, reward_batch, mask_batch, next_state_batch) policy_loss = self.update_actor(state_batch) self.soft_update() return value_loss, policy_loss def soft_update(self): raise NotImplementedError def perturb_actor_parameters(self, param_noise): """Apply parameter noise to actor model, for exploration""" hard_update(self.actor_perturbed, self.actor) params = self.actor_perturbed.state_dict() for name in params: if 'ln' in name: pass param = params[name] param += torch.randn(param.shape).to( self.device) * param_noise.current_stddev def _tile(self, a, dim, n_tile): init_dim = a.size(dim) repeat_idx = [1] * a.dim() repeat_idx[dim] = n_tile a = a.repeat(*(repeat_idx)) order_index = torch.LongTensor( np.concatenate([ init_dim * np.arange(n_tile) + i for i in range(init_dim) 
])).to(self.device) return torch.index_select(a, dim, order_index)
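# select_action and update_parameters call a normalize(x, rms, device) helper defined elsewhere
# in the repository. A minimal sketch consistent with how it is called (pass-through when rms is
# None, otherwise whiten with the running statistics) follows; the clip bound and epsilon are
# assumptions, not values taken from the original code.
import torch


def normalize(x, rms, device, clip=10.0, epsilon=1e-8):
    # x: a torch tensor already moved to `device`; rms: a RunningMeanStd instance or None.
    if rms is None:
        return x
    mean = torch.as_tensor(rms.mean, dtype=x.dtype, device=device)
    std = torch.sqrt(torch.as_tensor(rms.var, dtype=x.dtype, device=device) + epsilon)
    return torch.clamp((x - mean) / std, -clip, clip)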
def __init__(self, beta, epsilon, learning_rate, gamma, tau, hidden_size_dim0, hidden_size_dim1, num_inputs, action_space, train_mode, alpha, replay_size, optimizer, two_player, normalize_obs=True, normalize_returns=False, critic_l2_reg=1e-2): if torch.cuda.is_available(): self.device = torch.device('cuda') torch.backends.cudnn.enabled = False self.Tensor = torch.cuda.FloatTensor else: self.device = torch.device('cpu') self.Tensor = torch.FloatTensor self.alpha = alpha self.train_mode = train_mode self.num_inputs = num_inputs self.action_space = action_space self.critic_l2_reg = critic_l2_reg self.actor = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) self.adversary = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) if self.train_mode: self.actor_target = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) self.actor_bar = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) self.actor_outer = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) if(optimizer == 'SGLD'): self.actor_optim = SGLD(self.actor.parameters(), lr=1e-4, noise=epsilon, alpha=0.999) elif(optimizer == 'RMSprop'): self.actor_optim = RMSprop(self.actor.parameters(), lr=1e-4, alpha=0.999) else: self.actor_optim = ExtraAdam(self.actor.parameters(), lr=1e-4) self.critic = Critic(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) self.critic_target = Critic(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) self.critic_optim = Adam(self.critic.parameters(), lr=1e-3, weight_decay=critic_l2_reg) self.adversary_target = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) self.adversary_bar = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) self.adversary_outer = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) if(optimizer == 'SGLD'): self.adversary_optim = SGLD(self.adversary.parameters(), lr=1e-4, noise=epsilon, alpha=0.999) elif(optimizer == 'RMSprop'): self.adversary_optim = RMSprop(self.adversary.parameters(), lr=1e-4, alpha=0.999) else: self.adversary_optim = ExtraAdam(self.adversary.parameters(), lr=1e-4) hard_update(self.adversary_target, self.adversary) # Make sure target is with the same weight hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) self.gamma = gamma self.tau = tau self.beta = beta self.epsilon = epsilon self.learning_rate = learning_rate self.normalize_observations = normalize_obs self.normalize_returns = normalize_returns self.optimizer = optimizer self.two_player = two_player if self.normalize_observations: self.obs_rms = RunningMeanStd(shape=num_inputs) else: self.obs_rms = None if self.normalize_returns: self.ret_rms = RunningMeanStd(shape=1) self.ret = 0 self.cliprew = 10.0 else: self.ret_rms = None self.memory = ReplayMemory(replay_size)
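# The constructor above selects between SGLD, RMSprop and ExtraAdam optimizers. SGLD and
# ExtraAdam are not standard torch.optim classes; the sketch below shows only the core idea of
# an SGLD step (a gradient step plus Gaussian Langevin noise, with `noise` acting as a
# temperature scale). It deliberately omits the RMSprop-style `alpha` preconditioning that the
# repository's SGLD(..., alpha=0.999) signature suggests, so treat it as an illustration rather
# than the actual optimizer.
import torch
from torch.optim import Optimizer


class SimpleSGLD(Optimizer):
    def __init__(self, params, lr=1e-4, noise=1e-2):
        super().__init__(params, dict(lr=lr, noise=noise))

    @torch.no_grad()
    def step(self, closure=None):
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()
        for group in self.param_groups:
            lr, noise = group['lr'], group['noise']
            for p in group['params']:
                if p.grad is None:
                    continue
                p.add_(p.grad, alpha=-lr)  # plain gradient step
                # Langevin noise with std sqrt(2 * lr), scaled by the noise level
                p.add_(torch.randn_like(p), alpha=(2.0 * lr) ** 0.5 * noise)
        return loss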
class Runner: def __init__(self, env: Any, agent: Any, save_interval: int = 1000, train_episode: int = 10**9, num_eval_episode: int = 3, episode_len: int = 3000, pre_step: int = 10000, gamma: float = 0.995, int_gamma: float = 0.995, lam: float = 0.97, device=torch.device('cpu'), int_coef: float = 1, ext_coef: float = 0.3, eval_interval: int = 10**4, seed: int = 0): self.save_interval = save_interval self.eval_interval = eval_interval # prepare envs self.env = env self.env.seed(seed) self.env_test = deepcopy(env) self.env_test.seed(2**31 - seed) self.agent = agent # prepare steps self.global_step = 0 self.step_in_episode = 0 self.episode_so_far = 0 self.episode_len = episode_len # length of an episode self.num_eval_episode = num_eval_episode self.train_episode = train_episode self.pre_step = pre_step # number of steps used to measure variance of states self.reward_rms = RunningMeanStd() obs_sampled = self.env.reset() self.obs_rms = RunningMeanStd(shape=[1] + list(obs_sampled.shape)) self.device = device self.lam = lam self.gamma = gamma self.int_gamma = int_gamma # gamma for intrinsic reward # ratio of intrinsic and extrinsic rewards self.int_coef = int_coef self.ext_coef = ext_coef self.reward_in_episode = 0.0 self.returns = {'step': [], 'return': []} def run_episode(self): total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_policy = \ [], [], [], [], [], [], [], [], [], [] self.step_in_episode = 0 self.reward_in_episode = 0 obs = self.env.reset() done = False for _ in range(self.episode_len): action, policy, value_ext, value_int = self.agent.get_action(obs) obs_next, reward, done, info = self.env.step(2 * action) self.reward_in_episode += reward self.global_step += 1 self.step_in_episode += 1 int_reward = self.agent.calc_intrinsic_reward( ((obs_next - self.obs_rms.mean) / np.sqrt(self.obs_rms.var)).clip(-5, 5)) total_next_obs.append(obs_next) total_int_reward.append(int_reward) total_state.append(obs) total_reward.append(reward) total_done.append(done) total_action.append(action) total_ext_values.append(value_ext) total_int_values.append(value_int) total_policy.append(policy) obs = obs_next _, _, value_ext, value_int = self.agent.get_action(obs) total_ext_values.append(value_ext) total_int_values.append(value_int) total_state = np.stack(total_state) # (episode_len, state_shape) total_action = np.stack(total_action) # (episode_len, ) total_done = np.stack(total_done) # (episode_len, ) total_next_obs = np.stack(total_next_obs) # (episode_len, state_shape) total_int_reward = np.stack(total_int_reward) # normalize intrinsic reward mean, std, count = np.mean(total_int_reward), np.std(total_int_reward), len( total_int_reward) self.reward_rms.update_from_moments(mean, std**2, count) total_int_reward /= np.sqrt(self.reward_rms.var) ext_target, ext_adv = self.gae(reward=total_reward, done=total_done, value=total_ext_values, gamma=self.gamma, num_step=self.episode_len) int_target, int_adv = self.gae(reward=total_int_reward, done=[0] * self.episode_len, value=total_int_values, gamma=self.int_gamma, num_step=self.episode_len) total_adv = int_adv * self.int_coef + ext_adv * self.ext_coef self.obs_rms.update(total_next_obs) self.agent.train_model( states=np.float32(total_state), target_ext=ext_target, target_int=int_target, actions=total_action, advs=total_adv, next_states=((total_next_obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var)).clip(-5, 5), log_pi_old=total_policy, # TODO: fix this num_step=self.episode_len) def evaluate(self, steps): """ 
Run the test environment for several episodes and record the mean return. """ returns = [] for _ in range(self.num_eval_episode): state = self.env_test.reset() done = False episode_return = 0.0 step = 0 while (not done): step += 1 action = self.agent.exploit(state) state, reward, done, _ = self.env_test.step(2 * action) episode_return += reward returns.append(episode_return) mean_return = np.mean(returns) self.returns['step'].append(steps) self.returns['return'].append(mean_return) print(f'Num steps: {steps:<6} ' f'Num episode: {self.episode_so_far} ' f'Return: {mean_return:<5.1f} ' f'Time: {self.time}') def plot(self): """ Plot the mean evaluation return against training steps. """ fig = plt.figure(figsize=(8, 6)) plt.plot(self.returns['step'], self.returns['return']) plt.xlabel('Steps', fontsize=24) plt.ylabel('Return', fontsize=24) plt.tick_params(labelsize=18) plt.title(f'{self.env.unwrapped.spec.id}', fontsize=24) plt.tight_layout() plt.savefig('figure.png') def start(self): self.start_time = time() self.prepare_normalization_coeff() print('Start Training') for episode in range(self.train_episode): self.episode_so_far = episode self.run_episode() if episode % self.eval_interval == 0: self.evaluate(steps=self.global_step) if episode % (self.eval_interval * 10) == 0: self.plot() print('Finished') @property def time(self): return str(timedelta(seconds=int(time() - self.start_time))) def prepare_normalization_coeff(self): states = [] for _ in range(self.pre_step): action = self.env.action_space.sample() state, reward, done, info = self.env.step(action) states.append(state) states = np.array(states) self.obs_rms.update(states) def gae(self, reward: Sequence, done: Sequence, value: Sequence, gamma: float, num_step: int): """Returns (discounted_return, advantage)""" adv_tmp = 0 discounted_return = [None] * num_step for t in range(num_step - 1, -1, -1): delta = reward[t] + gamma * value[t + 1] * (1 - done[t]) - value[t] adv_tmp = delta + gamma * self.lam * (1 - done[t]) * adv_tmp discounted_return[t] = adv_tmp + value[t] discounted_return = np.array(discounted_return, dtype='float32') adv = discounted_return - np.array(value[:-1], dtype='float32') return discounted_return, adv
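# RunningMeanStd is used throughout these snippets but its definition is not included here.
# The sketch below is a minimal version consistent with the calls made above (update() on a
# batch, update_from_moments() on precomputed statistics, .mean/.var attributes), using the
# standard parallel-variance update; the repository's class may add extras such as the `path`
# argument used for persistence later in this file.
import numpy as np


class RunningMeanStd:
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        batch = np.asarray(x)
        self.update_from_moments(batch.mean(axis=0), batch.var(axis=0), batch.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        # Chan et al. parallel update of mean and variance.
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m_2 / tot_count, tot_count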
def main(): args = get_args() device = torch.device('cuda' if args.cuda else 'cpu') env = gym.make(args.env_name) input_size = env.observation_space.shape # 4 output_size = env.action_space.n # 2 if 'Breakout' in args.env_name: output_size -= 1 env.close() is_render = False if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) model_path = os.path.join(args.save_dir, args.env_name + '.model') predictor_path = os.path.join(args.save_dir, args.env_name + '.pred') target_path = os.path.join(args.save_dir, args.env_name + '.target') writer = SummaryWriter(log_dir=args.log_dir) reward_rms = RunningMeanStd() obs_rms = RunningMeanStd(shape=(1, 1, 84, 84)) discounted_reward = RewardForwardFilter(args.ext_gamma) model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net) rnd = RNDModel(input_size, output_size) model = model.to(device) rnd = rnd.to(device) optimizer = optim.Adam(list(model.parameters()) + list(rnd.predictor.parameters()), lr=args.lr) if args.load_model: if args.cuda: model.load_state_dict(torch.load(model_path)) else: model.load_state_dict(torch.load(model_path, map_location='cpu')) works = [] parent_conns = [] child_conns = [] for idx in range(args.num_worker): parent_conn, child_conn = Pipe() work = AtariEnvironment(args.env_name, is_render, idx, child_conn, sticky_action=args.sticky_action, p=args.sticky_action_prob, max_episode_steps=args.max_episode_steps) work.start() works.append(work) parent_conns.append(parent_conn) child_conns.append(child_conn) states = np.zeros([args.num_worker, 4, 84, 84]) sample_env_index = 0 # Sample Environment index to log sample_episode = 0 sample_rall = 0 sample_step = 0 sample_i_rall = 0 global_update = 0 global_step = 0 # normalize observation print('Initializes observation normalization...') next_obs = [] for step in range(args.num_step * args.pre_obs_norm_steps): actions = np.random.randint(0, output_size, size=(args.num_worker, )) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) for parent_conn in parent_conns: next_state, reward, done, realdone, log_reward = parent_conn.recv() next_obs.append(next_state[3, :, :].reshape([1, 84, 84])) if len(next_obs) % (args.num_step * args.num_worker) == 0: next_obs = np.stack(next_obs) obs_rms.update(next_obs) next_obs = [] print('Training...') while True: total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_action_probs = [], [], [], [], [], [], [], [], [], [] global_step += (args.num_worker * args.num_step) global_update += 1 # Step 1. n-step rollout for _ in range(args.num_step): actions, value_ext, value_int, action_probs = get_action( model, device, np.float32(states) / 255.) 
for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], [] for parent_conn in parent_conns: next_state, reward, done, real_done, log_reward = parent_conn.recv( ) next_states.append(next_state) rewards.append(reward) dones.append(done) real_dones.append(real_done) log_rewards.append(log_reward) next_obs.append(next_state[3, :, :].reshape([1, 84, 84])) next_states = np.stack(next_states) rewards = np.hstack(rewards) dones = np.hstack(dones) real_dones = np.hstack(real_dones) next_obs = np.stack(next_obs) # total reward = int reward + ext Reward intrinsic_reward = compute_intrinsic_reward( rnd, device, ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5)) intrinsic_reward = np.hstack(intrinsic_reward) sample_i_rall += intrinsic_reward[sample_env_index] total_next_obs.append(next_obs) total_int_reward.append(intrinsic_reward) total_state.append(states) total_reward.append(rewards) total_done.append(dones) total_action.append(actions) total_ext_values.append(value_ext) total_int_values.append(value_int) total_action_probs.append(action_probs) states = next_states[:, :, :, :] sample_rall += log_rewards[sample_env_index] sample_step += 1 if real_dones[sample_env_index]: sample_episode += 1 writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode) writer.add_scalar('data/reward_per_rollout', sample_rall, global_update) writer.add_scalar('data/step', sample_step, sample_episode) sample_rall = 0 sample_step = 0 sample_i_rall = 0 # calculate last next value _, value_ext, value_int, _ = get_action(model, device, np.float32(states) / 255.) total_ext_values.append(value_ext) total_int_values.append(value_int) # -------------------------------------------------- total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape( [-1, 4, 84, 84]) total_reward = np.stack(total_reward).transpose().clip(-1, 1) total_action = np.stack(total_action).transpose().reshape([-1]) total_done = np.stack(total_done).transpose() total_next_obs = np.stack(total_next_obs).transpose( [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84]) total_ext_values = np.stack(total_ext_values).transpose() total_int_values = np.stack(total_int_values).transpose() total_logging_action_probs = np.vstack(total_action_probs) # Step 2. calculate intrinsic reward # running mean intrinsic reward total_int_reward = np.stack(total_int_reward).transpose() total_reward_per_env = np.array([ discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T ]) mean, std, count = np.mean(total_reward_per_env), np.std( total_reward_per_env), len(total_reward_per_env) reward_rms.update_from_moments(mean, std**2, count) # normalize intrinsic reward total_int_reward /= np.sqrt(reward_rms.var) writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / args.num_worker, sample_episode) writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / args.num_worker, global_update) # ------------------------------------------------------------------------------------------- # logging Max action probability writer.add_scalar('data/max_prob', total_logging_action_probs.max(1).mean(), sample_episode) # Step 3. 
make target and advantage # extrinsic reward calculate ext_target, ext_adv = make_train_data(total_reward, total_done, total_ext_values, args.ext_gamma, args.gae_lambda, args.num_step, args.num_worker, args.use_gae) # intrinsic reward calculate # None Episodic int_target, int_adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward), total_int_values, args.int_gamma, args.gae_lambda, args.num_step, args.num_worker, args.use_gae) # add ext adv and int adv total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef # ----------------------------------------------- # Step 4. update obs normalize param obs_rms.update(total_next_obs) # ----------------------------------------------- # Step 5. Training! train_model(args, device, output_size, model, rnd, optimizer, np.float32(total_state) / 255., ext_target, int_target, total_action, total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5), total_action_probs) if global_step % (args.num_worker * args.num_step * args.save_interval) == 0: print('Now Global Step :{}'.format(global_step)) torch.save(model.state_dict(), model_path) torch.save(rnd.predictor.state_dict(), predictor_path) torch.save(rnd.target.state_dict(), target_path)
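# RewardForwardFilter (used above to build the per-environment discounted intrinsic returns
# whose variance feeds reward_rms) is not defined in this file. A minimal sketch consistent
# with how discounted_reward.update() is called once per rollout step is:
class RewardForwardFilter:
    def __init__(self, gamma):
        self.rewems = None  # running discounted sum, one entry per environment
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems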
def main(): args = get_args() device = torch.device('cuda' if args.cuda else 'cpu') seed = np.random.randint(0, 100) env = ObstacleTowerEnv('../ObstacleTower/obstacletower', worker_id=seed, retro=True, config={'total-floors': 12}, greyscale=True, timeout_wait=300) env._flattener = ActionFlattener([2, 3, 2, 1]) env._action_space = env._flattener.action_space input_size = env.observation_space.shape # 4 output_size = env.action_space.n # 2 env.close() is_render = False if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) model_path = os.path.join(args.save_dir, 'main.model') predictor_path = os.path.join(args.save_dir, 'main.pred') target_path = os.path.join(args.save_dir, 'main.target') writer = SummaryWriter()#log_dir=args.log_dir) discounted_reward = RewardForwardFilter(args.ext_gamma) model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net) rnd = RNDModel(input_size, output_size) model = model.to(device) rnd = rnd.to(device) optimizer = optim.Adam(list(model.parameters()) + list(rnd.predictor.parameters()), lr=args.lr) if args.load_model: "Loading model..." if args.cuda: model.load_state_dict(torch.load(model_path)) else: model.load_state_dict(torch.load(model_path, map_location='cpu')) works = [] parent_conns = [] child_conns = [] for idx in range(args.num_worker): parent_conn, child_conn = Pipe() work = AtariEnvironment( args.env_name, is_render, idx, child_conn, sticky_action=args.sticky_action, p=args.sticky_action_prob, max_episode_steps=args.max_episode_steps) work.start() works.append(work) parent_conns.append(parent_conn) child_conns.append(child_conn) states = np.zeros([args.num_worker, 4, 84, 84]) sample_env_index = 0 # Sample Environment index to log sample_episode = 0 sample_rall = 0 sample_step = 0 sample_i_rall = 0 global_update = 0 global_step = 0 print("Load RMS =", args.load_rms) if args.load_rms: print("Loading RMS values for observation and reward normalization") with open('reward_rms.pkl', 'rb') as f: reward_rms = dill.load(f) with open('obs_rms.pkl', 'rb') as f: obs_rms = dill.load(f) else: reward_rms = RunningMeanStd() obs_rms = RunningMeanStd(shape=(1, 1, 84, 84)) # normalize observation print('Initializing observation normalization...') next_obs = [] for step in range(args.num_step * args.pre_obs_norm_steps): actions = np.random.randint(0, output_size, size=(args.num_worker,)) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) for parent_conn in parent_conns: next_state, reward, done, realdone, log_reward = parent_conn.recv() next_obs.append(next_state[3, :, :].reshape([1, 84, 84])) if len(next_obs) % (args.num_step * args.num_worker) == 0: next_obs = np.stack(next_obs) obs_rms.update(next_obs) next_obs = [] with open('reward_rms.pkl', 'wb') as f: dill.dump(reward_rms, f) with open('obs_rms.pkl', 'wb') as f: dill.dump(obs_rms, f) print('Training...') while True: total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_action_probs = [], [], [], [], [], [], [], [], [], [] global_step += (args.num_worker * args.num_step) global_update += 1 # Step 1. n-step rollout for _ in range(args.num_step): actions, value_ext, value_int, action_probs = get_action(model, device, np.float32(states) / 255.) 
for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], [] for parent_conn in parent_conns: next_state, reward, done, real_done, log_reward = parent_conn.recv() next_states.append(next_state) rewards.append(reward) dones.append(done) real_dones.append(real_done) log_rewards.append(log_reward) next_obs.append(next_state[3, :, :].reshape([1, 84, 84])) next_states = np.stack(next_states) rewards = np.hstack(rewards) dones = np.hstack(dones) real_dones = np.hstack(real_dones) next_obs = np.stack(next_obs) # total reward = int reward + ext Reward intrinsic_reward = compute_intrinsic_reward(rnd, device, ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5)) intrinsic_reward = np.hstack(intrinsic_reward) sample_i_rall += intrinsic_reward[sample_env_index] total_next_obs.append(next_obs) total_int_reward.append(intrinsic_reward) total_state.append(states) total_reward.append(rewards) total_done.append(dones) total_action.append(actions) total_ext_values.append(value_ext) total_int_values.append(value_int) total_action_probs.append(action_probs) states = next_states[:, :, :, :] sample_rall += log_rewards[sample_env_index] sample_step += 1 if real_dones[sample_env_index]: sample_episode += 1 writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode) writer.add_scalar('data/reward_per_rollout', sample_rall, global_update) writer.add_scalar('data/step', sample_step, sample_episode) sample_rall = 0 sample_step = 0 sample_i_rall = 0 # calculate last next value _, value_ext, value_int, _ = get_action(model, device, np.float32(states) / 255.) total_ext_values.append(value_ext) total_int_values.append(value_int) # -------------------------------------------------- total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84]) total_reward = np.stack(total_reward).transpose().clip(-1, 1) total_action = np.stack(total_action).transpose().reshape([-1]) total_done = np.stack(total_done).transpose() total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84]) total_ext_values = np.stack(total_ext_values).transpose() total_int_values = np.stack(total_int_values).transpose() total_logging_action_probs = np.vstack(total_action_probs) # Step 2. calculate intrinsic reward # running mean intrinsic reward total_int_reward = np.stack(total_int_reward).transpose() total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T]) mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env) reward_rms.update_from_moments(mean, std ** 2, count) # normalize intrinsic reward total_int_reward /= np.sqrt(reward_rms.var) writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / args.num_worker, sample_episode) writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / args.num_worker, global_update) # ------------------------------------------------------------------------------------------- # logging Max action probability writer.add_scalar('data/max_prob', total_logging_action_probs.max(1).mean(), sample_episode) # Step 3. 
make target and advantage # extrinsic reward calculate ext_target, ext_adv = make_train_data(total_reward, total_done, total_ext_values, args.ext_gamma, args.gae_lambda, args.num_step, args.num_worker, args.use_gae) # intrinsic reward calculate # None Episodic int_target, int_adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward), total_int_values, args.int_gamma, args.gae_lambda, args.num_step, args.num_worker, args.use_gae) # add ext adv and int adv total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef # ----------------------------------------------- # Step 4. update obs normalize param obs_rms.update(total_next_obs) # ----------------------------------------------- # Step 5. Training! train_model(args, device, output_size, model, rnd, optimizer, np.float32(total_state) / 255., ext_target, int_target, total_action, total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5), total_action_probs) if global_step % (args.num_worker * args.num_step * args.save_interval) == 0: print('Now Global Step :{}'.format(global_step)) torch.save(model.state_dict(), model_path) torch.save(rnd.predictor.state_dict(), predictor_path) torch.save(rnd.target.state_dict(), target_path) """ checkpoint_list = np.array([int(re.search(r"\d+(\.\d+)?", x)[0]) for x in glob.glob(os.path.join('trained_models', args.env_name+'*.model'))]) if len(checkpoint_list) == 0: last_checkpoint = -1 else: last_checkpoint = checkpoint_list.max() next_checkpoint = last_checkpoint + 1 print("Latest Checkpoint is #{}, saving checkpoint is #{}.".format(last_checkpoint, next_checkpoint)) incre_model_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.model') incre_predictor_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.pred') incre_target_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.target') with open(incre_model_path, 'wb') as f: torch.save(model.state_dict(), f) with open(incre_predictor_path, 'wb') as f: torch.save(rnd.predictor.state_dict(), f) with open(incre_target_path, 'wb') as f: torch.save(rnd.target.state_dict(), f) """ if args.terminate and (global_step > args.terminate_steps): with open('reward_rms.pkl', 'wb') as f: dill.dump(reward_rms, f) with open('obs_rms.pkl', 'wb') as f: dill.dump(obs_rms, f) break
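# compute_intrinsic_reward is called above with the RND module, the device and the whitened
# next observations. The sketch below shows the usual RND formulation (prediction error of the
# trainable predictor against the frozen, randomly initialized target network); it assumes rnd
# exposes .predictor and .target sub-modules, as the torch.save calls above suggest, but the
# exact reduction used in the repository may differ.
import numpy as np
import torch


def compute_intrinsic_reward(rnd, device, next_obs):
    # next_obs: normalized and clipped observations, shape (num_worker, 1, 84, 84).
    next_obs = torch.as_tensor(next_obs, dtype=torch.float32, device=device)
    with torch.no_grad():
        target_feature = rnd.target(next_obs)
        predict_feature = rnd.predictor(next_obs)
        intrinsic_reward = (target_feature - predict_feature).pow(2).sum(dim=1) / 2
    return intrinsic_reward.cpu().numpy()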
def main(): args = parse_arguments() train_method = args.train_method env_id = args.env_id env_type = args.env_type if env_type == 'atari': env = gym.make(env_id) input_size = env.observation_space.shape output_size = env.action_space.n env.close() else: raise NotImplementedError is_load_model = False is_render = False os.makedirs('models', exist_ok=True) model_path = 'models/{}.model'.format(env_id) predictor_path = 'models/{}.pred'.format(env_id) target_path = 'models/{}.target'.format(env_id) results_dir = os.path.join('outputs', args.env_id) os.makedirs(results_dir, exist_ok=True) logger = Logger(results_dir) writer = SummaryWriter(os.path.join(results_dir, 'tensorboard', args.env_id)) use_cuda = args.use_gpu use_gae = args.use_gae use_noisy_net = args.use_noisynet lam = args.lam num_worker = args.num_worker num_step = args.num_step ppo_eps = args.ppo_eps epoch = args.epoch mini_batch = args.minibatch batch_size = int(num_step * num_worker / mini_batch) learning_rate = args.learning_rate entropy_coef = args.entropy gamma = args.gamma int_gamma = args.int_gamma clip_grad_norm = args.clip_grad_norm ext_coef = args.ext_coef int_coef = args.int_coef sticky_action = args.sticky_action action_prob = args.action_prob life_done = args.life_done pre_obs_norm_step = args.obs_norm_step reward_rms = RunningMeanStd() obs_rms = RunningMeanStd(shape=(1, 1, 84, 84)) discounted_reward = RewardForwardFilter(int_gamma) if args.train_method == 'RND': agent = RNDAgent else: raise NotImplementedError if args.env_type == 'atari': env_type = AtariEnvironment else: raise NotImplementedError agent = agent( input_size, output_size, num_worker, num_step, gamma, lam=lam, learning_rate=learning_rate, ent_coef=entropy_coef, clip_grad_norm=clip_grad_norm, epoch=epoch, batch_size=batch_size, ppo_eps=ppo_eps, use_cuda=use_cuda, use_gae=use_gae, use_noisy_net=use_noisy_net ) logger.info('Start to initialize workers') works = [] parent_conns = [] child_conns = [] for idx in range(num_worker): parent_conn, child_conn = Pipe() work = env_type(env_id, is_render, idx, child_conn, sticky_action=sticky_action, p=action_prob, life_done=life_done, max_step_per_episode=args.max_step_per_episode) work.start() works.append(work) parent_conns.append(parent_conn) child_conns.append(child_conn) states = np.zeros([num_worker, 4, 84, 84]) sample_episode = 0 sample_rall = 0 sample_step = 0 sample_env_idx = 0 sample_i_rall = 0 global_update = 0 global_step = 0 # normalize obs logger.info('Start to initailize observation normalization parameter.....') next_obs = [] for step in range(num_step * pre_obs_norm_step): actions = np.random.randint(0, output_size, size=(num_worker,)) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) for parent_conn in parent_conns: s, r, d, rd, lr = parent_conn.recv() next_obs.append(s[3, :, :].reshape([1, 84, 84])) if len(next_obs) % (num_step * num_worker) == 0: next_obs = np.stack(next_obs) obs_rms.update(next_obs) next_obs = [] logger.info('End to initalize...') pbar = tqdm.tqdm(total=args.total_frames) while True: logger.info('Iteration: {}'.format(global_update)) total_state, total_reward, total_done, total_next_state, \ total_action, total_int_reward, total_next_obs, total_ext_values, \ total_int_values, total_policy, total_policy_np = \ [], [], [], [], [], [], [], [], [], [], [] global_step += (num_worker * num_step) global_update += 1 # Step 1. n-step rollout for _ in range(num_step): actions, value_ext, value_int, policy = agent.get_action(np.float32(states) / 255.) 
for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) next_states, rewards, dones, real_dones, log_rewards, next_obs = \ [], [], [], [], [], [] for parent_conn in parent_conns: s, r, d, rd, lr = parent_conn.recv() next_states.append(s) rewards.append(r) dones.append(d) real_dones.append(rd) log_rewards.append(lr) next_obs.append(s[3, :, :].reshape([1, 84, 84])) next_states = np.stack(next_states) rewards = np.hstack(rewards) dones = np.hstack(dones) real_dones = np.hstack(real_dones) next_obs = np.stack(next_obs) # total reward = int reward + ext Reward intrinsic_reward = agent.compute_intrinsic_reward( ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5)) intrinsic_reward = np.hstack(intrinsic_reward) sample_i_rall += intrinsic_reward[sample_env_idx] total_next_obs.append(next_obs) total_int_reward.append(intrinsic_reward) total_state.append(states) total_reward.append(rewards) total_done.append(dones) total_action.append(actions) total_ext_values.append(value_ext) total_int_values.append(value_int) total_policy.append(policy) total_policy_np.append(policy.cpu().numpy()) states = next_states[:, :, :, :] sample_rall += log_rewards[sample_env_idx] sample_step += 1 if real_dones[sample_env_idx]: sample_episode += 1 writer.add_scalar('data/returns_vs_frames', sample_rall, global_step) writer.add_scalar('data/lengths_vs_frames', sample_step, global_step) writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode) writer.add_scalar('data/reward_per_rollout', sample_rall, global_update) writer.add_scalar('data/step', sample_step, sample_episode) sample_rall = 0 sample_step = 0 sample_i_rall = 0 # calculate last next value _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.) total_ext_values.append(value_ext) total_int_values.append(value_int) total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84]) total_reward = np.stack(total_reward).transpose().clip(-1, 1) total_action = np.stack(total_action).transpose().reshape([-1]) total_done = np.stack(total_done).transpose() total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84]) total_ext_values = np.stack(total_ext_values).transpose() total_int_values = np.stack(total_int_values).transpose() total_logging_policy = np.vstack(total_policy_np) # Step 2. calculate intrinsic reward # running mean intrinsic reward total_int_reward = np.stack(total_int_reward).transpose() total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T]) mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env) reward_rms.update_from_moments(mean, std ** 2, count) # normalize intrinsic reward total_int_reward /= np.sqrt(reward_rms.var) writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode) writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update) # logging Max action probability writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode) # Step 3. 
make target and advantage # extrinsic reward calculate ext_target, ext_adv = make_train_data(total_reward, total_done, total_ext_values, gamma, num_step, num_worker) # intrinsic reward calculate # None Episodic int_target, int_adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward), total_int_values, int_gamma, num_step, num_worker) # add ext adv and int adv total_adv = int_adv * int_coef + ext_adv * ext_coef # Step 4. update obs normalize param obs_rms.update(total_next_obs) # Step 5. Training! agent.train_model(np.float32(total_state) / 255., ext_target, int_target, total_action, total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5), total_policy) if args.save_models and global_update % 1000 == 0: torch.save(agent.model.state_dict(), 'models/{}-{}.model'.format(env_id, global_update)) logger.info('Now Global Step :{}'.format(global_step)) torch.save(agent.model.state_dict(), model_path) torch.save(agent.rnd.predictor.state_dict(), predictor_path) torch.save(agent.rnd.target.state_dict(), target_path) pbar.update(num_worker * num_step) if global_step >= args.total_frames: break pbar.close()
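# make_train_data is called above to turn the rollout arrays into value targets and advantages,
# but is defined elsewhere. The sketch below illustrates the GAE computation it presumably
# performs; the signature is simplified (an explicit `lam` instead of the gae_lambda/use_gae
# arguments seen in the earlier main()), so treat it as an illustration of the technique rather
# than the repository function.
import numpy as np


def make_train_data_sketch(reward, done, value, gamma, num_step, num_worker, lam=0.95):
    # reward, done: arrays of shape (num_worker, num_step); value: (num_worker, num_step + 1).
    discounted_return = np.empty([num_worker, num_step])
    gae = np.zeros(num_worker)
    for t in range(num_step - 1, -1, -1):
        delta = reward[:, t] + gamma * value[:, t + 1] * (1 - done[:, t]) - value[:, t]
        gae = delta + gamma * lam * (1 - done[:, t]) * gae
        discounted_return[:, t] = gae + value[:, t]
    target = discounted_return.reshape([-1])  # value targets, flattened over (worker, step)
    adv = target - value[:, :-1].reshape([-1])  # GAE advantages
    return target, adv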
class Agent(object): def __init__(self, env, policy, rnd, replay_buffer, logger, args): self.env = env # Models self.policy = policy self.rnd = rnd # Utils self.replay_buffer = replay_buffer self.logger = logger self.obs_running_mean = RunningMeanStd((84, 84, 1)) self.rew_running_mean = RunningMeanStd(()) self.last_enc_loss = None self.train_enc_next_itr = False # Args self.use_encoder = args['use_encoder'] self.encoder_train_limit = args['encoder_train_limit'] self.num_random_samples = args['num_random_samples'] self.log_rate = args['log_rate'] def set_session(self, sess): self.sess = sess self.policy.set_session(sess) self.rnd.set_sess(sess) def batch(self, eo, a, er, ir, en, d, batch_size, shuffle=True): if shuffle: indxs = np.arange(len(eo)) np.random.shuffle(indxs) eo, a, er, ir, en, d = np.array(eo)[indxs], \ np.array(a)[indxs], np.array(er)[indxs], np.array(ir)[indxs], \ np.array(en)[indxs], np.array(d)[indxs] # batch up data batched_dsets = [] for dset in [eo, a, er, ir, en, d]: bdset = [] for i in range(0, len(dset), batch_size): bdset.append(np.array(dset[i:i + batch_size])) batched_dsets.append(np.array(bdset)) return tuple(batched_dsets) # quick copy paste of sample_env def record(self, num_samples): done, i = False, 0 n_lives, ignore = 6, 0 obs_n, act_n, ext_rew_n, int_rew_n, n_obs_n, dones_n = [], [], [], [], [], [] obs = self.env.reset() while not done and i < num_samples: act = self.policy.sample([obs]) n_obs, rew, done, info = self.env.step(act) rnd_obs = ((n_obs - self.obs_running_mean.mean) / np.sqrt(self.obs_running_mean.var)) rnd_obs = np.clip(rnd_obs, -5, 5) int_rew = self.rnd.get_rewards([rnd_obs])[0] if info['ale.lives'] != n_lives: ignore = 18 n_lives -= 1 if not ignore: i += 1 obs_n.append(obs) ext_rew_n.append(rew) n_obs_n.append(n_obs) act_n.append(act) dones_n.append(done) int_rew_n.append(int_rew) if done: obs = self.env.reset() done = True n_lives, ignore = 6, 0 else: ignore -= 1 self.logger.log('env', ['int_rewards', 'ext_rewards'], [int_rew_n, ext_rew_n]) return int_rew_n, ext_rew_n, obs_n def sample_env(self, batch_size, num_samples, shuffle, algorithm='algorithm'): done, i = False, 0 n_lives, ignore = 6, 0 obs_n, act_n, ext_rew_n, int_rew_n, n_obs_n, dones_n = [], [], [], [], [], [] # policy rollout obs = self.env.reset() while not done and i < num_samples: if algorithm == 'algorithm' and ignore < 0: act = self.policy.sample([obs]) else: # algorithm == 'random' act = self.env.action_space.sample() n_obs, rew, done, info = self.env.step(act) # format obs rnd_obs = ((n_obs - self.obs_running_mean.mean) / np.sqrt(self.obs_running_mean.var)) rnd_obs = np.clip(rnd_obs, -5, 5) int_rew = self.rnd.get_rewards([rnd_obs])[0] # dont record when agent dies if info['ale.lives'] != n_lives: ignore = 18 n_lives -= 1 if not ignore: i += 1 self.rew_running_mean.update(np.array([int_rew])) obs_n.append(obs) ext_rew_n.append(rew) n_obs_n.append(n_obs) act_n.append(act) dones_n.append(done) int_rew_n.append(int_rew) if done: obs = self.env.reset() done = True n_lives, ignore = 6, 0 else: ignore -= 1 obs = n_obs # log before normalization self.logger.log('env', ['int_rewards', 'ext_rewards'], [int_rew_n, ext_rew_n]) # normalize int_rew_n = (int_rew_n - self.rew_running_mean.mean) / np.sqrt( self.rew_running_mean.var) ext_rew_n = np.clip(ext_rew_n, -1, 1) self.obs_running_mean.update(np.array(obs_n)) return obs_n, act_n, ext_rew_n, int_rew_n, n_obs_n, dones_n def get_data(self, batch_size, num_samples, itr): if itr < self.num_random_samples: return self.sample_env(batch_size, 
num_samples, shuffle=True, algorithm='random') return self.sample_env(batch_size, num_samples, shuffle=True) def init_obsmean(self): obs, done = self.env.reset(), False while not done: act = self.env.action_space.sample() obs, _, done, _ = self.env.step(act) self.obs_running_mean.update(obs) def init_encoder(self, batch_size, num_samples, loss_threshold): threshold_met, i = False, 0 losses = [] while not threshold_met and i < self.encoder_train_limit: raw_enc_obs, raw_act_n, raw_ext_rew_n, raw_int_rew, raw_enc_n_obs, raw_dones_n = self.sample_env( batch_size, num_samples, shuffle=True, algorithm='random') for _ in range(4): enc_obs, act_n, _, _, enc_n_obs, _ = self.batch(raw_enc_obs, raw_act_n, raw_ext_rew_n, raw_int_rew, raw_enc_n_obs, raw_dones_n, batch_size, shuffle=True) for b_eobs, b_acts, b_enobs in zip(enc_obs, act_n, enc_n_obs): enc_loss = self.policy.train_acthead( b_eobs, b_enobs, b_acts) losses.append(np.mean(enc_loss)) self.logger.log('encoder', ['loss'], [np.mean(enc_loss)]) i += 1 if np.mean(losses) < loss_threshold: threshold_met = True losses = [] if threshold_met: print('Encoder init threshold was met...') else: print('Encoder init threshold was NOT met...') def train(self, batch_size, num_samples, encoder_loss_thresh, itr, writer): raw_enc_obs, raw_act_n, raw_ext_rew_n, raw_int_rew, raw_enc_n_obs, raw_dones_n = self.get_data( batch_size, num_samples, itr) for _ in range(4): # reshuffle and batch enc_obs, act_n, ext_rew_n, int_rew, enc_n_obs, dones_n = self.batch( raw_enc_obs, raw_act_n, raw_ext_rew_n, raw_int_rew, raw_enc_n_obs, raw_dones_n, batch_size, shuffle=True) for b_eobs, b_acts, b_erew, b_irew, b_enobs, b_dones in zip( enc_obs, act_n, ext_rew_n, int_rew, enc_n_obs, dones_n): # norm and clip for rnd rnd_obs = (b_eobs - self.obs_running_mean.mean ) / self.obs_running_mean.var rnd_obs = np.clip(rnd_obs, -5, 5) rnd_loss = self.rnd.train(rnd_obs) total_r = b_erew + b_irew # norm for policy ac_obs, ac_n_obs = b_eobs / 255., b_enobs / 255. critic_loss = self.policy.train_critic(ac_obs, ac_n_obs, total_r, b_dones) adv = self.policy.estimate_adv(ac_obs, total_r, ac_n_obs, b_dones) actor_loss, summ = self.policy.train_actor(ac_obs, b_acts, adv) writer.add_summary(summ, itr) # log data if self.use_encoder and self.train_enc_next_itr: enc_loss = self.policy.train_acthead( ac_obs, ac_n_obs, b_acts) self.logger.log('encoder', ['loss'], [enc_loss]) if itr % self.log_rate == 0: self.logger.log('density', ['loss'], [rnd_loss]) self.logger.log('policy', ['actor_loss', 'critic_loss'], [actor_loss, critic_loss]) self.train_enc_next_itr = False # if encoder becomes inaccurate then fine tune next training itr if self.use_encoder: enc_loss = self.policy.actnn_loss(b_eobs, b_enobs, b_acts) if np.mean(enc_loss) > encoder_loss_thresh: self.train_enc_next_itr = True print('Updating Encoder....')
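# The Agent above is written against a TF1-style session (set_session, writer.add_summary).
# The driver below is a minimal sketch of how it might be wired together, assuming hypothetical
# constructor arguments and loop lengths; the repository's real entry point may differ.
import tensorflow as tf


def run_training(env, policy, rnd, replay_buffer, logger, args, writer,
                 batch_size=128, num_samples=2048, encoder_loss_thresh=0.1,
                 num_iterations=1000):
    agent = Agent(env, policy, rnd, replay_buffer, logger, args)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        agent.set_session(sess)
        agent.init_obsmean()  # seed the observation statistics with one random episode
        if args['use_encoder']:
            agent.init_encoder(batch_size, num_samples, loss_threshold=encoder_loss_thresh)
        for itr in range(num_iterations):
            agent.train(batch_size, num_samples, encoder_loss_thresh, itr, writer)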
model = MLPBase(args.num_obs, args.num_actions, args.hidden_dim) elif args.model == "d2rl": model = D2RLNet(args.num_obs, args.num_actions, args.hidden_dim, args.num_layers) else: raise ValueError('Model Not Supported') optim = torch.optim.Adam(model.parameters(), lr=args.lr) if args.load_model_dir != None: model.load_state_dict( torch.load(f'{args.load_model_dir}/model.h5', map_location=torch.device(args.device))) model.to(device) reward_normalizer = RunningMeanStd(shape=()) if not args.one_hot: obs_normalizer = RunningMeanStd(shape=(args.num_obs, ), path=args.load_model_dir) else: obs_normalizer = None # Main loop i = 0 for i in range(args.num_iterations): if i != 0 and i % 10 == 0: game_player.reset( args, shared_obs, shared_legals) #Attempt at hacky workaround to C memory leak # Run num_steps of the game in each worker and accumulate results in # the data arrays