# Shared imports assumed by the excerpts below. batch_states refers to
# chainerrl's default observation-batching helper
# (chainerrl.misc.batch_states.batch_states).
import copy
import itertools

import chainer
import chainer.functions as F
from chainer import cuda
import numpy as np

from chainerrl.misc.batch_states import batch_states


def _add_log_prob_and_value_to_episodes(
        episodes,
        model,
        phi,
        batch_states,
        obs_normalizer,
):
    dataset = list(itertools.chain.from_iterable(episodes))
    xp = model.xp

    # Compute v_pred and next_v_pred
    states = batch_states([b['state'] for b in dataset], xp, phi)
    next_states = batch_states([b['next_state'] for b in dataset], xp, phi)

    if obs_normalizer:
        states = obs_normalizer(states, update=False)
        next_states = obs_normalizer(next_states, update=False)

    with chainer.using_config('train', False), chainer.no_backprop_mode():
        distribs, vs_pred = model(states)
        _, next_vs_pred = model(next_states)

        actions = xp.array([b['action'] for b in dataset])
        log_probs = chainer.cuda.to_cpu(distribs.log_prob(actions).array)
        vs_pred = chainer.cuda.to_cpu(vs_pred.array.ravel())
        next_vs_pred = chainer.cuda.to_cpu(next_vs_pred.array.ravel())

    for transition, log_prob, v_pred, next_v_pred in zip(
            dataset, log_probs, vs_pred, next_vs_pred):
        transition['log_prob'] = log_prob
        transition['v_pred'] = v_pred
        transition['next_v_pred'] = next_v_pred
def _make_dataset(self):
    dataset = list(itertools.chain.from_iterable(self.memory))

    xp = self.vf.xp

    # Compute v_pred and next_v_pred
    states = batch_states([b['state'] for b in dataset], xp, self.phi)
    next_states = batch_states(
        [b['next_state'] for b in dataset], xp, self.phi)
    if self.obs_normalizer:
        states = self.obs_normalizer(states, update=False)
        next_states = self.obs_normalizer(next_states, update=False)
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        vs_pred = chainer.cuda.to_cpu(self.vf(states).data.ravel())
        next_vs_pred = chainer.cuda.to_cpu(
            self.vf(next_states).data.ravel())
    for transition, v_pred, next_v_pred in zip(
            dataset, vs_pred, next_vs_pred):
        transition['v_pred'] = v_pred
        transition['next_v_pred'] = next_v_pred

    # Update stats
    self.value_record.extend(vs_pred)

    # Compute adv and v_teacher
    for episode in self.memory:
        adv = 0.0
        for transition in reversed(episode):
            td_err = (
                transition['reward']
                + (self.gamma * transition['nonterminal']
                   * transition['next_v_pred'])
                - transition['v_pred']
            )
            adv = td_err + self.gamma * self.lambd * adv
            transition['adv'] = adv
            transition['v_teacher'] = adv + transition['v_pred']

    return dataset
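# Illustrative sketch (not part of the original code): the same GAE(lambda)
# recursion as _make_dataset above, written for a single episode of plain
# NumPy arrays. All names here (gae_sketch, rewards, values, next_values,
# nonterminals) are hypothetical and exist only to show the formulas
#   delta_t = r_t + gamma * nonterminal_t * V(s_{t+1}) - V(s_t)
#   adv_t   = delta_t + gamma * lambda * adv_{t+1}
# with v_teacher_t = adv_t + V(s_t) as the value-function target.
def gae_sketch(rewards, values, next_values, nonterminals,
               gamma=0.99, lambd=0.95):
    advs = np.zeros(len(rewards), dtype=np.float32)
    adv = 0.0
    for i in reversed(range(len(rewards))):
        td_err = (rewards[i]
                  + gamma * nonterminals[i] * next_values[i]
                  - values[i])
        adv = td_err + gamma * lambd * adv
        advs[i] = adv
    return advs, advs + np.asarray(values, dtype=np.float32)

# Example (3-step episode that terminates on the last step):
# advs, v_teacher = gae_sketch(
#     rewards=[1.0, 0.0, 1.0],
#     values=[0.5, 0.4, 0.3],
#     next_values=[0.4, 0.3, 0.0],
#     nonterminals=[1.0, 1.0, 0.0])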
def batch_recurrent_experiences(
        experiences, model, xp, phi, gamma, batch_states=batch_states):
    """Batch experiences for recurrent model updates.

    Args:
        experiences: list of episodes. Each episode is a list containing
            between 1 and n dicts, each containing:
              - state (object): State
              - action (object): Action
              - reward (float): Reward
              - is_state_terminal (bool): True iff next state is terminal
              - next_state (object): Next state
        model (chainer.Link): Model that implements StatelessRecurrent.
        xp: Numpy compatible matrix library: e.g. Numpy or CuPy.
        phi: Preprocessing function
        gamma: discount factor
        batch_states: function that converts a list to a batch

    Returns:
        dict of batched transitions
    """
    flat_transitions = list(itertools.chain.from_iterable(experiences))
    batch_exp = {
        'state': [
            batch_states([transition['state'] for transition in ep], xp, phi)
            for ep in experiences],
        'action': xp.array(
            [transition['action'] for transition in flat_transitions]),
        'reward': xp.array(
            [transition['reward'] for transition in flat_transitions],
            dtype=np.float32),
        'next_state': [
            batch_states(
                [transition['next_state'] for transition in ep], xp, phi)
            for ep in experiences],
        'is_state_terminal': xp.array(
            [transition['is_state_terminal']
             for transition in flat_transitions],
            dtype=np.float32),
        'discount': xp.full(len(flat_transitions), gamma, dtype=np.float32),
        'recurrent_state': model.concatenate_recurrent_states(
            [ep[0]['recurrent_state'] for ep in experiences]),
        'next_recurrent_state': model.concatenate_recurrent_states(
            [ep[0]['next_recurrent_state'] for ep in experiences]),
    }
    # Batch next actions only when all the transitions have them
    if all(transition['next_action'] is not None
           for transition in flat_transitions):
        batch_exp['next_action'] = xp.asarray(
            [transition['next_action'] for transition in flat_transitions])
    return batch_exp
def _add_log_prob_and_value_to_episodes_recurrent(
        episodes,
        model,
        phi,
        batch_states,
        obs_normalizer,
):
    xp = model.xp

    # Prepare data for a recurrent model
    seqs_states = []
    seqs_next_states = []
    for ep in episodes:
        states = batch_states(
            [transition['state'] for transition in ep], xp, phi)
        next_states = batch_states(
            [transition['next_state'] for transition in ep], xp, phi)
        if obs_normalizer:
            states = obs_normalizer(states, update=False)
            next_states = obs_normalizer(next_states, update=False)
        seqs_states.append(states)
        seqs_next_states.append(next_states)

    flat_transitions = list(itertools.chain.from_iterable(episodes))

    # Predict values using a recurrent model
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        rs = model.concatenate_recurrent_states(
            [ep[0]['recurrent_state'] for ep in episodes])
        next_rs = model.concatenate_recurrent_states(
            [ep[0]['next_recurrent_state'] for ep in episodes])
        assert len(rs) == len(next_rs)

        (flat_distribs, flat_vs), _ = model.n_step_forward(
            seqs_states, recurrent_state=rs, output_mode='concat')
        (_, flat_next_vs), _ = model.n_step_forward(
            seqs_next_states, recurrent_state=next_rs, output_mode='concat')

        flat_actions = xp.array([b['action'] for b in flat_transitions])
        flat_log_probs = flat_distribs.log_prob(flat_actions)
        flat_log_probs = chainer.cuda.to_cpu(flat_log_probs.array)
        flat_vs = chainer.cuda.to_cpu(flat_vs.array)
        flat_next_vs = chainer.cuda.to_cpu(flat_next_vs.array)

    # Add predicted values to transitions
    for transition, log_prob, v, next_v in zip(
            flat_transitions, flat_log_probs, flat_vs, flat_next_vs):
        transition['log_prob'] = float(log_prob)
        transition['v_pred'] = float(v)
        transition['next_v_pred'] = float(next_v)
def batch_experiences(experiences, xp, phi, gamma, batch_states=batch_states):
    """Takes a batch of k experiences, each of which contains j consecutive
    transitions, and vectorizes them, where j is between 1 and n.

    Args:
        experiences: list of experiences. Each experience is a list
            containing between 1 and n dicts containing
              - state (object): State
              - action (object): Action
              - reward (float): Reward
              - is_state_terminal (bool): True iff next state is terminal
              - next_state (object): Next state
        xp: Numpy compatible matrix library: e.g. Numpy or CuPy.
        phi: Preprocessing function
        gamma: discount factor
        batch_states: function that converts a list to a batch

    Returns:
        dict of batched transitions
    """
    batch_exp = {
        'state': batch_states(
            [elem[0]['state'] for elem in experiences], xp, phi),
        'action': xp.asarray([elem[0]['action'] for elem in experiences]),
        'reward': xp.asarray(
            [sum((gamma ** i) * exp[i]['reward'] for i in range(len(exp)))
             for exp in experiences],
            dtype=np.float32),
        'next_state': batch_states(
            [elem[-1]['next_state'] for elem in experiences], xp, phi),
        'is_state_terminal': xp.asarray(
            [any(transition['is_state_terminal'] for transition in exp)
             for exp in experiences],
            dtype=np.float32),
        'discount': xp.asarray(
            [(gamma ** len(elem)) for elem in experiences],
            dtype=np.float32),
    }
    if all(elem[-1]['next_action'] is not None for elem in experiences):
        batch_exp['next_action'] = xp.asarray(
            [elem[-1]['next_action'] for elem in experiences])
    return batch_exp
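# Illustrative usage sketch (not part of the original code) for
# batch_experiences above: two made-up "experiences", one with two
# consecutive transitions and one with a single terminal transition,
# batched with NumPy. _toy_batch_states is a hypothetical stand-in for
# batch_states that simply stacks phi(state); all values are made up.
def _toy_batch_states(states, xp, phi):
    return xp.asarray([phi(s) for s in states])

_toy_experiences = [
    [  # 2-step experience: reward -> r0 + gamma * r1, discount -> gamma ** 2
        dict(state=np.zeros(3, dtype=np.float32), action=0, reward=1.0,
             next_state=np.ones(3, dtype=np.float32),
             is_state_terminal=False, next_action=None),
        dict(state=np.ones(3, dtype=np.float32), action=1, reward=0.5,
             next_state=2 * np.ones(3, dtype=np.float32),
             is_state_terminal=False, next_action=None),
    ],
    [  # 1-step experience that ends the episode
        dict(state=np.zeros(3, dtype=np.float32), action=1, reward=-1.0,
             next_state=np.zeros(3, dtype=np.float32),
             is_state_terminal=True, next_action=None),
    ],
]

# batch = batch_experiences(_toy_experiences, xp=np, phi=lambda x: x,
#                           gamma=0.99, batch_states=_toy_batch_states)
# batch['reward']   -> [1.0 + 0.99 * 0.5, -1.0]
# batch['discount'] -> [0.99 ** 2, 0.99 ** 1]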
def update(self):
    xp = self.xp

    if self.standardize_advantages:
        all_advs = xp.array([b['adv'] for b in self.memory])
        mean_advs = xp.mean(all_advs)
        std_advs = xp.std(all_advs)

    target_model = copy.deepcopy(self.model)
    dataset_iter = chainer.iterators.SerialIterator(
        self.memory, self.minibatch_size)

    dataset_iter.reset()
    while dataset_iter.epoch < self.epochs:
        batch = dataset_iter.__next__()
        states = batch_states([b['state'] for b in batch], xp, self.phi)
        actions = xp.array([b['action'] for b in batch])
        distribs, vs_pred = self.model(states)
        with chainer.no_backprop_mode():
            target_distribs, _ = target_model(states)

        advs = xp.array([b['adv'] for b in batch], dtype=xp.float32)
        if self.standardize_advantages:
            advs = (advs - mean_advs) / std_advs

        self.optimizer.update(
            self._lossfun,
            distribs, vs_pred, distribs.log_prob(actions),
            vs_pred_old=xp.array(
                [b['v_pred'] for b in batch], dtype=xp.float32),
            target_log_probs=target_distribs.log_prob(actions),
            advs=advs,
            vs_teacher=xp.array(
                [b['v_teacher'] for b in batch], dtype=xp.float32),
        )
def act_and_train(self, obs, reward):
    if hasattr(self.model, 'obs_filter'):
        xp = self.xp
        b_state = batch_states([obs], xp, self.phi)
        self.model.obs_filter.experience(b_state)

    action, v = self._act(obs)

    # Update stats
    self.average_v += (
        (1 - self.average_v_decay) * (v[0] - self.average_v))

    if self.last_state is not None:
        self.last_episode.append({
            'state': self.last_state,
            'action': self.last_action,
            'reward': reward,
            'v_pred': self.last_v,
            'next_state': obs,
            'next_v_pred': v,
            'nonterminal': 1.0,
        })
    self.last_state = obs
    self.last_action = action
    self.last_v = v

    self._train()

    return action
def batch_experiences(experiences, xp, phi, batch_states=batch_states):
    return {
        'state': batch_states(
            [elem['state'] for elem in experiences], xp, phi),
        'action': xp.asarray([elem['action'] for elem in experiences]),
        'reward': xp.asarray(
            [elem['reward'] for elem in experiences], dtype=np.float32),
        'next_state': batch_states(
            [elem['next_state'] for elem in experiences], xp, phi),
        'next_action': xp.asarray(
            [elem['next_action'] for elem in experiences]),
        'is_state_terminal': xp.asarray(
            [elem['is_state_terminal'] for elem in experiences],
            dtype=np.float32),
    }
def sample_from_policy(env, model, obs_normalizer):
    xp = np
    phi = lambda x: x

    states = []
    actions = []

    obs = env.reset().astype('float32')  # Initial state
    states.append(obs)
    done = False
    while not done:
        b_state = batch_states([obs], xp, phi)
        b_state = obs_normalizer(b_state, update=False)
        with chainer.using_config('train', False), \
                chainer.no_backprop_mode():
            action_distrib, _ = model(b_state)
            action = chainer.cuda.to_cpu(action_distrib.sample().array)[0]
        actions.append(action)
        new_obs, _, done, _ = env.step(action)
        obs = new_obs.astype('float32')
        if not done:
            states.append(obs)

    return states, actions
def sample_probs_and_actions_from_policy(env, model, obs_normalizer,
                                         initial_state=None):
    xp = np
    phi = lambda x: x

    probs = []
    actions = []

    obs = env.reset().astype('float32')  # Initial state
    if initial_state is not None:
        env.state = initial_state
        obs = np.array(initial_state).astype('float32')
    done = False
    while not done:
        b_state = batch_states([obs], xp, phi)
        if obs_normalizer:
            b_state = obs_normalizer(b_state, update=False)
        with chainer.using_config('train', False), \
                chainer.no_backprop_mode():
            action_distrib, _ = model(b_state)
            action = chainer.cuda.to_cpu(action_distrib.sample().array)[0]
        probs.append(action_distrib.all_prob.data[0][-1])
        actions.append(action)
        new_obs, _, done, _ = env.step(action)
        obs = new_obs.astype('float32')

    return probs, actions
def act_and_train(self, state, reward):
    xp = self.xp
    b_state = batch_states([state], xp, self.phi)

    if self.obs_normalizer:
        b_state = self.obs_normalizer(b_state, update=False)

    # action_distrib will be recomputed when computing gradients
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        action_distrib = self.policy(b_state)
        action = chainer.cuda.to_cpu(action_distrib.sample().data)[0]
        self.entropy_record.append(float(action_distrib.entropy.data))
    self.logger.debug('action_distrib: %s', action_distrib)
    self.logger.debug('action: %s', action)

    if self.last_state is not None:
        self.last_episode.append({
            'state': self.last_state,
            'action': self.last_action,
            'reward': reward,
            'next_state': state,
            'nonterminal': 1.0,
        })
    self.last_state = state
    self.last_action = action

    self._update_if_dataset_is_ready()

    return action
def _act(self, state):
    xp = self.xp
    with chainer.using_config('train', False):
        b_state = batch_states([state], xp, self.phi)
        with chainer.no_backprop_mode():
            action_distrib, v = self.model(b_state)
            action = action_distrib.sample()
        return cuda.to_cpu(action.data)[0], cuda.to_cpu(v.data)[0]
def act(self, state):
    xp = self.xp
    b_state = batch_states([state], xp, self.phi)
    if self.obs_normalizer:
        b_state = self.obs_normalizer(b_state, update=False)
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        action_distrib = self.policy(b_state)
        if self.act_deterministically:
            action = chainer.cuda.to_cpu(
                action_distrib.most_probable.data)[0]
        else:
            action = chainer.cuda.to_cpu(action_distrib.sample().data)[0]
    return action
def _update_policy(self, dataset):
    """Update the policy using a given dataset.

    The policy is updated via CG and line search.
    """
    assert 'state' in dataset[0]
    assert 'action' in dataset[0]
    assert 'adv' in dataset[0]

    # Use full-batch
    xp = self.policy.xp
    states = batch_states([b['state'] for b in dataset], xp, self.phi)
    if self.obs_normalizer:
        states = self.obs_normalizer(states, update=False)
    actions = xp.array([b['action'] for b in dataset])
    advs = xp.array([b['adv'] for b in dataset], dtype=np.float32)
    if self.standardize_advantages:
        mean_advs = xp.mean(advs)
        std_advs = xp.std(advs)
        advs = (advs - mean_advs) / (std_advs + 1e-8)

    # Recompute action distributions for batch backprop
    action_distrib = self.policy(states)

    log_prob_old = xp.array(
        [transition['log_prob'] for transition in dataset],
        dtype=np.float32)

    gain = self._compute_gain(
        log_prob=action_distrib.log_prob(actions),
        log_prob_old=log_prob_old,
        entropy=action_distrib.entropy,
        advs=advs)

    # Distribution to compute KL div against
    action_distrib_old = action_distrib.copy()

    full_step = self._compute_kl_constrained_step(
        action_distrib=action_distrib,
        action_distrib_old=action_distrib_old,
        gain=gain)

    self._line_search(
        full_step=full_step,
        dataset=dataset,
        advs=advs,
        action_distrib_old=action_distrib_old,
        gain=gain)
def batch_trajectory(trajectory, xp, phi, gamma, batch_states=batch_states):
    batch_tr = {
        'state': batch_states(
            [elem['state'] for elem in trajectory], xp, phi),
        'action': np.asarray(
            [elem['action'] for elem in trajectory], dtype=np.int32),
        'reward': np.asarray(
            [elem['reward'] for elem in trajectory], dtype=np.float32),
        'is_state_terminal': np.asarray(
            [elem['is_state_terminal'] for elem in trajectory],
            dtype=np.float32),
        'embedding': [elem['embedding'] for elem in trajectory],
    }
    return batch_tr
def backup_store_if_necessary(self, embedding, t):
    if self.model.lambdas == 0 or self.model.lambdas == 1:
        return
    if ((t % self.periodic_steps == 0)
            and (self.t >= self.replay_buffer.capacity)):
        self.replay_buffer.update_embedding()
        trajectories = self.replay_buffer.lookup(embedding)
        batch_trajectory = [
            {
                'state': batch_states(
                    [elem[0]['state'] for elem in traject],
                    self.xp, self.phi),
                'action': [elem[0]['action'] for elem in traject],
                'reward': [elem[0]['reward'] for elem in traject],
                'embedding': [elem[0]['embedding'] for elem in traject],
            }
            for traject in trajectories]
        qnp, embeddings = self._trajectory_centric_planning(
            batch_trajectory)
        self.value_buffer.store(embeddings, qnp)
def _update_vf(self, dataset):
    """Update the value function using a given dataset.

    The value function is updated via SGD to minimize TD(lambda) errors.
    """
    xp = self.vf.xp

    assert 'state' in dataset[0]
    assert 'v_teacher' in dataset[0]

    dataset_iter = chainer.iterators.SerialIterator(
        dataset, self.vf_batch_size)

    while dataset_iter.epoch < self.vf_epochs:
        batch = dataset_iter.__next__()
        states = batch_states([b['state'] for b in batch], xp, self.phi)
        if self.obs_normalizer:
            states = self.obs_normalizer(states, update=False)
        vs_teacher = xp.array(
            [b['v_teacher'] for b in batch], dtype=xp.float32)
        vs_pred = self.vf(states)
        vf_loss = F.mean_squared_error(vs_pred, vs_teacher[..., None])
        self.vf_optimizer.update(lambda: vf_loss)
def _update_obs_normalizer(self, dataset):
    assert self.obs_normalizer
    states = batch_states(
        [b['state'] for b in dataset], self.obs_normalizer.xp, self.phi)
    self.obs_normalizer.experience(states)