def save_rnd_images(self, dir_path=None):
    if dir_path is None:
        dir_path = os.path.join(
            self.parent_level_manager.parent_graph_manager.task_parameters.experiment_path, 'rnd_images')
    else:
        dir_path = os.path.join(dir_path, 'rnd_images')
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)

    transitions = self.memory.transitions
    dataset = Batch(transitions)
    batch_size = self.ap.algorithm.rnd_batch_size

    novelties = []
    for i in range(int(dataset.size / batch_size)):
        start = i * batch_size
        end = (i + 1) * batch_size
        batch = Batch(dataset[start:end])
        novelty = self.calculate_novelty(batch)
        novelties.append(novelty)
    novelties = np.concatenate(novelties)

    sorted_indices = np.argsort(novelties)
    sample_indices = sorted_indices[np.round(np.linspace(0, len(sorted_indices) - 1, 100)).astype(np.uint32)]

    images = []
    for si in sample_indices:
        images.append(np.flip(transitions[si].next_state[self.ap.algorithm.env_obs_key], 0))

    rows = []
    for i in range(10):
        rows.append(np.hstack(images[(i * 10):((i + 1) * 10)]))
    image = np.vstack(rows)
    image = Image.fromarray(image)
    image.save('{}/{}_{}.jpeg'.format(dir_path, 'rnd_samples', len(transitions)))
def train_rnd(self):
    if self.memory.num_transitions() == 0:
        return

    transitions = self.memory.transitions[-self.ap.algorithm.rnd_sample_size:]
    dataset = Batch(transitions)
    dataset_order = list(range(dataset.size))
    batch_size = self.ap.algorithm.rnd_batch_size

    for epoch in range(self.ap.algorithm.rnd_optimization_epochs):
        shuffle(dataset_order)
        total_loss = 0
        total_grads = 0
        for i in range(int(dataset.size / batch_size)):
            start = i * batch_size
            end = (i + 1) * batch_size

            batch = Batch(list(np.array(dataset.transitions)[dataset_order[start:end]]))
            inputs = self.prepare_rnd_inputs(batch)

            const_embedding = self.networks['constant'].online_network.predict(inputs)

            res = self.networks['predictor'].train_and_sync_networks(inputs, [const_embedding])
            total_loss += res[0]
            total_grads += res[2]

        screen.log_dict(
            OrderedDict([
                ("training epoch", epoch),
                ("dataset size", dataset.size),
                ("mean loss", total_loss / dataset.size),
                ("mean gradients", total_grads / dataset.size)
            ]),
            prefix="RND Training"
        )
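# --- Illustrative sketch (not part of the agent's code) ---
# The idea behind the constant/predictor pair trained above, assuming plain numpy and a
# hypothetical linear "network" for both the frozen random target and the trained predictor:
# the predictor is regressed onto the frozen target's embedding, and the prediction error
# that remains afterwards serves as the novelty signal (it tends to be larger for states
# unlike those seen during training). All names here are made up for this sketch.
import numpy as np

def rnd_novelty_sketch(train_states, eval_states, num_steps=100, lr=1e-2, embedding_size=8, seed=0):
    """Fit a linear predictor to a frozen random linear target, return per-state prediction error."""
    rng = np.random.RandomState(seed)
    obs_size = train_states.shape[1]
    target_w = rng.randn(obs_size, embedding_size)       # frozen, randomly initialized "target" network
    predictor_w = np.zeros((obs_size, embedding_size))   # predictor trained to imitate the target
    for _ in range(num_steps):
        residual = train_states.dot(predictor_w) - train_states.dot(target_w)
        grad = train_states.T.dot(residual) / len(train_states)   # gradient of the MSE loss
        predictor_w -= lr * grad
    return np.mean((eval_states.dot(predictor_w) - eval_states.dot(target_w)) ** 2, axis=1)

seen = np.random.RandomState(1).randn(256, 4)       # states similar to what the predictor trained on
unseen = 5 + np.random.RandomState(2).randn(32, 4)  # states from a region the predictor never saw
print(rnd_novelty_sketch(seen, seen).mean(), rnd_novelty_sketch(seen, unseen).mean())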
def improve_reward_model(self, epochs: int):
    """
    Train a reward model to be used by the doubly-robust estimator

    :param epochs: The total number of epochs to use for training a reward model
    :return: None
    """
    batch_size = self.ap.network_wrappers['reward_model'].batch_size
    network_keys = self.ap.network_wrappers['reward_model'].input_embedders_parameters.keys()

    # this is fitted from the training dataset
    for epoch in range(epochs):
        loss = 0
        for i, batch in enumerate(self.call_memory('get_shuffled_data_generator', batch_size)):
            batch = Batch(batch)
            current_rewards_prediction_for_all_actions = self.networks['reward_model'].online_network.predict(
                batch.states(network_keys))
            current_rewards_prediction_for_all_actions[range(batch_size), batch.actions()] = batch.rewards()
            loss += self.networks['reward_model'].train_and_sync_networks(
                batch.states(network_keys), current_rewards_prediction_for_all_actions)[0]
            # print(self.networks['reward_model'].online_network.predict(batch.states(network_keys))[0])

        log = OrderedDict()
        log['Epoch'] = epoch
        log['loss'] = loss / int(self.call_memory('num_transitions_in_complete_episodes') / batch_size)
        screen.log_dict(log, prefix='Training Reward Model')
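# --- Illustrative sketch (not part of the agent's code) ---
# Why the regression target above starts from the network's own predictions, shown on a toy
# numpy batch: only the action actually taken in each transition has an observed reward, so
# its entry is overwritten with that reward while every other action keeps its current
# prediction and therefore contributes no error (and no gradient).
import numpy as np

predictions = np.array([[0.1, 0.4, -0.2],   # predicted reward per action, one row per transition
                        [0.3, 0.0,  0.5]])
actions = np.array([1, 2])                  # action taken in each transition
rewards = np.array([1.0, -1.0])             # observed reward for the taken action

targets = predictions.copy()
targets[range(len(actions)), actions] = rewards
print(targets)
# [[ 0.1  1.  -0.2]
#  [ 0.3  0.  -1. ]]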
def train(self):
    if self._should_train():
        for network in self.networks.values():
            network.set_is_training(True)

        dataset = self.memory.transitions
        dataset = self.pre_network_filter.filter(dataset, deep_copy=False)
        batch = Batch(dataset)

        for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
            self.networks['main'].sync()
            self.fill_advantages(batch)

            # take only the requested number of steps
            if isinstance(self.ap.algorithm.num_consecutive_playing_steps, EnvironmentSteps):
                dataset = dataset[:self.ap.algorithm.num_consecutive_playing_steps.num_steps]
            shuffle(dataset)
            batch = Batch(dataset)

            self.train_network(batch, self.ap.algorithm.optimization_epochs)

        for network in self.networks.values():
            network.set_is_training(False)

        self.post_training_commands()
        self.training_iteration += 1
        # should be done in order to update the data that has been accumulated * while not playing *
        self.update_log()
        return None
def train_value_network(self, dataset, epochs):
    loss = []
    batch = Batch(dataset)
    network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()

    # * Found not to have any impact *
    # add a timestep to the observation
    # current_states_with_timestep = self.concat_state_and_timestep(dataset)

    mix_fraction = self.ap.algorithm.value_targets_mix_fraction
    total_returns = batch.n_step_discounted_rewards(True)
    for j in range(epochs):
        curr_batch_size = batch.size
        if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
            curr_batch_size = self.ap.network_wrappers['critic'].batch_size
        for i in range(batch.size // curr_batch_size):
            # split to batches for first order optimization techniques
            current_states_batch = {
                k: v[i * curr_batch_size:(i + 1) * curr_batch_size]
                for k, v in batch.states(network_keys).items()
            }
            total_return_batch = total_returns[i * curr_batch_size:(i + 1) * curr_batch_size]
            old_policy_values = force_list(
                self.networks['critic'].target_network.predict(current_states_batch).squeeze())
            if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
                targets = total_return_batch
            else:
                current_values = self.networks['critic'].online_network.predict(current_states_batch)
                targets = current_values * (1 - mix_fraction) + total_return_batch * mix_fraction

            inputs = copy.copy(current_states_batch)
            for input_index, input in enumerate(old_policy_values):
                name = 'output_0_{}'.format(input_index)
                if name in self.networks['critic'].online_network.inputs:
                    inputs[name] = input

            value_loss = self.networks['critic'].online_network.accumulate_gradients(inputs, targets)

            self.networks['critic'].apply_gradients_to_online_network()
            if isinstance(self.ap.task_parameters, DistributedTaskParameters):
                self.networks['critic'].apply_gradients_to_global_network()
            self.networks['critic'].online_network.reset_accumulated_gradients()

            loss.append([value_loss[0]])
    loss = np.mean(loss, 0)
    return loss
def get_reward_model_loss(self, batch: Batch):
    network_keys = self.ap.network_wrappers['reward_model'].input_embedders_parameters.keys()
    current_rewards_prediction_for_all_actions = self.networks['reward_model'].online_network.predict(
        batch.states(network_keys))
    current_rewards_prediction_for_all_actions[range(batch.size), batch.actions()] = batch.rewards()

    return self.networks['reward_model'].train_and_sync_networks(
        batch.states(network_keys), current_rewards_prediction_for_all_actions)[0]
def gather_static_shared_stats(self, evaluation_dataset_as_transitions: List[Transition], batch_size: int,
                               reward_model: Architecture, network_keys: List) -> None:
    all_reward_model_rewards = []
    all_old_policy_probs = []
    all_rewards = []
    all_actions = []

    for i in range(math.ceil(len(evaluation_dataset_as_transitions) / batch_size)):
        batch = evaluation_dataset_as_transitions[i * batch_size:(i + 1) * batch_size]
        batch_for_inference = Batch(batch)

        all_reward_model_rewards.append(reward_model.predict(batch_for_inference.states(network_keys)))
        all_rewards.append(batch_for_inference.rewards())
        all_actions.append(batch_for_inference.actions())
        all_old_policy_probs.append(
            batch_for_inference.info('all_action_probabilities')[
                range(len(batch_for_inference.actions())), batch_for_inference.actions()])

    self.all_reward_model_rewards = np.concatenate(all_reward_model_rewards, axis=0)
    self.all_old_policy_probs = np.concatenate(all_old_policy_probs, axis=0)
    self.all_rewards = np.concatenate(all_rewards, axis=0)
    self.all_actions = np.concatenate(all_actions, axis=0)

    # mark that the static shared data was collected and is ready to be used
    self.is_gathered_static_shared_data = True
def train_off_policy(self):
    loss = 0

    # TODO: this should be network dependent!
    network_parameters = list(self.ap.network_wrappers.values())[0]

    # update counters
    self.training_iteration += 1

    # sample a batch and train on it
    batch = self.call_memory('sample', network_parameters.batch_size)
    if self.pre_network_filter is not None:
        batch = self.pre_network_filter.filter(batch, update_internal_state=False, deep_copy=False)

    # if the batch returned empty then there are not enough samples in the replay buffer -> skip
    # training step
    if len(batch) > 0:
        # train
        batch = Batch(batch)
        total_loss, losses, unclipped_grads = self.learn_from_batch_off_policy(batch)
        loss += total_loss

        self.unclipped_grads.add_sample(unclipped_grads)
        self.loss.add_sample(loss)

    return loss
def train(self):
    episode = self.current_episode_buffer

    # check if we should calculate gradients or skip
    num_steps_passed_since_last_update = episode.length() - self.last_gradient_update_step_idx
    is_t_max_steps_passed = num_steps_passed_since_last_update >= self.ap.algorithm.num_steps_between_gradient_updates
    if not (is_t_max_steps_passed or episode.is_complete):
        return 0

    total_loss = 0
    if num_steps_passed_since_last_update > 0:
        for network in self.networks.values():
            network.set_is_training(True)

        # we need to update the returns of the episode until now
        episode.update_returns()

        # get t_max transitions, or fewer if we got to a terminal state.
        # will be used for both actor-critic and vanilla PG.
        # in order to get full episodes, vanilla PG will set the end_idx to a very big value.
        transitions = episode[self.last_gradient_update_step_idx:]
        batch = Batch(transitions)

        # move the pointer for the last update step
        if episode.is_complete:
            self.last_gradient_update_step_idx = 0
        else:
            self.last_gradient_update_step_idx = episode.length()

        # update the statistics for the variance reduction techniques
        if self.policy_gradient_rescaler in \
                [PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE,
                 PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP]:
            self.update_episode_statistics(episode)

        # accumulate the gradients
        total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
        print(total_loss, losses, unclipped_grads)

        # apply the gradients once every apply_gradients_every_x_episodes episodes
        if self.current_episode % self.ap.algorithm.apply_gradients_every_x_episodes == 0:
            for network in self.networks.values():
                network.apply_gradients_and_sync_networks()
            self.training_iteration += 1

        for network in self.networks.values():
            network.set_is_training(False)

        # run additional commands after the training is done
        self.post_training_commands()

    return total_loss
def train(self):
    episode = self.get_current_episode()

    # check if we should calculate gradients or skip
    episode_ended = episode.is_complete
    num_steps_passed_since_last_update = episode.length() - self.last_gradient_update_step_idx
    is_t_max_steps_passed = num_steps_passed_since_last_update >= self.ap.algorithm.num_steps_between_gradient_updates
    if not (is_t_max_steps_passed or episode_ended):
        return 0

    total_loss = 0
    if num_steps_passed_since_last_update > 0:
        # we need to update the returns of the episode until now
        episode.update_returns()

        # get t_max transitions, or fewer if we got to a terminal state.
        # will be used for both actor-critic and vanilla PG.
        # in order to get full episodes, vanilla PG will set the end_idx to a very big value.
        transitions = []
        start_idx = self.last_gradient_update_step_idx
        end_idx = episode.length()
        for idx in range(start_idx, end_idx):
            transitions.append(episode.get_transition(idx))
        self.last_gradient_update_step_idx = end_idx

        # update the statistics for the variance reduction techniques
        if self.policy_gradient_rescaler in \
                [PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE,
                 PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP]:
            self.update_episode_statistics(episode)

        # accumulate the gradients and apply them once every apply_gradients_every_x_episodes episodes
        batch = Batch(transitions)
        total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
        if self.current_episode % self.ap.algorithm.apply_gradients_every_x_episodes == 0:
            for network in self.networks.values():
                network.apply_gradients_and_sync_networks()
            self.training_iteration += 1

    # move the pointer to the next episode start and discard the episode.
    if episode_ended:
        # we need to remove the episode, because the next training iteration will be called before storing any
        # additional transitions in the memory (we don't store a transition for the first call to observe), so the
        # length of the memory won't be enforced and the old episode won't be removed
        self.call_memory('remove_episode', 0)
        self.last_gradient_update_step_idx = 0

    return total_loss
def generate_goal(self):
    if self.memory.num_transitions() == 0:
        return

    transitions = list(np.random.choice(self.memory.transitions,
                                        min(self.ap.algorithm.rnd_sample_size, self.memory.num_transitions()),
                                        replace=False))
    dataset = Batch(transitions)
    batch_size = self.ap.algorithm.rnd_batch_size

    self.goal = dataset[0]
    max_novelty = 0
    for i in range(int(dataset.size / batch_size)):
        start = i * batch_size
        end = (i + 1) * batch_size
        novelty = self.calculate_novelty(Batch(dataset[start:end]))
        curr_max = np.max(novelty)
        if curr_max > max_novelty:
            max_novelty = curr_max
            idx = start + np.argmax(novelty)
            self.goal = dataset[idx]
def learn_from_batch(self, batch):
    # perform on-policy training iteration
    total_loss, losses, unclipped_grads = self._learn_from_batch(batch)

    if self.ap.algorithm.ratio_of_replay > 0 \
            and self.memory.num_transitions() > self.ap.algorithm.num_transitions_to_start_replay:
        n = np.random.poisson(self.ap.algorithm.ratio_of_replay)
        # perform n off-policy training iterations
        for _ in range(n):
            new_batch = Batch(self.call_memory('sample',
                                               (self.ap.algorithm.num_steps_between_gradient_updates, True)))
            result = self._learn_from_batch(new_batch)
            total_loss += result[0]
            losses += result[1]
            unclipped_grads += result[2]

    return total_loss, losses, unclipped_grads
def fill_advantages(self, batch):
    batch = Batch(batch)
    network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()

    # * Found not to have any impact *
    # current_states_with_timestep = self.concat_state_and_timestep(batch)

    current_state_values = self.networks['critic'].online_network.predict(batch.states(network_keys)).squeeze()
    total_returns = batch.n_step_discounted_rewards()

    # calculate advantages
    advantages = []
    if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
        advantages = total_returns - current_state_values
    elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
        # get bootstraps
        episode_start_idx = 0
        advantages = np.array([])
        # current_state_values[batch.game_overs()] = 0
        for idx, game_over in enumerate(batch.game_overs()):
            if game_over:
                # get advantages for the rollout
                value_bootstrapping = np.zeros((1,))
                rollout_state_values = np.append(current_state_values[episode_start_idx:idx + 1],
                                                 value_bootstrapping)
                rollout_advantages, _ = \
                    self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx + 1],
                                                                 rollout_state_values)
                episode_start_idx = idx + 1
                advantages = np.append(advantages, rollout_advantages)
    else:
        screen.warning("WARNING: The requested policy gradient rescaler is not available")

    # standardize
    advantages = (advantages - np.mean(advantages)) / np.std(advantages)

    # TODO: this will be problematic with a shared memory
    for transition, advantage in zip(self.memory.transitions, advantages):
        transition.info['advantage'] = advantage

    self.action_advantages.add_sample(advantages)
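# --- Illustrative sketch (not part of the agent's code) ---
# A minimal numpy version of the per-rollout generalized advantage estimation used above,
# assuming a zero bootstrap value for the terminal state (the same convention as the zero
# value_bootstrapping appended above): delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and
# A_t = sum_l (gamma * lam)^l * delta_{t+l}. The helper name is made up for this sketch.
import numpy as np

def gae_sketch(rewards, state_values, gamma=0.99, lam=0.95):
    """state_values has one more entry than rewards (the bootstrap value for the last state)."""
    deltas = rewards + gamma * state_values[1:] - state_values[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages

rewards = np.array([1.0, 0.0, 0.0, 1.0])
values = np.array([0.5, 0.4, 0.3, 0.2, 0.0])   # last entry is the terminal bootstrap value
print(gae_sketch(rewards, values))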
def handle_self_supervised_reward(self, batch):
    batch_size = self.ap.network_wrappers['actor'].batch_size
    episode_indices = np.random.randint(self.memory.num_complete_episodes(), size=batch_size)
    transitions = []
    for e_idx in episode_indices:
        episode = self.memory.get_all_complete_episodes()[e_idx]
        transition_idx = np.random.randint(episode.length())
        t = copy.copy(episode[transition_idx])
        if np.random.rand(1) < self.ap.algorithm.identity_goal_sample_rate:
            t.state[self.ap.algorithm.agent_obs_key] = self.concat_goal(t.state, t.state)
            # this doesn't matter for learning but is set anyway so that the agent can pass it through the network
            t.next_state[self.ap.algorithm.agent_obs_key] = self.concat_goal(t.next_state, t.state)
            t.game_over = True
            t.reward = 0
            t.action = np.zeros_like(t.action)
        else:
            if transition_idx == episode.length() - 1:
                goal = t
                t.state[self.ap.algorithm.agent_obs_key] = self.concat_goal(t.state, t.next_state)
                t.next_state[self.ap.algorithm.agent_obs_key] = self.concat_goal(t.next_state, t.next_state)
            else:
                goal_idx = np.random.randint(transition_idx, episode.length())
                goal = episode.transitions[goal_idx]
                t.state[self.ap.algorithm.agent_obs_key] = self.concat_goal(
                    t.state, episode.transitions[goal_idx].next_state)
                t.next_state[self.ap.algorithm.agent_obs_key] = self.concat_goal(
                    t.next_state, episode.transitions[goal_idx].next_state)

            camera_equal = np.alltrue(np.equal(t.next_state[self.ap.algorithm.env_obs_key],
                                               goal.next_state[self.ap.algorithm.env_obs_key]))
            measurements_equal = np.alltrue(np.isclose(t.next_state['measurements'],
                                                       goal.next_state['measurements']))
            t.game_over = camera_equal and measurements_equal
            t.reward = -1
        transitions.append(t)

    return Batch(transitions)
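# --- Illustrative sketch (not part of the agent's code) ---
# The goal relabeling performed above in miniature, assuming a toy episode of 1-D
# observations: a transition is paired with a goal drawn from a later step of the same
# episode, the relabeled reward stays -1 on every step, and the transition becomes terminal
# once the relabeled goal is reached. The helper name `relabel_with_future_goal` and the
# toy data are made up for this sketch.
import numpy as np

def relabel_with_future_goal(observations, transition_idx, rng):
    """Relabel one transition with a goal taken from a later step of the same episode.

    observations: one observation per step; transition_idx must point at a non-final step.
    Returns (observation, goal, reward, done) for the relabeled transition.
    """
    goal_idx = rng.randint(transition_idx, len(observations) - 1)  # a current-or-later transition
    goal = observations[goal_idx + 1]                              # the observation that transition reached
    next_obs = observations[transition_idx + 1]
    done = np.allclose(next_obs, goal)   # terminal once the relabeled goal is reached
    reward = -1.0                        # reward stays -1 for relabeled transitions, as above
    return observations[transition_idx], goal, reward, done

episode_observations = np.arange(6, dtype=float)   # a toy episode with 1-D observations 0..5
print(relabel_with_future_goal(episode_observations, 2, np.random.RandomState(0)))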
def improve_reward_model(self, epochs: int):
    """
    Train a reward model to be used by the doubly-robust estimator

    :param epochs: The total number of epochs to use for training a reward model
    :return: None
    """
    batch_size = self.ap.network_wrappers['reward_model'].batch_size

    # this is fitted from the training dataset
    for epoch in range(epochs):
        loss = 0
        total_transitions_processed = 0
        for i, batch in enumerate(self.call_memory('get_shuffled_training_data_generator', batch_size)):
            batch = Batch(batch)
            loss += self.get_reward_model_loss(batch)
            total_transitions_processed += batch.size

        log = OrderedDict()
        log['Epoch'] = epoch
        log['loss'] = loss / total_transitions_processed
        screen.log_dict(log, prefix='Training Reward Model')
def train_multiagent(self, agents):
    # we overwrite train() to handle the multi-agent case
    # return Agent.train(self)
    loss = 0
    if self._should_train():
        if self.ap.is_batch_rl_training:
            # when training an agent for generating a dataset in batch-rl, we don't want it to be counted as part of
            # the training epochs. we only care for training epochs in batch-rl anyway.
            self.training_epoch += 1
        for network in self.networks.values():
            network.set_is_training(True)

        # at the moment we only support a single batch size for all the networks
        networks_parameters = list(self.ap.network_wrappers.values())
        assert all(net.batch_size == networks_parameters[0].batch_size for net in networks_parameters)
        batch_size = networks_parameters[0].batch_size

        # get prepared for sample_with_index
        transitions_idx = np.random.randint(self.num_transitions_in_complete_episodes(), size=batch_size)

        # get prepared for get_shuffled_training_data_generator_with_index
        # we assume that all agents have the same get_last_training_set_transition_id
        shuffled_transition_indices = list(range(self.memory.get_last_training_set_transition_id()))
        random.shuffle(shuffled_transition_indices)

        # we either go sequentially through the entire replay buffer in the batch RL mode,
        # or sample randomly for the basic RL case.
        training_schedules = []
        for i in range(self.n):
            if self.ap.is_batch_rl_training:
                training_schedules.append(agents[i].call_memory(
                    'get_shuffled_training_data_generator_with_index', batch_size, shuffled_transition_indices))
            else:
                training_schedules.append([
                    agents[i].call_memory('sample_with_index', transitions_idx)
                    for _ in range(self.ap.algorithm.num_consecutive_training_steps)
                ])
        training_schedule = training_schedules[self.agent_index]  # get this agent's own training_schedule

        # tmp_obs = np.array([])
        # tmp_act = np.array([])
        # tmp_next_obs = np.array([])
        # tmp_next_act = np.array([])
        # for i in range(self.n):
        #     actor_i = agents[i].networks['actor'+str(i)]
        #     tmp_next_act_all = actor_i.parallel_prediction(
        #         [(actor_i.online_network, training_schedules[i].states('observation'))])
        #     for tmp_batch in training_schedules[i]:
        #         tmp_obs = np.concatenate((tmp_obs, tmp_batch.state['observation']), axis=0) if tmp_obs.size else tmp_batch.state['observation']
        #         tmp_act = np.concatenate((tmp_act, tmp_batch.action), axis=0) if tmp_act.size else tmp_batch.action
        #         tmp_next_obs = np.concatenate((tmp_next_obs, tmp_batch.state['observation']), axis=0) if tmp_next_obs.size else tmp_batch.state['observation']
        #         tmp_next_act = np.concatenate((tmp_next_act, tmp_batch.state['observation']), axis=0) if tmp_next_act.size else tmp_batch.state['observation']

        tmp_curr_mean_act_all = []
        tmp_next_act_all = []
        for i in range(self.n):
            actor_i = agents[i].networks['actor' + str(i)]
            actor_keys = agents[i].ap.network_wrappers['actor' + str(i)].input_embedders_parameters.keys()
            tmp_curr_mean_act_all_i, tmp_next_act_all_i = actor_i.parallel_prediction([
                (actor_i.online_network, training_schedules[i].states(actor_keys)),
                (actor_i.target_network, training_schedules[i].next_states(actor_keys))])
            tmp_curr_mean_act_all.append(tmp_curr_mean_act_all_i)
            tmp_next_act_all.append(tmp_next_act_all_i)

        # update the training_schedule of the current agent
        for t in range(len(training_schedule)):
            tmp_obs = np.array([])
            tmp_act = np.array([])
            tmp_curr_mean_act = np.array([])
            tmp_next_obs = np.array([])
            tmp_next_act = np.array([])
            for i in range(self.n):
                tmp_batch = training_schedules[i][t]
                tmp_obs = np.concatenate((tmp_obs, tmp_batch.state['observation']),
                                         axis=0) if tmp_obs.size else tmp_batch.state['observation']
                tmp_act = np.concatenate((tmp_act, tmp_batch.action),
                                         axis=0) if tmp_act.size else tmp_batch.action
                tmp_curr_mean_act = np.concatenate(
                    (tmp_curr_mean_act, tmp_curr_mean_act_all[i][t]),
                    axis=0) if tmp_curr_mean_act.size else tmp_curr_mean_act_all[i][t]
                tmp_next_obs = np.concatenate(
                    (tmp_next_obs, tmp_batch.state['observation']),
                    axis=0) if tmp_next_obs.size else tmp_batch.state['observation']
                tmp_next_act = np.concatenate(
                    (tmp_next_act, tmp_next_act_all[i][t]),
                    axis=0) if tmp_next_act.size else tmp_next_act_all[i][t]

            # note that the difference between action_n and mean_action_n is that the former comes from the batch
            # data (off-policy), while the latter comes from the current network
            training_schedule[t].state['observation_n'] = tmp_obs
            training_schedule[t].state['action_n'] = tmp_act
            training_schedule[t].state['mean_action_n'] = tmp_curr_mean_act
            # training_schedule[t].action = tmp_act

            # we include both the joint observation and joint action in the "next_state"
            training_schedule[t].next_state['observation_n'] = tmp_next_obs
            training_schedule[t].next_state['action_n'] = tmp_next_act
            # new_info = {'action': tmp_act}
            # training_schedule[t].update_info(new_info)

        for batch in training_schedule:
            # update counters
            self.training_iteration += 1
            if self.pre_network_filter is not None:
                batch = self.pre_network_filter.filter(batch, update_internal_state=False, deep_copy=False)

            # if the batch returned empty then there are not enough samples in the replay buffer -> skip
            # training step
            if len(batch) > 0:
                # train
                batch = Batch(batch)
                total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
                loss += total_loss

                self.unclipped_grads.add_sample(unclipped_grads)

                # TODO: this only deals with the main network (if exists), need to do the same for other networks
                #  for instance, for DDPG, the LR signal is currently not shown. Probably should be done through the
                #  network directly instead of here
                # decay learning rate
                if 'main' in self.ap.network_wrappers and \
                        self.ap.network_wrappers['main'].learning_rate_decay_rate != 0:
                    self.curr_learning_rate.add_sample(
                        self.networks['main'].sess.run(self.networks['main'].online_network.current_learning_rate))
                else:
                    self.curr_learning_rate.add_sample(networks_parameters[0].learning_rate)

                if any([network.has_target for network in self.networks.values()]) \
                        and self._should_update_online_weights_to_target():
                    for network in self.networks.values():
                        network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)
                    self.agent_logger.create_signal_value('Update Target Network', 1)
                else:
                    self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)

                self.loss.add_sample(loss)

        if self.imitation:
            self.log_to_screen()

        if self.ap.visualization.dump_csv and \
                self.parent_level_manager.parent_graph_manager.time_metric == TimeTypes.Epoch:
            # in BatchRL, or imitation learning, the agent never acts, so we have to get the stats out here.
            # we dump the data out every epoch
            self.update_log()

        for network in self.networks.values():
            network.set_is_training(False)

        # run additional commands after the training is done
        self.post_training_commands()

    return loss
def train_policy_network(self, dataset, epochs):
    loss = []
    for j in range(epochs):
        loss = {
            'total_loss': [],
            'policy_losses': [],
            'unclipped_grads': [],
            'fetch_result': []
        }
        # shuffle(dataset)
        for i in range(len(dataset) // self.ap.network_wrappers['actor'].batch_size):
            batch = Batch(dataset[i * self.ap.network_wrappers['actor'].batch_size:
                                  (i + 1) * self.ap.network_wrappers['actor'].batch_size])

            network_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()

            advantages = batch.info('advantage')
            actions = batch.actions()
            if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) == 1:
                actions = np.expand_dims(actions, -1)

            # get old policy probabilities and distribution
            old_policy = force_list(self.networks['actor'].target_network.predict(batch.states(network_keys)))

            # calculate gradients and apply on both the local policy network and on the global policy network
            fetches = [self.networks['actor'].online_network.output_heads[0].kl_divergence,
                       self.networks['actor'].online_network.output_heads[0].entropy]

            inputs = copy.copy(batch.states(network_keys))
            inputs['output_0_0'] = actions

            # old_policy_distribution needs to be represented as a list, because in the event of discrete controls,
            # it has just a mean. otherwise, it has both a mean and standard deviation
            for input_index, input in enumerate(old_policy):
                inputs['output_0_{}'.format(input_index + 1)] = input

            total_loss, policy_losses, unclipped_grads, fetch_result = \
                self.networks['actor'].online_network.accumulate_gradients(
                    inputs, [advantages], additional_fetches=fetches)

            self.networks['actor'].apply_gradients_to_online_network()
            if isinstance(self.ap.task_parameters, DistributedTaskParameters):
                self.networks['actor'].apply_gradients_to_global_network()

            self.networks['actor'].online_network.reset_accumulated_gradients()

            loss['total_loss'].append(total_loss)
            loss['policy_losses'].append(policy_losses)
            loss['unclipped_grads'].append(unclipped_grads)
            loss['fetch_result'].append(fetch_result)

            self.unclipped_grads.add_sample(unclipped_grads)

        for key in loss.keys():
            loss[key] = np.mean(loss[key], 0)

        if self.ap.network_wrappers['critic'].learning_rate_decay_rate != 0:
            curr_learning_rate = self.networks['critic'].online_network.get_variable_value(self.ap.learning_rate)
            self.curr_learning_rate.add_sample(curr_learning_rate)
        else:
            curr_learning_rate = self.ap.network_wrappers['critic'].learning_rate

        # log training parameters
        screen.log_dict(
            OrderedDict([
                ("Surrogate loss", loss['policy_losses'][0]),
                ("KL divergence", loss['fetch_result'][0]),
                ("Entropy", loss['fetch_result'][1]),
                ("training epoch", j),
                ("learning_rate", curr_learning_rate)
            ]),
            prefix="Policy training"
        )

    self.total_kl_divergence_during_training_process = loss['fetch_result'][0]
    self.entropy.add_sample(loss['fetch_result'][1])
    self.kl_divergence.add_sample(loss['fetch_result'][0])
    return loss['total_loss']
def handle_episode_ended(self) -> None:
    super().handle_episode_ended()
    novelty = self.calculate_novelty(Batch(self.memory.get_last_complete_episode().transitions))
    self.rnd_stats.push_val(np.expand_dims(self.update_intrinsic_returns_estimate(novelty), -1))
def improve_reward_model(self, epochs: int):
    """
    Train both a reward model to be used by the doubly-robust estimator, and a model to be used for BCQ

    :param epochs: The total number of epochs to use for training a reward model
    :return: None
    """
    # we'll be assuming that these get drawn from the reward model parameters
    batch_size = self.ap.network_wrappers['reward_model'].batch_size
    network_keys = self.ap.network_wrappers['reward_model'].input_embedders_parameters.keys()

    # if using a NN to decide which actions to drop, we'll train the NN here
    if isinstance(self.ap.algorithm.action_drop_method_parameters, NNImitationModelParameters):
        total_epochs = max(epochs, self.ap.algorithm.action_drop_method_parameters.imitation_model_num_epochs)
    else:
        total_epochs = epochs

    for epoch in range(total_epochs):
        # this is fitted from the training dataset
        reward_model_loss = 0
        imitation_model_loss = 0
        total_transitions_processed = 0
        for i, batch in enumerate(self.call_memory('get_shuffled_training_data_generator', batch_size)):
            batch = Batch(batch)

            # reward model
            if epoch < epochs:
                reward_model_loss += self.get_reward_model_loss(batch)

            # imitation model
            if isinstance(self.ap.algorithm.action_drop_method_parameters, NNImitationModelParameters) and \
                    epoch < self.ap.algorithm.action_drop_method_parameters.imitation_model_num_epochs:
                target_actions = np.zeros((batch.size, len(self.spaces.action.actions)))
                target_actions[range(batch.size), batch.actions()] = 1
                imitation_model_loss += self.networks['imitation_model'].train_and_sync_networks(
                    batch.states(network_keys), target_actions)[0]

            total_transitions_processed += batch.size

        log = OrderedDict()
        log['Epoch'] = epoch
        if reward_model_loss:
            log['Reward Model Loss'] = reward_model_loss / total_transitions_processed
        if imitation_model_loss:
            log['Imitation Model Loss'] = imitation_model_loss / total_transitions_processed
        screen.log_dict(log, prefix='Training Batch RL Models')

    # if using a kNN based model, we'll initialize and build it here.
    # initialization cannot be moved to the constructor as we don't have the agent's spaces initialized yet.
    if isinstance(self.ap.algorithm.action_drop_method_parameters, KNNParameters):
        knn_size = self.ap.algorithm.action_drop_method_parameters.knn_size
        if self.ap.algorithm.action_drop_method_parameters.use_state_embedding_instead_of_state:
            self.knn_trees = [AnnoyDictionary(
                dict_size=knn_size,
                key_width=int(self.networks['reward_model'].online_network.state_embedding.shape[-1]),
                batch_size=knn_size) for _ in range(len(self.spaces.action.actions))]
        else:
            self.knn_trees = [AnnoyDictionary(
                dict_size=knn_size,
                key_width=self.spaces.state['observation'].shape[0],
                batch_size=knn_size) for _ in range(len(self.spaces.action.actions))]

        for i, knn_tree in enumerate(self.knn_trees):
            state_embeddings = self.embedding([transition.state for transition in self.memory.transitions
                                               if transition.action == i])
            knn_tree.add(
                keys=state_embeddings,
                values=np.expand_dims(np.zeros(state_embeddings.shape[0]), axis=1))

        for knn_tree in self.knn_trees:
            knn_tree._rebuild_index()

        self.average_dist = [[dist[0] for dist in knn_tree._get_k_nearest_neighbors_indices(
            keys=self.embedding([transition.state for transition in self.memory.transitions]),
            k=1)[0]] for knn_tree in self.knn_trees]
        self.average_dist = sum([x for l in self.average_dist for x in l])  # flatten and sum
        self.average_dist /= len(self.memory.transitions)
def train(self):
    """
    Check if a training phase should be done as configured by num_consecutive_playing_steps.
    If it should, then do several training steps as configured by num_consecutive_training_steps.
    A single training iteration: Sample a batch, train on it and update target networks.

    :return: The total training loss during the training iterations.
    """
    loss = 0
    if self._should_train():
        for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
            # TODO: this should be network dependent
            network_parameters = list(self.ap.network_wrappers.values())[0]

            # update counters
            self.training_iteration += 1

            # sample a batch and train on it
            batch = self.call_memory('sample', network_parameters.batch_size)
            if self.pre_network_filter is not None:
                batch = self.pre_network_filter.filter(batch, update_internal_state=False, deep_copy=False)

            # if the batch returned empty then there are not enough samples in the replay buffer -> skip
            # training step
            if len(batch) > 0:
                # train
                batch = Batch(batch)
                total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
                loss += total_loss

                self.unclipped_grads.add_sample(unclipped_grads)

                # TODO: the learning rate decay should be done through the network instead of here
                # decay learning rate
                if network_parameters.learning_rate_decay_rate != 0:
                    self.curr_learning_rate.add_sample(
                        self.networks['main'].sess.run(self.networks['main'].online_network.current_learning_rate))
                else:
                    self.curr_learning_rate.add_sample(network_parameters.learning_rate)

                if any([network.has_target for network in self.networks.values()]) \
                        and self._should_update_online_weights_to_target():
                    for network in self.networks.values():
                        network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)
                    self.agent_logger.create_signal_value('Update Target Network', 1)
                else:
                    self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)

                self.loss.add_sample(loss)

        if self.imitation:
            self.log_to_screen()

    # run additional commands after the training is done
    self.post_training_commands()

    return loss
def _prepare_ope_shared_stats(dataset_as_transitions: List[Transition], batch_size: int,
                              reward_model: Architecture, q_network: Architecture,
                              network_keys: List) -> OpeSharedStats:
    """
    Do the preparations needed for the different estimators.
    Some of the calculations are shared, so we centralize all the work here.

    :param dataset_as_transitions: The evaluation dataset in the form of transitions.
    :param batch_size: The batch size to use.
    :param reward_model: A reward model to be used by DR.
    :param q_network: The Q network whose policy we evaluate.
    :param network_keys: The network keys used for feeding the neural networks.
    :return:
    """
    # IPS
    all_reward_model_rewards, all_policy_probs, all_old_policy_probs = [], [], []
    all_v_values_reward_model_based, all_v_values_q_model_based, all_rewards, all_actions = [], [], [], []

    for i in range(math.ceil(len(dataset_as_transitions) / batch_size)):
        batch = dataset_as_transitions[i * batch_size:(i + 1) * batch_size]
        batch_for_inference = Batch(batch)

        all_reward_model_rewards.append(reward_model.predict(batch_for_inference.states(network_keys)))

        # we always use the first Q head to calculate OPEs. might want to change this in the future.
        # for instance, this means that for bootstrapped we always use the first QHead to calculate the OPEs.
        q_values, sm_values = q_network.predict(batch_for_inference.states(network_keys),
                                                outputs=[q_network.output_heads[0].q_values,
                                                         q_network.output_heads[0].softmax])

        all_policy_probs.append(sm_values)
        all_v_values_reward_model_based.append(np.sum(all_policy_probs[-1] * all_reward_model_rewards[-1], axis=1))
        all_v_values_q_model_based.append(np.sum(all_policy_probs[-1] * q_values, axis=1))
        all_rewards.append(batch_for_inference.rewards())
        all_actions.append(batch_for_inference.actions())
        all_old_policy_probs.append(
            batch_for_inference.info('all_action_probabilities')[
                range(len(batch_for_inference.actions())), batch_for_inference.actions()])

        for j, t in enumerate(batch):
            t.update_info({
                'q_value': q_values[j],
                'softmax_policy_prob': all_policy_probs[-1][j],
                'v_value_q_model_based': all_v_values_q_model_based[-1][j],
            })

    all_reward_model_rewards = np.concatenate(all_reward_model_rewards, axis=0)
    all_policy_probs = np.concatenate(all_policy_probs, axis=0)
    all_v_values_reward_model_based = np.concatenate(all_v_values_reward_model_based, axis=0)
    all_rewards = np.concatenate(all_rewards, axis=0)
    all_actions = np.concatenate(all_actions, axis=0)
    all_old_policy_probs = np.concatenate(all_old_policy_probs, axis=0)

    # generate model probabilities
    new_policy_prob = all_policy_probs[np.arange(all_actions.shape[0]), all_actions]
    rho_all_dataset = new_policy_prob / all_old_policy_probs

    return OpeSharedStats(all_reward_model_rewards, all_policy_probs, all_v_values_reward_model_based,
                          all_rewards, all_actions, all_old_policy_probs, new_policy_prob, rho_all_dataset)
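# --- Illustrative sketch (not part of the OPE code) ---
# What the rho_all_dataset ratios computed above feed into, assuming plain numpy: the
# inverse propensity scoring (IPS) estimate of the evaluated policy's average per-step
# reward reweights each logged reward by rho = pi_new(a|s) / pi_old(a|s). The toy numbers
# below are made up for this sketch.
import numpy as np

old_policy_probs = np.array([0.5, 0.25, 0.8])   # probability the behavior policy gave to the logged action
new_policy_probs = np.array([0.7, 0.10, 0.6])   # probability the evaluated policy gives to the same action
rewards = np.array([1.0, 0.0, 1.0])

rho = new_policy_probs / old_policy_probs
ips_estimate = np.mean(rho * rewards)
print(rho, ips_estimate)   # [1.4  0.4  0.75], (1.4*1.0 + 0.4*0.0 + 0.75*1.0) / 3 = 0.7166...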
def learn_from_batch(self, batch):
    network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()

    dataset = copy.deepcopy(self.memory.transitions)
    dataset = Batch(dataset)
    dataset.shuffle()

    if self.num_steps % 1024 == 0:
        for i in range(int(dataset.size / self.ap.network_wrappers['predictor'].batch_size)):
            start = i * self.ap.network_wrappers['predictor'].batch_size
            end = (i + 1) * self.ap.network_wrappers['predictor'].batch_size

            const_embedding = self.networks['constant'].online_network.predict(
                {k: v[start:end] for k, v in dataset.next_states(network_keys).items()})

            _ = self.networks['predictor'].train_and_sync_networks(
                copy.copy({k: v[start:end] for k, v in dataset.next_states(network_keys).items()}),
                [const_embedding])

    embedding = self.networks['constant'].online_network.predict(batch.next_states(network_keys))
    prediction = self.networks['predictor'].online_network.predict(batch.next_states(network_keys))

    prediction_error = np.mean((embedding - prediction) ** 2, axis=1)
    # self.rewards += list(prediction_error)

    # intrinsic_rewards = (prediction_error - np.mean(prediction_error)) / (np.std(prediction_error) + 1e-15)
    intrinsic_rewards = np.zeros_like(prediction_error)
    intrinsic_rewards[np.argmax(prediction_error)] = 1

    selected_actions = np.argmax(self.networks['main'].online_network.predict(batch.next_states(network_keys)), 1)

    q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
        (self.networks['main'].target_network, batch.next_states(network_keys)),
        (self.networks['main'].online_network, batch.states(network_keys))
    ])

    # initialize with the current prediction so that we will
    # only update the action that we have actually done in this transition
    TD_errors = []
    for i in range(self.ap.network_wrappers['main'].batch_size):
        new_target = intrinsic_rewards[i] + \
                     self.ap.algorithm.discount * q_st_plus_1[i][selected_actions[i]]
        TD_errors.append(np.abs(new_target - TD_targets[i, batch.actions()[i]]))
        TD_targets[i, batch.actions()[i]] = new_target

    # update errors in prioritized replay buffer
    importance_weights = self.update_transition_priorities_and_get_weights(TD_errors, batch)

    result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets,
                                                           importance_weights=importance_weights)
    total_loss, losses, unclipped_grads = result[:3]

    return total_loss, losses, unclipped_grads