Code example #1
    def save_rnd_images(self, dir_path=None):
        if dir_path is None:
            dir_path = os.path.join(self.parent_level_manager.parent_graph_manager.task_parameters.experiment_path,
                                    'rnd_images')
        else:
            dir_path = os.path.join(dir_path, 'rnd_images')
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)
        transitions = self.memory.transitions
        dataset = Batch(transitions)
        batch_size = self.ap.algorithm.rnd_batch_size
        novelties = []
        for i in range(int(dataset.size / batch_size)):
            start = i * batch_size
            end = (i + 1) * batch_size

            batch = Batch(dataset[start:end])
            novelty = self.calculate_novelty(batch)
            novelties.append(novelty)
        novelties = np.concatenate(novelties)
        sorted_indices = np.argsort(novelties)
        sample_indices = sorted_indices[np.round(np.linspace(0, len(sorted_indices) - 1, 100)).astype(np.uint32)]
        images = []
        for si in sample_indices:
            images.append(np.flip(transitions[si].next_state[self.ap.algorithm.env_obs_key], 0))
        rows = []
        for i in range(10):
            rows.append(np.hstack(images[(i * 10):((i + 1) * 10)]))
        image = np.vstack(rows)
        image = Image.fromarray(image)
        image.save('{}/{}_{}.jpeg'.format(dir_path, 'rnd_samples', len(transitions)))
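Several of the examples on this page (#1, #2, #11) iterate over a Batch in fixed-size slices. Below is a minimal sketch of that idiom as a standalone helper; it assumes rl_coach is installed, that Batch lives in rl_coach.core_types (as in recent Coach versions), and that `transitions` is a list of Coach Transition objects. The helper name and default batch size are illustrative.

    from rl_coach.core_types import Batch  # assumed import path

    def iter_minibatches(transitions, batch_size=64):
        # wrap the raw transitions once, then yield fixed-size Batch slices,
        # dropping the trailing remainder exactly as the examples above do
        dataset = Batch(transitions)
        for i in range(int(dataset.size / batch_size)):
            start, end = i * batch_size, (i + 1) * batch_size
            yield Batch(dataset[start:end])  # slicing a Batch returns transitions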
Code example #2
    def train_rnd(self):
        if self.memory.num_transitions() == 0:
            return

        transitions = self.memory.transitions[-self.ap.algorithm.rnd_sample_size:]
        dataset = Batch(transitions)
        dataset_order = list(range(dataset.size))
        batch_size = self.ap.algorithm.rnd_batch_size
        for epoch in range(self.ap.algorithm.rnd_optimization_epochs):
            shuffle(dataset_order)
            total_loss = 0
            total_grads = 0
            for i in range(int(dataset.size / batch_size)):
                start = i * batch_size
                end = (i + 1) * batch_size

                batch = Batch(list(np.array(dataset.transitions)[dataset_order[start:end]]))
                inputs = self.prepare_rnd_inputs(batch)

                const_embedding = self.networks['constant'].online_network.predict(inputs)

                res = self.networks['predictor'].train_and_sync_networks(inputs, [const_embedding])

                total_loss += res[0]
                total_grads += res[2]

            screen.log_dict(
                OrderedDict([
                    ("training epoch", epoch),
                    ("dataset size", dataset.size),
                    ("mean loss", total_loss / dataset.size),
                    ("mean gradients", total_grads / dataset.size)
                ]),
                prefix="RND Training"
            )
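For context, examples #2 and #22 implement Random Network Distillation (RND): a fixed, randomly initialized "constant" network defines a target embedding, a "predictor" network is trained to match it, and the squared prediction error serves as a novelty signal (states seen often are predicted well). A toy NumPy sketch of just that error computation, with illustrative shapes:

    import numpy as np

    rng = np.random.default_rng(0)
    target_embedding = rng.normal(size=(4, 8))     # stand-in for the fixed "constant" network's output
    predicted_embedding = rng.normal(size=(4, 8))  # stand-in for the trained "predictor" network's output

    # per-state novelty: mean squared prediction error, as in example #22's prediction_error
    novelty = np.mean((target_embedding - predicted_embedding) ** 2, axis=1)
    print(novelty.shape)  # (4,) -- one score per state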
Code example #3
    def improve_reward_model(self, epochs: int):
        """
        Train a reward model to be used by the doubly-robust estimator

        :param epochs: The total number of epochs to use for training a reward model
        :return: None
        """
        batch_size = self.ap.network_wrappers['reward_model'].batch_size
        network_keys = self.ap.network_wrappers['reward_model'].input_embedders_parameters.keys()

        # this is fitted from the training dataset
        for epoch in range(epochs):
            loss = 0
            for i, batch in enumerate(self.call_memory('get_shuffled_data_generator', batch_size)):
                batch = Batch(batch)
                current_rewards_prediction_for_all_actions = self.networks['reward_model'].online_network.predict(batch.states(network_keys))
                current_rewards_prediction_for_all_actions[range(batch_size), batch.actions()] = batch.rewards()
                loss += self.networks['reward_model'].train_and_sync_networks(
                    batch.states(network_keys), current_rewards_prediction_for_all_actions)[0]
            # print(self.networks['reward_model'].online_network.predict(batch.states(network_keys))[0])

            log = OrderedDict()
            log['Epoch'] = epoch
            log['loss'] = loss / int(self.call_memory('num_transitions_in_complete_episodes') / batch_size)
            screen.log_dict(log, prefix='Training Reward Model')
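The target construction in examples #3 and #6 overwrites only the taken action's predicted reward with the observed reward, so training toward this target produces an error (and gradient) for that action alone. A self-contained NumPy sketch of the indexing trick, with illustrative values:

    import numpy as np

    predictions = np.array([[0.1, 0.4],
                            [0.3, 0.2]])   # predicted reward per action, shape (batch, num_actions)
    actions = np.array([1, 0])             # actions actually taken
    rewards = np.array([1.0, -1.0])        # rewards actually observed

    targets = predictions.copy()
    targets[np.arange(len(actions)), actions] = rewards
    # only the taken action's entry differs from the prediction
    print(targets)  # [[ 0.1  1. ]
                    #  [-1.   0.2]]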
Code example #4
    def train(self):
        if self._should_train():
            for network in self.networks.values():
                network.set_is_training(True)

            dataset = self.memory.transitions
            dataset = self.pre_network_filter.filter(dataset, deep_copy=False)
            batch = Batch(dataset)

            for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
                self.networks['main'].sync()
                self.fill_advantages(batch)

                # take only the requested number of steps
                if isinstance(self.ap.algorithm.num_consecutive_playing_steps, EnvironmentSteps):
                    dataset = dataset[:self.ap.algorithm.num_consecutive_playing_steps.num_steps]
                shuffle(dataset)
                batch = Batch(dataset)

                self.train_network(batch, self.ap.algorithm.optimization_epochs)

            for network in self.networks.values():
                network.set_is_training(False)

            self.post_training_commands()
            self.training_iteration += 1
            # should be done in order to update the data that has been accumulated * while not playing *
            self.update_log()
            return None
Code example #5
    def train_value_network(self, dataset, epochs):
        loss = []
        batch = Batch(dataset)
        network_keys = self.ap.network_wrappers[
            'critic'].input_embedders_parameters.keys()

        # * Found not to have any impact *
        # add a timestep to the observation
        # current_states_with_timestep = self.concat_state_and_timestep(dataset)

        mix_fraction = self.ap.algorithm.value_targets_mix_fraction
        total_returns = batch.n_step_discounted_rewards(True)
        for j in range(epochs):
            curr_batch_size = batch.size
            if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
                curr_batch_size = self.ap.network_wrappers['critic'].batch_size
            for i in range(batch.size // curr_batch_size):
                # split to batches for first order optimization techniques
                current_states_batch = {
                    k: v[i * curr_batch_size:(i + 1) * curr_batch_size]
                    for k, v in batch.states(network_keys).items()
                }
                total_return_batch = total_returns[i *
                                                   curr_batch_size:(i + 1) *
                                                   curr_batch_size]
                old_policy_values = force_list(
                    self.networks['critic'].target_network.predict(
                        current_states_batch).squeeze())
                if self.networks[
                        'critic'].online_network.optimizer_type != 'LBFGS':
                    targets = total_return_batch
                else:
                    current_values = self.networks[
                        'critic'].online_network.predict(current_states_batch)
                    targets = current_values * (
                        1 - mix_fraction) + total_return_batch * mix_fraction

                inputs = copy.copy(current_states_batch)
                for input_index, input in enumerate(old_policy_values):
                    name = 'output_0_{}'.format(input_index)
                    if name in self.networks['critic'].online_network.inputs:
                        inputs[name] = input

                value_loss = self.networks[
                    'critic'].online_network.accumulate_gradients(
                        inputs, targets)

                self.networks['critic'].apply_gradients_to_online_network()
                if isinstance(self.ap.task_parameters,
                              DistributedTaskParameters):
                    self.networks['critic'].apply_gradients_to_global_network()
                self.networks[
                    'critic'].online_network.reset_accumulated_gradients()

                loss.append([value_loss[0]])
        loss = np.mean(loss, 0)
        return loss
Code example #6
    def get_reward_model_loss(self, batch: Batch):
        network_keys = self.ap.network_wrappers[
            'reward_model'].input_embedders_parameters.keys()
        current_rewards_prediction_for_all_actions = self.networks[
            'reward_model'].online_network.predict(batch.states(network_keys))
        current_rewards_prediction_for_all_actions[
            range(batch.size), batch.actions()] = batch.rewards()

        return self.networks['reward_model'].train_and_sync_networks(
            batch.states(network_keys),
            current_rewards_prediction_for_all_actions)[0]
    def gather_static_shared_stats(
            self, evaluation_dataset_as_transitions: List[Transition],
            batch_size: int, reward_model: Architecture,
            network_keys: List) -> None:
        all_reward_model_rewards = []
        all_old_policy_probs = []
        all_rewards = []
        all_actions = []

        for i in range(
                math.ceil(len(evaluation_dataset_as_transitions) /
                          batch_size)):
            batch = evaluation_dataset_as_transitions[i * batch_size:(i + 1) *
                                                      batch_size]
            batch_for_inference = Batch(batch)

            all_reward_model_rewards.append(
                reward_model.predict(batch_for_inference.states(network_keys)))
            all_rewards.append(batch_for_inference.rewards())
            all_actions.append(batch_for_inference.actions())
            all_old_policy_probs.append(
                batch_for_inference.info('all_action_probabilities')[
                    range(len(batch_for_inference.actions())),
                    batch_for_inference.actions()])

        self.all_reward_model_rewards = np.concatenate(
            all_reward_model_rewards, axis=0)
        self.all_old_policy_probs = np.concatenate(all_old_policy_probs,
                                                   axis=0)
        self.all_rewards = np.concatenate(all_rewards, axis=0)
        self.all_actions = np.concatenate(all_actions, axis=0)

        # mark that static shared data was collected and ready to be used
        self.is_gathered_static_shared_data = True
Code example #8
    def train_off_policy(self):
        loss = 0

        # TODO: this should be network dependent!
        network_parameters = list(self.ap.network_wrappers.values())[0]

        # update counters
        self.training_iteration += 1

        # sample a batch and train on it
        batch = self.call_memory('sample', network_parameters.batch_size)
        if self.pre_network_filter is not None:
            batch = self.pre_network_filter.filter(batch,
                                                   update_internal_state=False,
                                                   deep_copy=False)

        # if the batch returned empty then there are not enough samples in the replay buffer -> skip
        # training step
        if len(batch) > 0:
            # train
            batch = Batch(batch)
            total_loss, losses, unclipped_grads = self.learn_from_batch_off_policy(
                batch)
            loss += total_loss
            self.unclipped_grads.add_sample(unclipped_grads)
            self.loss.add_sample(loss)

        return loss
Code example #9
    def train(self):
        episode = self.current_episode_buffer

        # check if we should calculate gradients or skip
        num_steps_passed_since_last_update = episode.length() - self.last_gradient_update_step_idx
        is_t_max_steps_passed = num_steps_passed_since_last_update >= self.ap.algorithm.num_steps_between_gradient_updates
        if not (is_t_max_steps_passed or episode.is_complete):
            return 0

        total_loss = 0
        if num_steps_passed_since_last_update > 0:
            for network in self.networks.values():
                network.set_is_training(True)

            # we need to update the returns of the episode until now
            episode.update_returns()

            # get t_max transitions, or fewer if we got to a terminal state
            # will be used for both actor-critic and vanilla PG.
            # In order to get full episodes, Vanilla PG will set the end_idx to a very big value.
            transitions = episode[self.last_gradient_update_step_idx:]
            batch = Batch(transitions)

            # move the pointer for the last update step
            if episode.is_complete:
                self.last_gradient_update_step_idx = 0
            else:
                self.last_gradient_update_step_idx = episode.length()

            # update the statistics for the variance reduction techniques
            if self.policy_gradient_rescaler in \
                    [PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE,
                     PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP]:
                self.update_episode_statistics(episode)

            # accumulate the gradients
            total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
            print(total_loss, losses, unclipped_grads)

            # apply the gradients once in every apply_gradients_every_x_episodes episodes
            if self.current_episode % self.ap.algorithm.apply_gradients_every_x_episodes == 0:
                for network in self.networks.values():
                    network.apply_gradients_and_sync_networks()
            self.training_iteration += 1

            for network in self.networks.values():
                network.set_is_training(False)

            # run additional commands after the training is done
            self.post_training_commands()

        return total_loss
Code example #10
    def train(self):
        episode = self.get_current_episode()

        # check if we should calculate gradients or skip
        episode_ended = episode.is_complete
        num_steps_passed_since_last_update = episode.length() - self.last_gradient_update_step_idx
        is_t_max_steps_passed = num_steps_passed_since_last_update >= self.ap.algorithm.num_steps_between_gradient_updates
        if not (is_t_max_steps_passed or episode_ended):
            return 0

        total_loss = 0
        if num_steps_passed_since_last_update > 0:

            # we need to update the returns of the episode until now
            episode.update_returns()

            # get t_max transitions, or fewer if we got to a terminal state
            # will be used for both actor-critic and vanilla PG.
            # In order to get full episodes, Vanilla PG will set the end_idx to a very big value.
            transitions = []
            start_idx = self.last_gradient_update_step_idx
            end_idx = episode.length()

            for idx in range(start_idx, end_idx):
                transitions.append(episode.get_transition(idx))
            self.last_gradient_update_step_idx = end_idx

            # update the statistics for the variance reduction techniques
            if self.policy_gradient_rescaler in \
                    [PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE,
                     PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP]:
                self.update_episode_statistics(episode)

            # accumulate the gradients and apply them once in every apply_gradients_every_x_episodes episodes
            batch = Batch(transitions)
            total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
            if self.current_episode % self.ap.algorithm.apply_gradients_every_x_episodes == 0:
                for network in self.networks.values():
                    network.apply_gradients_and_sync_networks()
            self.training_iteration += 1

        # move the pointer to the next episode start and discard the episode.
        if episode_ended:
            # we need to remove the episode, because the next training iteration will be called before storing any
            # additional transitions in the memory (we don't store a transition for the first call to observe), so the
            # length of the memory won't be enforced and the old episode won't be removed
            self.call_memory('remove_episode', 0)
            self.last_gradient_update_step_idx = 0

        return total_loss
Code example #11
    def generate_goal(self):
        if self.memory.num_transitions() == 0:
            return

        transitions = list(np.random.choice(self.memory.transitions,
                                            min(self.ap.algorithm.rnd_sample_size,
                                                self.memory.num_transitions()),
                                            replace=False))
        dataset = Batch(transitions)
        batch_size = self.ap.algorithm.rnd_batch_size
        self.goal = dataset[0]

        max_novelty = 0
        for i in range(int(dataset.size / batch_size)):
            start = i * batch_size
            end = (i + 1) * batch_size

            novelty = self.calculate_novelty(Batch(dataset[start:end]))

            curr_max = np.max(novelty)
            if curr_max > max_novelty:
                max_novelty = curr_max
                idx = start + np.argmax(novelty)
                self.goal = dataset[idx]
    def learn_from_batch(self, batch):
        # perform on-policy training iteration
        total_loss, losses, unclipped_grads = self._learn_from_batch(batch)

        if self.ap.algorithm.ratio_of_replay > 0 \
                and self.memory.num_transitions() > self.ap.algorithm.num_transitions_to_start_replay:
            n = np.random.poisson(self.ap.algorithm.ratio_of_replay)
            # perform n off-policy training iterations
            for _ in range(n):
                new_batch = Batch(self.call_memory('sample', (self.ap.algorithm.num_steps_between_gradient_updates, True)))
                result = self._learn_from_batch(new_batch)
                total_loss += result[0]
                losses += result[1]
                unclipped_grads += result[2]

        return total_loss, losses, unclipped_grads
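The replay schedule at the end of example #11 draws the number of off-policy iterations from a Poisson distribution, so on average ratio_of_replay replay updates are performed per on-policy update (an ACER-style schedule). A quick sketch of that expected behaviour, with an illustrative ratio:

    import numpy as np

    ratio_of_replay = 4
    draws = np.random.poisson(ratio_of_replay, size=100_000)
    print(draws.mean())  # ~4.0: on average, ratio_of_replay off-policy iterations per on-policy one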
Code example #13
    def fill_advantages(self, batch):
        batch = Batch(batch)
        network_keys = self.ap.network_wrappers[
            'critic'].input_embedders_parameters.keys()

        # * Found not to have any impact *
        # current_states_with_timestep = self.concat_state_and_timestep(batch)

        current_state_values = self.networks['critic'].online_network.predict(
            batch.states(network_keys)).squeeze()
        total_returns = batch.n_step_discounted_rewards()
        # calculate advantages
        advantages = []
        if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
            advantages = total_returns - current_state_values
        elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
            # get bootstraps
            episode_start_idx = 0
            advantages = np.array([])
            # current_state_values[batch.game_overs()] = 0
            for idx, game_over in enumerate(batch.game_overs()):
                if game_over:
                    # get advantages for the rollout
                    value_bootstrapping = np.zeros((1, ))
                    rollout_state_values = np.append(
                        current_state_values[episode_start_idx:idx + 1],
                        value_bootstrapping)

                    rollout_advantages, _ = \
                        self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
                                                                     rollout_state_values)
                    episode_start_idx = idx + 1
                    advantages = np.append(advantages, rollout_advantages)
        else:
            screen.warning(
                "WARNING: The requested policy gradient rescaler is not available"
            )

        # standardize
        advantages = (advantages - np.mean(advantages)) / np.std(advantages)

        # TODO: this will be problematic with a shared memory
        for transition, advantage in zip(self.memory.transitions, advantages):
            transition.info['advantage'] = advantage

        self.action_advantages.add_sample(advantages)
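The GAE branch in example #13 delegates to get_general_advantage_estimation_values. For reference, here is a textbook sketch of that computation for a single rollout (not Coach's implementation); gamma and lam are illustrative defaults, and state_values is expected to carry one extra bootstrap entry at the end, matching the value_bootstrapping append above:

    import numpy as np

    def gae(rewards, state_values, gamma=0.99, lam=0.95):
        # state_values has length len(rewards) + 1 (last entry is the bootstrap value)
        deltas = rewards + gamma * state_values[1:] - state_values[:-1]
        advantages = np.zeros_like(deltas)
        running = 0.0
        for t in reversed(range(len(deltas))):
            running = deltas[t] + gamma * lam * running
            advantages[t] = running
        return advantages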
Code example #14
    def handle_self_supervised_reward(self, batch):
        batch_size = self.ap.network_wrappers['actor'].batch_size
        episode_indices = np.random.randint(self.memory.num_complete_episodes(), size=batch_size)
        transitions = []
        for e_idx in episode_indices:
            episode = self.memory.get_all_complete_episodes()[e_idx]
            transition_idx = np.random.randint(episode.length())
            t = copy.copy(episode[transition_idx])
            if np.random.rand(1) < self.ap.algorithm.identity_goal_sample_rate:
                t.state[self.ap.algorithm.agent_obs_key] = self.concat_goal(t.state, t.state)
                # this doesn't matter for learning but is set anyway so that the agent can pass it through the network
                t.next_state[self.ap.algorithm.agent_obs_key] = self.concat_goal(t.next_state, t.state)
                t.game_over = True
                t.reward = 0
                t.action = np.zeros_like(t.action)
            else:
                if transition_idx == episode.length() - 1:
                    goal = t
                    t.state[self.ap.algorithm.agent_obs_key] = self.concat_goal(t.state, t.next_state)
                    t.next_state[self.ap.algorithm.agent_obs_key] = self.concat_goal(t.next_state, t.next_state)
                else:
                    goal_idx = np.random.randint(transition_idx, episode.length())
                    goal = episode.transitions[goal_idx]
                    t.state[self.ap.algorithm.agent_obs_key] = self.concat_goal(t.state, episode.transitions[goal_idx].next_state)
                    t.next_state[self.ap.algorithm.agent_obs_key] = self.concat_goal(t.next_state,
                                                                        episode.transitions[goal_idx].next_state)

                camera_equal = np.alltrue(np.equal(t.next_state[self.ap.algorithm.env_obs_key],
                                                   goal.next_state[self.ap.algorithm.env_obs_key]))
                measurements_equal = np.alltrue(np.isclose(t.next_state['measurements'],
                                                           goal.next_state['measurements']))
                t.game_over = camera_equal and measurements_equal
                t.reward = -1

            transitions.append(t)

        return Batch(transitions)
Code example #15
    def improve_reward_model(self, epochs: int):
        """
        Train a reward model to be used by the doubly-robust estimator

        :param epochs: The total number of epochs to use for training a reward model
        :return: None
        """
        batch_size = self.ap.network_wrappers['reward_model'].batch_size

        # this is fitted from the training dataset
        for epoch in range(epochs):
            loss = 0
            total_transitions_processed = 0
            for i, batch in enumerate(
                    self.call_memory('get_shuffled_training_data_generator',
                                     batch_size)):
                batch = Batch(batch)
                loss += self.get_reward_model_loss(batch)
                total_transitions_processed += batch.size

            log = OrderedDict()
            log['Epoch'] = epoch
            log['loss'] = loss / total_transitions_processed
            screen.log_dict(log, prefix='Training Reward Model')
Code example #16
    def train_multiagent(
            self,
            agents):  # we override train() to handle the multi-agent case
        # return Agent.train(self)
        loss = 0
        if self._should_train():
            if self.ap.is_batch_rl_training:
                # when training an agent for generating a dataset in batch-rl, we don't want it to be counted as part of
                # the training epochs. we only care for training epochs in batch-rl anyway.
                self.training_epoch += 1
            for network in self.networks.values():
                network.set_is_training(True)

            # At the moment we only support a single batch size for all the networks
            networks_parameters = list(self.ap.network_wrappers.values())
            assert all(net.batch_size == networks_parameters[0].batch_size
                       for net in networks_parameters)

            batch_size = networks_parameters[0].batch_size

            # get prepared for sample_with_index
            transitions_idx = np.random.randint(
                self.num_transitions_in_complete_episodes(), size=batch_size)

            # get prepared for get_shuffled_training_data_generator_with_index
            # we assume that all agents have the same get_last_training_set_transition_id
            shuffled_transition_indices = list(
                range(self.memory.get_last_training_set_transition_id()))
            random.shuffle(shuffled_transition_indices)

            # we either go sequentially through the entire replay buffer in the batch RL mode,
            # or sample randomly for the basic RL case.
            training_schedules = []
            for i in range(self.n):
                if self.ap.is_batch_rl_training:
                    training_schedules.append(agents[i].call_memory(
                        'get_shuffled_training_data_generator_with_index',
                        batch_size, shuffled_transition_indices))
                else:
                    training_schedules.append([
                        agents[i].call_memory('sample_with_index',
                                              transitions_idx) for _ in
                        range(self.ap.algorithm.num_consecutive_training_steps)
                    ])

            training_schedule = training_schedules[
                self.agent_index]  # get its own training_schedule
            # tmp_obs = np.array([])
            # tmp_act = np.array([])
            # tmp_next_obs = np.array([])
            # tmp_next_act = np.array([])
            # for i in range(self.n):
            #     actor_i = agents[i].networks['actor'+str(i)]
            #     tmp_next_act_all = actor_i.parallel_prediction(
            #         [(actor_i.online_network, training_schedules[i].states('observation'))])
            #     for tmp_batch in training_schedules[i]:
            #         tmp_obs = np.concatenate((tmp_obs, tmp_batch.state['observation']), axis=0) if tmp_obs.size else tmp_batch.state['observation']
            #         tmp_act = np.concatenate((tmp_act, tmp_batch.action), axis=0) if tmp_act.size else tmp_batch.action
            #         tmp_next_obs = np.concatenate((tmp_next_obs, tmp_batch.state['observation']), axis=0) if tmp_next_obs.size else tmp_batch.state['observation']
            #         tmp_next_act = np.concatenate((tmp_next_act, tmp_batch.state['observation']), axis=0) if tmp_next_obs.size else tmp_batch.state['observation']

            tmp_curr_mean_act_all = []
            tmp_next_act_all = []
            for i in range(self.n):
                actor_i = agents[i].networks['actor' + str(i)]
                actor_keys = agents[i].ap.network_wrappers[
                    'actor' + str(i)].input_embedders_parameters.keys()
                tmp_curr_mean_act_all_i, tmp_next_act_all_i = actor_i.parallel_prediction(
                    [(actor_i.online_network,
                      training_schedules[i].states(actor_keys)),
                     (actor_i.target_network,
                      training_schedules[i].next_states(actor_keys))])
                tmp_curr_mean_act_all.append(tmp_curr_mean_act_all_i)
                tmp_next_act_all.append(tmp_next_act_all_i)

            # update the training_schedule of the current agent
            for t in range(len(training_schedule)):
                tmp_obs = np.array([])
                tmp_act = np.array([])
                tmp_curr_mean_act = np.array([])
                tmp_next_obs = np.array([])
                tmp_next_act = np.array([])
                for i in range(self.n):
                    # for tmp_batch in training_schedules[i]:
                    tmp_batch = training_schedules[i][t]
                    tmp_obs = np.concatenate((tmp_obs, tmp_batch.state['observation']), axis=0) if tmp_obs.size else \
                    tmp_batch.state['observation']
                    tmp_act = np.concatenate(
                        (tmp_act, tmp_batch.action),
                        axis=0) if tmp_act.size else tmp_batch.action
                    tmp_curr_mean_act = np.concatenate(
                        (tmp_curr_mean_act, tmp_curr_mean_act_all[i][t]),
                        axis=0
                    ) if tmp_next_obs.size else tmp_curr_mean_act_all[i][t]
                    tmp_next_obs = np.concatenate(
                        (tmp_next_obs, tmp_batch.state['observation']), axis=0
                    ) if tmp_next_obs.size else tmp_batch.state['observation']
                    tmp_next_act = np.concatenate(
                        (tmp_next_act, tmp_next_act_all[i][t]), axis=0
                    ) if tmp_next_obs.size else tmp_next_act_all[i][t]

                # note that the difference between action_n and mean_action_n is that the former is from the batch data (off-policy); while the latter comes from the current network
                training_schedule[t].state['observation_n'] = tmp_obs
                training_schedule[t].state['action_n'] = tmp_act
                training_schedule[t].state['mean_action_n'] = tmp_curr_mean_act
                # training_schedule[t].action = tmp_act
                # we include both the joint observation and joint action in the "next_state"
                training_schedule[t].next_state['observation_n'] = tmp_next_obs
                training_schedule[t].next_state['action_n'] = tmp_next_act
                # new_info = {'action': tmp_act}
                # training_schedule[t].update_info(new_info)

            for batch in training_schedule:
                # update counters
                self.training_iteration += 1
                if self.pre_network_filter is not None:
                    batch = self.pre_network_filter.filter(
                        batch, update_internal_state=False, deep_copy=False)

                # if the batch returned empty then there are not enough samples in the replay buffer -> skip
                # training step
                if len(batch) > 0:
                    # train
                    batch = Batch(batch)
                    total_loss, losses, unclipped_grads = self.learn_from_batch(
                        batch)
                    loss += total_loss

                    self.unclipped_grads.add_sample(unclipped_grads)

                    # TODO: this only deals with the main network (if exists), need to do the same for other networks
                    #  for instance, for DDPG, the LR signal is currently not shown. Probably should be done through the
                    #  network directly instead of here
                    # decay learning rate
                    if 'main' in self.ap.network_wrappers and \
                            self.ap.network_wrappers['main'].learning_rate_decay_rate != 0:
                        self.curr_learning_rate.add_sample(
                            self.networks['main'].sess.run(
                                self.networks['main'].online_network.
                                current_learning_rate))
                    else:
                        self.curr_learning_rate.add_sample(
                            networks_parameters[0].learning_rate)

                    if any([network.has_target for network in self.networks.values()]) \
                            and self._should_update_online_weights_to_target():
                        for network in self.networks.values():
                            network.update_target_network(
                                self.ap.algorithm.
                                rate_for_copying_weights_to_target)

                        self.agent_logger.create_signal_value(
                            'Update Target Network', 1)
                    else:
                        self.agent_logger.create_signal_value(
                            'Update Target Network', 0, overwrite=False)

                    self.loss.add_sample(loss)

                    if self.imitation:
                        self.log_to_screen()

            if self.ap.visualization.dump_csv and \
                    self.parent_level_manager.parent_graph_manager.time_metric == TimeTypes.Epoch:
                # in BatchRL, or imitation learning, the agent never acts, so we have to get the stats out here.
                # we dump the data out every epoch
                self.update_log()

            for network in self.networks.values():
                network.set_is_training(False)

            # run additional commands after the training is done
            self.post_training_commands()

        return loss
Code example #17
    def train_policy_network(self, dataset, epochs):
        loss = []
        for j in range(epochs):
            loss = {
                'total_loss': [],
                'policy_losses': [],
                'unclipped_grads': [],
                'fetch_result': []
            }
            #shuffle(dataset)
            for i in range(
                    len(dataset) //
                    self.ap.network_wrappers['actor'].batch_size):
                batch = Batch(
                    dataset[i *
                            self.ap.network_wrappers['actor'].batch_size:(i +
                                                                          1) *
                            self.ap.network_wrappers['actor'].batch_size])

                network_keys = self.ap.network_wrappers[
                    'actor'].input_embedders_parameters.keys()

                advantages = batch.info('advantage')
                actions = batch.actions()
                if not isinstance(self.spaces.action,
                                  DiscreteActionSpace) and len(
                                      actions.shape) == 1:
                    actions = np.expand_dims(actions, -1)

                # get old policy probabilities and distribution
                old_policy = force_list(
                    self.networks['actor'].target_network.predict(
                        batch.states(network_keys)))

                # calculate gradients and apply on both the local policy network and on the global policy network
                fetches = [
                    self.networks['actor'].online_network.output_heads[0].
                    kl_divergence, self.networks['actor'].online_network.
                    output_heads[0].entropy
                ]

                inputs = copy.copy(batch.states(network_keys))
                inputs['output_0_0'] = actions

                # old_policy_distribution needs to be represented as a list, because in the event of discrete controls,
                # it has just a mean. otherwise, it has both a mean and standard deviation
                for input_index, input in enumerate(old_policy):
                    inputs['output_0_{}'.format(input_index + 1)] = input

                total_loss, policy_losses, unclipped_grads, fetch_result =\
                    self.networks['actor'].online_network.accumulate_gradients(
                        inputs, [advantages], additional_fetches=fetches)

                self.networks['actor'].apply_gradients_to_online_network()
                if isinstance(self.ap.task_parameters,
                              DistributedTaskParameters):
                    self.networks['actor'].apply_gradients_to_global_network()

                self.networks[
                    'actor'].online_network.reset_accumulated_gradients()

                loss['total_loss'].append(total_loss)
                loss['policy_losses'].append(policy_losses)
                loss['unclipped_grads'].append(unclipped_grads)
                loss['fetch_result'].append(fetch_result)

                self.unclipped_grads.add_sample(unclipped_grads)

            for key in loss.keys():
                loss[key] = np.mean(loss[key], 0)

            if self.ap.network_wrappers['critic'].learning_rate_decay_rate != 0:
                curr_learning_rate = self.networks[
                    'critic'].online_network.get_variable_value(
                        self.ap.learning_rate)
                self.curr_learning_rate.add_sample(curr_learning_rate)
            else:
                curr_learning_rate = self.ap.network_wrappers[
                    'critic'].learning_rate

            # log training parameters
            screen.log_dict(OrderedDict([
                ("Surrogate loss", loss['policy_losses'][0]),
                ("KL divergence", loss['fetch_result'][0]),
                ("Entropy", loss['fetch_result'][1]), ("training epoch", j),
                ("learning_rate", curr_learning_rate)
            ]),
                            prefix="Policy training")

        self.total_kl_divergence_during_training_process = loss[
            'fetch_result'][0]
        self.entropy.add_sample(loss['fetch_result'][1])
        self.kl_divergence.add_sample(loss['fetch_result'][0])
        return loss['total_loss']
Code example #18
    def handle_episode_ended(self) -> None:
        super().handle_episode_ended()
        novelty = self.calculate_novelty(Batch(self.memory.get_last_complete_episode().transitions))
        self.rnd_stats.push_val(np.expand_dims(self.update_intrinsic_returns_estimate(novelty), -1))
Code example #19
File: ddqn_bcq_agent.py Project: samstan/coach
    def improve_reward_model(self, epochs: int):
        """
        Train both a reward model to be used by the doubly-robust estimator, and some model to be used for BCQ

        :param epochs: The total number of epochs to use for training a reward model
        :return: None
        """

        # we'll be assuming that these get drawn from the reward model parameters
        batch_size = self.ap.network_wrappers['reward_model'].batch_size
        network_keys = self.ap.network_wrappers['reward_model'].input_embedders_parameters.keys()

        # if using a NN to decide which actions to drop, we'll train the NN here
        if isinstance(self.ap.algorithm.action_drop_method_parameters, NNImitationModelParameters):
            total_epochs = max(epochs, self.ap.algorithm.action_drop_method_parameters.imitation_model_num_epochs)
        else:
            total_epochs = epochs

        for epoch in range(total_epochs):
            # this is fitted from the training dataset
            reward_model_loss = 0
            imitation_model_loss = 0
            total_transitions_processed = 0
            for i, batch in enumerate(self.call_memory('get_shuffled_training_data_generator', batch_size)):
                batch = Batch(batch)

                # reward model
                if epoch < epochs:
                    reward_model_loss += self.get_reward_model_loss(batch)

                # imitation model
                if isinstance(self.ap.algorithm.action_drop_method_parameters, NNImitationModelParameters) and \
                        epoch < self.ap.algorithm.action_drop_method_parameters.imitation_model_num_epochs:
                    target_actions = np.zeros((batch.size, len(self.spaces.action.actions)))
                    target_actions[range(batch.size), batch.actions()] = 1
                    imitation_model_loss += self.networks['imitation_model'].train_and_sync_networks(
                        batch.states(network_keys), target_actions)[0]

                total_transitions_processed += batch.size

            log = OrderedDict()
            log['Epoch'] = epoch

            if reward_model_loss:
                log['Reward Model Loss'] = reward_model_loss / total_transitions_processed
            if imitation_model_loss:
                log['Imitation Model Loss'] = imitation_model_loss / total_transitions_processed

            screen.log_dict(log, prefix='Training Batch RL Models')

        # if using a kNN based model, we'll initialize and build it here.
        # initialization cannot be moved to the constructor as we don't have the agent's spaces initialized yet.
        if isinstance(self.ap.algorithm.action_drop_method_parameters, KNNParameters):
            knn_size = self.ap.algorithm.action_drop_method_parameters.knn_size
            if self.ap.algorithm.action_drop_method_parameters.use_state_embedding_instead_of_state:
                self.knn_trees = [AnnoyDictionary(
                    dict_size=knn_size,
                    key_width=int(self.networks['reward_model'].online_network.state_embedding.shape[-1]),
                    batch_size=knn_size)
                    for _ in range(len(self.spaces.action.actions))]
            else:
                self.knn_trees = [AnnoyDictionary(
                    dict_size=knn_size,
                    key_width=self.spaces.state['observation'].shape[0],
                    batch_size=knn_size)
                    for _ in range(len(self.spaces.action.actions))]

            for i, knn_tree in enumerate(self.knn_trees):
                state_embeddings = self.embedding([transition.state for transition in self.memory.transitions
                                if transition.action == i])
                knn_tree.add(
                    keys=state_embeddings,
                    values=np.expand_dims(np.zeros(state_embeddings.shape[0]), axis=1))

            for knn_tree in self.knn_trees:
                knn_tree._rebuild_index()

            self.average_dist = [[dist[0] for dist in knn_tree._get_k_nearest_neighbors_indices(
                keys=self.embedding([transition.state for transition in self.memory.transitions]),
                k=1)[0]] for knn_tree in self.knn_trees]
            self.average_dist = sum([x for l in self.average_dist for x in l])  # flatten and sum
            self.average_dist /= len(self.memory.transitions)
Code example #20
File: agent.py Project: mdavala/coach
    def train(self):
        """
        Check if a training phase should be done as configured by num_consecutive_playing_steps.
        If it should, then do several training steps as configured by num_consecutive_training_steps.
        A single training iteration: Sample a batch, train on it and update target networks.
        :return: The total training loss during the training iterations.
        """
        loss = 0
        if self._should_train():
            for training_step in range(
                    self.ap.algorithm.num_consecutive_training_steps):
                # TODO: this should be network dependent
                network_parameters = list(self.ap.network_wrappers.values())[0]

                # update counters
                self.training_iteration += 1

                # sample a batch and train on it
                batch = self.call_memory('sample',
                                         network_parameters.batch_size)
                if self.pre_network_filter is not None:
                    batch = self.pre_network_filter.filter(
                        batch, update_internal_state=False, deep_copy=False)

                # if the batch returned empty then there are not enough samples in the replay buffer -> skip
                # training step
                if len(batch) > 0:
                    # train
                    batch = Batch(batch)
                    total_loss, losses, unclipped_grads = self.learn_from_batch(
                        batch)
                    loss += total_loss
                    self.unclipped_grads.add_sample(unclipped_grads)

                    # TODO: the learning rate decay should be done through the network instead of here
                    # decay learning rate
                    if network_parameters.learning_rate_decay_rate != 0:
                        self.curr_learning_rate.add_sample(
                            self.networks['main'].sess.run(
                                self.networks['main'].online_network.
                                current_learning_rate))
                    else:
                        self.curr_learning_rate.add_sample(
                            network_parameters.learning_rate)

                    if any([network.has_target for network in self.networks.values()]) \
                            and self._should_update_online_weights_to_target():
                        for network in self.networks.values():
                            network.update_target_network(
                                self.ap.algorithm.
                                rate_for_copying_weights_to_target)

                        self.agent_logger.create_signal_value(
                            'Update Target Network', 1)
                    else:
                        self.agent_logger.create_signal_value(
                            'Update Target Network', 0, overwrite=False)

                    self.loss.add_sample(loss)

                    if self.imitation:
                        self.log_to_screen()

            # run additional commands after the training is done
            self.post_training_commands()

        return loss
Code example #21
File: ope_manager.py Project: sarikayamehmet/coach
    def _prepare_ope_shared_stats(dataset_as_transitions: List[Transition],
                                  batch_size: int, reward_model: Architecture,
                                  q_network: Architecture,
                                  network_keys: List) -> OpeSharedStats:
        """
        Do the preparations needed for different estimators.
        Some of the calculations are shared, so we centralize all the work here.

        :param dataset_as_transitions: The evaluation dataset in the form of transitions.
        :param batch_size: The batch size to use.
        :param reward_model: A reward model to be used by DR
        :param q_network: The Q network whose policy we evaluate.
        :param network_keys: The network keys used for feeding the neural networks.
        :return:
        """
        # IPS
        all_reward_model_rewards, all_policy_probs, all_old_policy_probs = [], [], []
        all_v_values_reward_model_based, all_v_values_q_model_based, all_rewards, all_actions = [], [], [], []

        for i in range(math.ceil(len(dataset_as_transitions) / batch_size)):
            batch = dataset_as_transitions[i * batch_size:(i + 1) * batch_size]
            batch_for_inference = Batch(batch)

            all_reward_model_rewards.append(
                reward_model.predict(batch_for_inference.states(network_keys)))

            # we always use the first Q head to calculate OPEs. might want to change this in the future.
            # for instance, this means that for bootstrapped we always use the first QHead to calculate the OPEs.
            q_values, sm_values = q_network.predict(
                batch_for_inference.states(network_keys),
                outputs=[
                    q_network.output_heads[0].q_values,
                    q_network.output_heads[0].softmax
                ])

            all_policy_probs.append(sm_values)
            all_v_values_reward_model_based.append(
                np.sum(all_policy_probs[-1] * all_reward_model_rewards[-1],
                       axis=1))
            all_v_values_q_model_based.append(
                np.sum(all_policy_probs[-1] * q_values, axis=1))
            all_rewards.append(batch_for_inference.rewards())
            all_actions.append(batch_for_inference.actions())
            all_old_policy_probs.append(
                batch_for_inference.info('all_action_probabilities')[
                    range(len(batch_for_inference.actions())),
                    batch_for_inference.actions()])

            for j, t in enumerate(batch):
                t.update_info({
                    'q_value':
                    q_values[j],
                    'softmax_policy_prob':
                    all_policy_probs[-1][j],
                    'v_value_q_model_based':
                    all_v_values_q_model_based[-1][j],
                })

        all_reward_model_rewards = np.concatenate(all_reward_model_rewards,
                                                  axis=0)
        all_policy_probs = np.concatenate(all_policy_probs, axis=0)
        all_v_values_reward_model_based = np.concatenate(
            all_v_values_reward_model_based, axis=0)
        all_rewards = np.concatenate(all_rewards, axis=0)
        all_actions = np.concatenate(all_actions, axis=0)
        all_old_policy_probs = np.concatenate(all_old_policy_probs, axis=0)

        # generate model probabilities
        new_policy_prob = all_policy_probs[np.arange(all_actions.shape[0]),
                                           all_actions]
        rho_all_dataset = new_policy_prob / all_old_policy_probs

        return OpeSharedStats(all_reward_model_rewards, all_policy_probs,
                              all_v_values_reward_model_based, all_rewards,
                              all_actions, all_old_policy_probs,
                              new_policy_prob, rho_all_dataset)
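The rho_all_dataset weights returned above are per-transition ratios between the evaluated policy's probability of the logged action and the behaviour policy's. The simplest estimator built from them is inverse propensity scoring (IPS), a weighted average of the logged rewards; the sketch below is the textbook one-step estimator, not code taken from Coach's OPE module:

    import numpy as np

    def ips_estimate(rho, rewards):
        # plain IPS: average of importance-weighted logged rewards
        return np.mean(rho * rewards)

    # e.g. with the arrays produced by _prepare_ope_shared_stats:
    # ips = ips_estimate(rho_all_dataset, all_rewards)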
Code example #22
File: ddqn_rnd_agent.py Project: shadiendrawis/coach
    def learn_from_batch(self, batch):
        network_keys = self.ap.network_wrappers[
            'main'].input_embedders_parameters.keys()

        dataset = copy.deepcopy(self.memory.transitions)
        dataset = Batch(dataset)
        dataset.shuffle()
        if self.num_steps % 1024 == 0:
            for i in range(
                    int(dataset.size /
                        self.ap.network_wrappers['predictor'].batch_size)):
                start = i * self.ap.network_wrappers['predictor'].batch_size
                end = (i +
                       1) * self.ap.network_wrappers['predictor'].batch_size

                const_embedding = self.networks[
                    'constant'].online_network.predict({
                        k: v[start:end]
                        for k, v in dataset.next_states(network_keys).items()
                    })

                _ = self.networks['predictor'].train_and_sync_networks(
                    copy.copy({
                        k: v[start:end]
                        for k, v in dataset.next_states(network_keys).items()
                    }), [const_embedding])

        embedding = self.networks['constant'].online_network.predict(
            batch.next_states(network_keys))
        prediction = self.networks['predictor'].online_network.predict(
            batch.next_states(network_keys))
        prediction_error = np.mean((embedding - prediction)**2, axis=1)
        # self.rewards += list(prediction_error)
        # intrinsic_rewards = (prediction_error - np.mean(prediction_error)) / (np.std(prediction_error) + 1e-15)
        intrinsic_rewards = np.zeros_like(prediction_error)
        intrinsic_rewards[np.argmax(prediction_error)] = 1

        selected_actions = np.argmax(
            self.networks['main'].online_network.predict(
                batch.next_states(network_keys)), 1)
        q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
            (self.networks['main'].target_network,
             batch.next_states(network_keys)),
            (self.networks['main'].online_network, batch.states(network_keys))
        ])

        # initialize with the current prediction so that we will
        #  only update the action that we have actually done in this transition
        TD_errors = []
        for i in range(self.ap.network_wrappers['main'].batch_size):
            new_target = intrinsic_rewards[i] + \
                         self.ap.algorithm.discount * q_st_plus_1[i][selected_actions[i]]
            TD_errors.append(
                np.abs(new_target - TD_targets[i, batch.actions()[i]]))
            TD_targets[i, batch.actions()[i]] = new_target

        # update errors in prioritized replay buffer
        importance_weights = self.update_transition_priorities_and_get_weights(
            TD_errors, batch)

        result = self.networks['main'].train_and_sync_networks(
            batch.states(network_keys),
            TD_targets,
            importance_weights=importance_weights)
        total_loss, losses, unclipped_grads = result[:3]

        return total_loss, losses, unclipped_grads