Example #1
    def __init__(self,
                 *args,
                 name="vpg_maml",
                 learning_rate=1e-3,
                 inner_type='likelihood_ratio',
                 exploration=False,
                 rollouts_per_meta_task=None,
                 e_maml_sum=False,
                 **kwargs):
        super(VPGMAML, self).__init__(*args, **kwargs)
        assert inner_type in ["log_likelihood", "likelihood_ratio"]

        self.optimizer = MAMLFirstOrderOptimizer(learning_rate=learning_rate)
        self.inner_type = inner_type
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'agent_infos'
        ]
        self.name = name

        self.rollouts_per_meta_task = rollouts_per_meta_task
        self.exploration = exploration
        if exploration:  # add adjusted average rewards to the optimization keys
            if not e_maml_sum:
                self.rollouts_per_meta_task = 1
            self._optimization_keys.append('adj_avg_rewards')

        self.build_graph()
Example #2
    def testSine(self):
        np.random.seed(65)
        for optimizer in [MAMLFirstOrderOptimizer()]:
            tf.reset_default_graph()
            with tf.Session():
                input_phs = tf.placeholder(dtype=tf.float32, shape=[None, 1])
                target_phs = tf.placeholder(dtype=tf.float32, shape=[None, 1])
                network = Mlp(input_phs, 1, hidden_size=(32, 32), name='sin')
                loss = tf.reduce_mean(tf.square(network.output - target_phs))
                input_ph_dict = OrderedDict({'x': input_phs, 'y': target_phs})
                optimizer.build_graph(loss, network, input_ph_dict)
                sess = tf.get_default_session()
                sess.run(tf.global_variables_initializer())

                for i in range(5000):
                    xs = np.random.normal(0, 3, (1000, 1))
                    ys = np.sin(xs)
                    inputs = {'x': xs, 'y': ys}
                    optimizer.optimize(inputs)
                    if i % 100 == 0:
                        print(optimizer.loss(inputs))

                xs = np.random.normal(0, 3, (100, 1))
                ys = np.sin(xs)
                y_pred = sess.run(
                    network.output,
                    feed_dict=dict(list(zip(input_ph_dict.values(),
                                            (xs, ys)))))
                self.assertLessEqual(np.mean((ys - y_pred)**2), 0.02)
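
Note: the test above exercises the optimizer's three-call API: build_graph(loss, target, input_ph_dict), repeated optimize(input_val_dict) steps, then loss(input_val_dict) for evaluation. Below is a minimal usage sketch of the same pattern on a trivial identity fit; it assumes the Mlp and MAMLFirstOrderOptimizer classes used in the test are importable from this repo (imports omitted, as in the snippets) and is not part of the repo itself.

def fit_identity_sketch():
    # Minimal sketch: same build_graph / optimize / loss pattern as testSine above,
    # fitting y = x with a small Mlp.
    tf.reset_default_graph()
    with tf.Session() as sess:
        x_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1])
        y_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1])
        net = Mlp(x_ph, 1, hidden_size=(16, 16), name='identity')
        loss = tf.reduce_mean(tf.square(net.output - y_ph))

        optimizer = MAMLFirstOrderOptimizer(learning_rate=1e-3)
        optimizer.build_graph(loss, net, OrderedDict({'x': x_ph, 'y': y_ph}))
        sess.run(tf.global_variables_initializer())

        for _ in range(1000):
            xs = np.random.uniform(-1, 1, (256, 1))
            optimizer.optimize({'x': xs, 'y': xs})
        print(optimizer.loss({'x': xs, 'y': xs}))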
Example #3
    def __init__(
            self,
            max_path_length,
            *args,
            name="dice_maml",
            learning_rate=1e-3,
            **kwargs
            ):
        super(DICEMAML, self).__init__(*args, **kwargs)

        self.optimizer = MAMLFirstOrderOptimizer(learning_rate=learning_rate)
        self.max_path_length = max_path_length
        self._optimization_keys = [
            'observations', 'actions', 'adjusted_rewards', 'mask',
            'agent_infos'
        ]
        self.name = name

        self.build_graph()
Example #4
    def __init__(self,
                 *args,
                 name="vpg_maml",
                 learning_rate=1e-3,
                 inner_type='likelihood_ratio',
                 exploration=False,
                 **kwargs):
        super(VPGMAML, self).__init__(*args, **kwargs)
        assert inner_type in ["log_likelihood", "likelihood_ratio"]

        self.optimizer = MAMLFirstOrderOptimizer(learning_rate=learning_rate)
        self.inner_type = inner_type
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'agent_infos'
        ]
        self.name = name
        self.exploration = exploration

        self.build_graph()
Example #5
    def __init__(self,
                 *args,
                 name="vpg",
                 learning_rate=1e-3,
                 inner_type='likelihood_ratio',
                 **kwargs):
        super(VPG, self).__init__(*args, **kwargs)
        assert inner_type in ["log_likelihood", "likelihood_ratio"]

        self.inner_type = inner_type
        self.recurrent = getattr(self.policy, 'recurrent', False)
        if self.recurrent:
            self.optimizer = RL2FirstOrderOptimizer(
                learning_rate=learning_rate)
        else:
            self.optimizer = MAMLFirstOrderOptimizer(
                learning_rate=learning_rate)
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'agent_infos'
        ]
        self.name = name

        self.build_graph()
Example #6
    def __init__(self,
                 *args,
                 name="ppo",
                 learning_rate=1e-3,
                 clip_eps=0.2,
                 max_epochs=5,
                 **kwargs):
        super(PPO, self).__init__(*args, **kwargs)

        self.recurrent = getattr(self.policy, 'recurrent', False)
        if self.recurrent:
            self.optimizer = RL2FirstOrderOptimizer(
                learning_rate=learning_rate, max_epochs=max_epochs)
        else:
            self.optimizer = MAMLFirstOrderOptimizer(
                learning_rate=learning_rate, max_epochs=max_epochs)
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'agent_infos'
        ]
        self.name = name
        self._clip_eps = clip_eps

        self.build_graph()
Example #7
class VPG(Algo):
    """
    Algorithm for VPG

    Args:
        policy (Policy): policy object
        name (str): tf variable scope
        learning_rate (float): learning rate for the meta-objective
        exploration (bool): use exploration / pre-update sampling term / E-MAML term
        inner_type (str): inner optimization objective - either log_likelihood or likelihood_ratio
        inner_lr (float) : gradient step size used for inner step
        meta_batch_size (int): number of meta-learning tasks
        num_inner_grad_steps (int) : number of gradient updates taken per maml iteration
        trainable_inner_step_size (boolean): whether to make the inner step size a trainable variable
    """
    def __init__(self,
                 *args,
                 name="vpg",
                 learning_rate=1e-3,
                 inner_type='likelihood_ratio',
                 **kwargs):
        super(VPG, self).__init__(*args, **kwargs)
        assert inner_type in ["log_likelihood", "likelihood_ratio"]

        self.inner_type = inner_type
        self.recurrent = getattr(self.policy, 'recurrent', False)
        if self.recurrent:
            self.optimizer = RL2FirstOrderOptimizer(
                learning_rate=learning_rate)
        else:
            self.optimizer = MAMLFirstOrderOptimizer(
                learning_rate=learning_rate)
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'agent_infos'
        ]
        self.name = name

        self.build_graph()

    def build_graph(self):
        """
        Creates the computation graph

        Notes:
            Pseudocode:
            make_vars
            build the policy distribution (recurrent or feed-forward)
            set the surrogate objective for the optimizer
        """
        """ Create Variables """
        """ ----- Build graph for the meta-update ----- """
        self.meta_op_phs_dict = OrderedDict()
        obs_ph, action_ph, adv_ph, dist_info_old_ph, all_phs_dict = self._make_input_placeholders(
            'train', recurrent=self.recurrent)
        self.meta_op_phs_dict.update(all_phs_dict)

        # dist_info_vars_for_next_step
        if self.recurrent:
            distribution_info_vars, hidden_ph, next_hidden_var = self.policy.distribution_info_sym(
                obs_ph)
        else:
            distribution_info_vars = self.policy.distribution_info_sym(obs_ph)
            hidden_ph, next_hidden_var = None, None
        """ Outer objective """
        # meta-objective
        if self.inner_type == 'log_likelihood':
            log_likelihood = self.policy.distribution.log_likelihood_sym(
                action_ph, distribution_info_vars)
            surr_obj = -tf.reduce_mean(log_likelihood * adv_ph)
        elif self.inner_type == 'likelihood_ratio':
            likelihood_ratio_adapt = self.policy.distribution.likelihood_ratio_sym(
                action_ph, dist_info_old_ph, distribution_info_vars)
            surr_obj = -tf.reduce_mean(likelihood_ratio_adapt * adv_ph)
        else:
            raise NotImplementedError

        self.optimizer.build_graph(loss=surr_obj,
                                   target=self.policy,
                                   input_ph_dict=self.meta_op_phs_dict,
                                   hidden_ph=hidden_ph,
                                   next_hidden_var=next_hidden_var)

    def optimize_policy(self, samples_data, log=True):
        """
        Performs the policy gradient optimization step

        Args:
            samples_data : processed sample data
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict(samples_data,
                                              self._optimization_keys,
                                              prefix='train')

        if log: logger.log("Optimizing")
        loss_before = self.optimizer.optimize(input_val_dict=input_dict)

        if log: logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=input_dict)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
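
Note: the outer objective built above is the standard (negated) policy-gradient surrogate, in either the log-likelihood or the likelihood-ratio form. A plain-numpy sketch of the log-likelihood variant follows (inputs are hypothetical arrays; not part of the repo), just to make the quantity being minimized concrete.

import numpy as np

def vpg_surrogate_sketch(log_likelihoods, advantages):
    # Negative of the average (log-likelihood * advantage); minimizing this
    # ascends the vanilla policy-gradient objective.
    return -np.mean(np.asarray(log_likelihoods) * np.asarray(advantages))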
Example #8
class VPGMAML(MAMLAlgo):
    """
    Algorithm for VPG MAML

    Args:
        policy (Policy): policy object
        name (str): tf variable scope
        learning_rate (float): learning rate for the meta-objective
        exploration (bool): use exploration / pre-update sampling term / E-MAML term
        rollouts_per_meta_task (int): number of trajectories per meta-task
        e_maml_sum (bool): whether to use the sum (rather than the mean) over pre-update trajectories for E-MAML
        inner_type (str): inner optimization objective - either log_likelihood or likelihood_ratio
        inner_lr (float) : gradient step size used for inner step
        meta_batch_size (int): number of meta-learning tasks
        num_inner_grad_steps (int) : number of gradient updates taken per maml iteration
        trainable_inner_step_size (boolean): whether to make the inner step size a trainable variable
    """
    def __init__(self,
                 *args,
                 name="vpg_maml",
                 learning_rate=1e-3,
                 inner_type='likelihood_ratio',
                 exploration=False,
                 rollouts_per_meta_task=None,
                 e_maml_sum=False,
                 **kwargs):
        super(VPGMAML, self).__init__(*args, **kwargs)
        assert inner_type in ["log_likelihood", "likelihood_ratio"]

        self.optimizer = MAMLFirstOrderOptimizer(learning_rate=learning_rate)
        self.inner_type = inner_type
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'agent_infos'
        ]
        self.name = name

        self.rollouts_per_meta_task = rollouts_per_meta_task
        self.exploration = exploration
        if exploration:  # add adjusted average rewards to the optimization keys
            if not e_maml_sum:
                self.rollouts_per_meta_task = 1
            self._optimization_keys.append('adj_avg_rewards')

        self.build_graph()

    def _adapt_objective_sym(self, action_sym, adv_sym, dist_info_old_sym,
                             dist_info_new_sym):
        if self.inner_type == 'likelihood_ratio':
            with tf.variable_scope("likelihood_ratio"):
                likelihood_ratio_adapt = self.policy.distribution.likelihood_ratio_sym(
                    action_sym, dist_info_old_sym, dist_info_new_sym)
            with tf.variable_scope("surrogate_loss"):
                surr_obj_adapt = -tf.reduce_mean(
                    likelihood_ratio_adapt * adv_sym)

        elif self.inner_type == 'log_likelihood':
            with tf.variable_scope("log_likelihood"):
                log_likelihood_adapt = self.policy.distribution.log_likelihood_sym(
                    action_sym, dist_info_new_sym)
            with tf.variable_scope("surrogate_loss"):
                surr_obj_adapt = -tf.reduce_mean(
                    log_likelihood_adapt * adv_sym)

        else:
            raise NotImplementedError

        return surr_obj_adapt

    def build_graph(self):
        """
        Creates the computation graph
        """
        """ Create Variables """
        with tf.variable_scope(self.name):
            self.step_sizes = self._create_step_size_vars()
            """ --- Build inner update graph for adapting the policy and sampling trajectories --- """
            # this graph is only used for adapting the policy and not computing the meta-updates
            self.adapted_policies_params, self.adapt_input_ph_dict = self._build_inner_adaption(
            )
            """ ----- Build graph for the meta-update ----- """
            self.meta_op_phs_dict = OrderedDict()
            obs_phs, action_phs, adv_phs, dist_info_old_phs, all_phs_dict = self._make_input_placeholders(
                'step0')
            self.meta_op_phs_dict.update(all_phs_dict)

            distribution_info_vars, current_policy_params = [], []
            all_surr_objs = []

        for i in range(self.meta_batch_size):
            dist_info_sym = self.policy.distribution_info_sym(obs_phs[i],
                                                              params=None)
            distribution_info_vars.append(dist_info_sym)  # step 0
            current_policy_params.append(
                self.policy.policy_params
            )  # set to real policy_params (tf.Variable)

        initial_distribution_info_vars = distribution_info_vars
        initial_action_phs = action_phs

        with tf.variable_scope(self.name):
            """ Inner updates"""
            for step_id in range(1, self.num_inner_grad_steps + 1):
                surr_objs, adapted_policy_params = [], []

                # inner adaptation step for each task
                for i in range(self.meta_batch_size):
                    surr_loss = self._adapt_objective_sym(
                        action_phs[i], adv_phs[i], dist_info_old_phs[i],
                        distribution_info_vars[i])

                    adapted_params_var = self._adapt_sym(
                        surr_loss, current_policy_params[i])

                    adapted_policy_params.append(adapted_params_var)
                    surr_objs.append(surr_loss)

                all_surr_objs.append(surr_objs)
                # Create new placeholders for the next step
                obs_phs, action_phs, adv_phs, dist_info_old_phs, all_phs_dict = self._make_input_placeholders(
                    'step%i' % step_id)
                self.meta_op_phs_dict.update(all_phs_dict)

                # dist_info_vars_for_next_step
                distribution_info_vars = [
                    self.policy.distribution_info_sym(
                        obs_phs[i], params=adapted_policy_params[i])
                    for i in range(self.meta_batch_size)
                ]
                current_policy_params = adapted_policy_params
            """ Outer objective """
            surr_objs = []

            # meta-objective
            for i in range(self.meta_batch_size):
                log_likelihood = self.policy.distribution.log_likelihood_sym(
                    action_phs[i], distribution_info_vars[i])
                surr_obj = -tf.reduce_mean(log_likelihood * adv_phs[i])

                if self.exploration:
                    # add adj_avg_reward placeholder
                    adj_avg_rewards = tf.placeholder(
                        dtype=tf.float32,
                        shape=[None],
                        name='adj_avg_rewards' + '_' +
                        str(self.num_inner_grad_steps) + '_' + str(i))
                    self.meta_op_phs_dict[
                        'step%i_task%i_%s' %
                        (self.num_inner_grad_steps, i,
                         'adj_avg_rewards')] = adj_avg_rewards

                    log_likelihood_inital = self.policy.distribution.log_likelihood_sym(
                        initial_action_phs[i],
                        initial_distribution_info_vars[i])
                    surr_obj += -tf.reduce_mean(
                        adj_avg_rewards) * tf.reduce_mean(
                            log_likelihood_inital
                        ) * self.rollouts_per_meta_task

                surr_objs.append(surr_obj)
            """ Mean over meta tasks """
            meta_objective = tf.reduce_mean(tf.stack(surr_objs, 0))

            self.optimizer.build_graph(
                loss=meta_objective,
                target=self.policy,
                input_ph_dict=self.meta_op_phs_dict,
            )

    def optimize_policy(self, all_samples_data, log=True):
        """
        Performs MAML outer step

        Args:
            all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        meta_op_input_dict = self._extract_input_dict_meta_op(
            all_samples_data, self._optimization_keys)

        if log: logger.log("Optimizing")
        loss_before = self.optimizer.optimize(
            input_val_dict=meta_op_input_dict)

        if log: logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=meta_op_input_dict)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
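
Note: when exploration is enabled, each task's meta-objective above receives an additional E-MAML term built from the pre-update (step 0) log-likelihoods and the adj_avg_rewards placeholder. A plain-numpy sketch of that extra summand (inputs are hypothetical arrays; not part of the repo), mirroring the expression in build_graph:

import numpy as np

def e_maml_term_sketch(adj_avg_rewards, initial_log_likelihoods, rollouts_per_meta_task):
    # Mirrors: -mean(adj_avg_rewards) * mean(log pi_0(a|s)) * rollouts_per_meta_task,
    # which build_graph adds to the task's outer surrogate objective.
    return (-np.mean(adj_avg_rewards)
            * np.mean(initial_log_likelihoods)
            * rollouts_per_meta_task)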
Example #9
class DICEMAML(MAMLAlgo):
    """
    Algorithm for First-Order + MAML + DICE

    Args:
        max_path_length (int): maximum path length
        policy (Policy) : policy object
        name (str): tf variable scope
        learning_rate (float): learning rate for the meta-objective
        inner_lr (float) : gradient step size used for inner step
        meta_batch_size (int): number of meta-learning tasks
        num_inner_grad_steps (int) : number of gradient updates taken per maml iteration
        trainable_inner_step_size (boolean): whether to make the inner step size a trainable variable
    """
    def __init__(self,
                 max_path_length,
                 *args,
                 name="dice_maml",
                 learning_rate=1e-3,
                 **kwargs):
        super(DICEMAML, self).__init__(*args, **kwargs)

        self.optimizer = MAMLFirstOrderOptimizer(learning_rate=learning_rate)
        self.max_path_length = max_path_length
        self._optimization_keys = [
            'observations', 'actions', 'adjusted_rewards', 'mask',
            'agent_infos'
        ]
        self.name = name

        self.build_graph()

    def _adapt_objective_sym(self, action_stacked_sym, adj_reward_sym,
                             mask_sym, dist_info_stacked_sym):
        with tf.variable_scope("log_likelihood"):
            log_likelihood_adapt = self.policy.distribution.log_likelihood_sym(
                action_stacked_sym, dist_info_stacked_sym)
            log_likelihood_adapt = tf.reshape(log_likelihood_adapt,
                                              tf.shape(mask_sym))
        with tf.variable_scope("dice_loss"):
            obj_adapt = -tf.reduce_mean(
                magic_box(log_likelihood_adapt) * adj_reward_sym * mask_sym)
        return obj_adapt

    def _build_inner_adaption(self):
        """
        Creates the (DICE) symbolic graph for the one-step inner gradient update (called repeatedly when
        more gradient steps are needed)

        Returns:
            adapted_policies_params (list): list of OrderedDicts containing the symbolic post-update parameters
            adapt_input_ph_dict (OrderedDict): dict of placeholders for the adaptation inputs

        """
        obs_phs, action_phs, adj_reward_phs, mask_phs, dist_info_old_phs, adapt_input_ph_dict = self._make_dice_input_placeholders(
            'adapt')

        adapted_policies_params = []

        for i in range(self.meta_batch_size):
            with tf.variable_scope("adapt_task_%i" % i):
                with tf.variable_scope("adapt_objective"):
                    obs_stacked = self._reshape_obs_phs(obs_phs[i])
                    action_stacked = self._reshape_action_phs(action_phs[i])
                    distribution_info_stacked = self.policy.distribution_info_sym(
                        obs_stacked, params=self.policy.policies_params_phs[i])

                    # inner surrogate objective
                    adapt_loss = self._adapt_objective_sym(
                        action_stacked, adj_reward_phs[i], mask_phs[i],
                        distribution_info_stacked)

                # get tf operation for adapted (post-update) policy
                with tf.variable_scope("adapt_step"):
                    adapted_policy_param = self._adapt_sym(
                        adapt_loss, self.policy.policies_params_phs[i])
                adapted_policies_params.append(adapted_policy_param)

        return adapted_policies_params, adapt_input_ph_dict

    def build_graph(self):
        """
        Creates the computation graph for DICE MAML
        """
        """ Build graph for sampling """
        with tf.variable_scope(self.name + '_sampling'):
            self.step_sizes = self._create_step_size_vars()
            """ --- Build inner update graph for adapting the policy and sampling trajectories --- """
            # this graph is only used for adapting the policy and not computing the meta-updates
            self.adapted_policies_params, self.adapt_input_ph_dict = self._build_inner_adaption(
            )
        """ Build graph for meta-update """
        meta_update_scope = tf.variable_scope(self.name + '_meta_update')

        with meta_update_scope:
            obs_phs, action_phs, adj_reward_phs, mask_phs, dist_info_old_phs, all_phs_dict = self._make_dice_input_placeholders(
                'step0')
            self.meta_op_phs_dict = OrderedDict(all_phs_dict)

            distribution_info_vars, current_policy_params, all_surr_objs = [], [], []

        for i in range(self.meta_batch_size):
            obs_stacked = self._reshape_obs_phs(obs_phs[i])
            dist_info_sym = self.policy.distribution_info_sym(obs_stacked,
                                                              params=None)
            distribution_info_vars.append(dist_info_sym)  # step 0
            current_policy_params.append(
                self.policy.policy_params
            )  # set to real policy_params (tf.Variable)

        with meta_update_scope:
            """ Inner updates"""
            for step_id in range(1, self.num_inner_grad_steps + 1):
                with tf.variable_scope("inner_update_%i" % step_id):
                    surr_objs, adapted_policy_params = [], []

                    # inner adaptation step for each task
                    for i in range(self.meta_batch_size):
                        action_stacked = self._reshape_action_phs(
                            action_phs[i])
                        surr_loss = self._adapt_objective_sym(
                            action_stacked, adj_reward_phs[i], mask_phs[i],
                            distribution_info_vars[i])

                        adapted_params_var = self._adapt_sym(
                            surr_loss, current_policy_params[i])

                        adapted_policy_params.append(adapted_params_var)
                        surr_objs.append(surr_loss)

                    all_surr_objs.append(surr_objs)
                    # Create new placeholders for the next step
                obs_phs, action_phs, adj_reward_phs, mask_phs, dist_info_old_phs, all_phs_dict = self._make_dice_input_placeholders(
                    'step%i' % step_id)
                self.meta_op_phs_dict.update(all_phs_dict)

                # dist_info_vars_for_next_step
                distribution_info_vars = []
                for i in range(self.meta_batch_size):
                    obs_stacked = self._reshape_obs_phs(obs_phs[i])
                    distribution_info_vars.append(
                        self.policy.distribution_info_sym(
                            obs_stacked, params=adapted_policy_params[i]))

                current_policy_params = adapted_policy_params
            """ Outer (meta-)objective """
            with tf.variable_scope("outer_update"):
                surr_objs = []

                # meta-objective
                for i in range(self.meta_batch_size):
                    action_stacked = self._reshape_action_phs(action_phs[i])
                    surr_obj = self._adapt_objective_sym(
                        action_stacked, adj_reward_phs[i], mask_phs[i],
                        distribution_info_vars[i])
                    surr_objs.append(surr_obj)
                """ Mean over meta tasks """
                meta_objective = tf.reduce_mean(tf.stack(surr_objs, 0))

                self.optimizer.build_graph(
                    loss=meta_objective,
                    target=self.policy,
                    input_ph_dict=self.meta_op_phs_dict,
                )

    def optimize_policy(self, all_samples_data, log=True):
        """
        Performs MAML outer step

        Args:
            all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        meta_op_input_dict = self._extract_input_dict_meta_op(
            all_samples_data, self._optimization_keys)

        if log: logger.log("Optimizing")
        loss_before = self.optimizer.optimize(
            input_val_dict=meta_op_input_dict)

        if log: logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=meta_op_input_dict)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)

    def _make_dice_input_placeholders(self, prefix=''):
        """
        In contrast to _make_input_placeholders, each placeholder has one additional (time) dimension of size self.max_path_length
        Args:
            prefix (str) : a string to prepend to the name of each variable

        Returns:
            (tuple) : a tuple containing lists of placeholders for each input type and meta task,
            and, for convenience, an OrderedDict containing all placeholders created
        """
        obs_phs, action_phs, adj_reward, mask_phs, dist_info_phs = [], [], [], [], []
        dist_info_specs = self.policy.distribution.dist_info_specs

        all_phs_dict = OrderedDict()

        for task_id in range(self.meta_batch_size):
            # observation ph
            ph = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.max_path_length, self.policy.obs_dim],
                name='obs' + '_' + prefix + '_' + str(task_id))
            all_phs_dict['%s_task%i_%s' %
                         (prefix, task_id, 'observations')] = ph
            obs_phs.append(ph)

            # action ph
            ph = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.max_path_length, self.policy.action_dim],
                name='action' + '_' + prefix + '_' + str(task_id))
            all_phs_dict['%s_task%i_%s' % (prefix, task_id, 'actions')] = ph
            action_phs.append(ph)

            # adjusted reward ph
            ph = tf.placeholder(dtype=tf.float32,
                                shape=[None, self.max_path_length],
                                name='adjusted_rewards' + '_' + prefix + '_' +
                                str(task_id))
            all_phs_dict['%s_task%i_%s' %
                         (prefix, task_id, 'adjusted_rewards')] = ph
            adj_reward.append(ph)

            # mask ph
            ph = tf.placeholder(dtype=tf.float32,
                                shape=[None, self.max_path_length],
                                name='mask' + '_' + prefix + '_' +
                                str(task_id))
            all_phs_dict['%s_task%i_%s' % (prefix, task_id, 'mask')] = ph
            mask_phs.append(ph)

            # distribution / agent info
            dist_info_ph_dict = {}
            for info_key, shape in dist_info_specs:
                ph = tf.placeholder(
                    dtype=tf.float32,
                    shape=[None, self.max_path_length] + list(shape),
                    name='%s_%s_%i' % (info_key, prefix, task_id))
                all_phs_dict['%s_task%i_agent_infos/%s' %
                             (prefix, task_id, info_key)] = ph
                dist_info_ph_dict[info_key] = ph
            dist_info_phs.append(dist_info_ph_dict)

        return obs_phs, action_phs, adj_reward, mask_phs, dist_info_phs, all_phs_dict

    def _reshape_obs_phs(self, obs_sym):
        # reshape from 3-D tensor of shape (num_paths, max_path_length, ndim_obs) to (num_paths * max_path_length, ndim_obs)
        return tf.reshape(obs_sym, [-1, self.policy.obs_dim])

    def _reshape_action_phs(self, action_sym):
        # reshape from 3-D tensor of shape (num_paths, max_path_length, ndim_act) to (num_paths * max_path_length, ndim_act)
        return tf.reshape(action_sym, [-1, self.policy.action_dim])
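
Note: the DICE placeholders above expect zero-padded trajectory tensors of shape (num_paths, max_path_length, dim) plus a binary mask of shape (num_paths, max_path_length). A small numpy sketch (helper name hypothetical; not part of the repo) of building such a padded batch from variable-length observation trajectories:

import numpy as np

def pad_trajectories_sketch(trajs_obs, max_path_length, obs_dim):
    # trajs_obs: list of arrays, each of shape (path_length_i, obs_dim)
    n = len(trajs_obs)
    obs = np.zeros((n, max_path_length, obs_dim), dtype=np.float32)
    mask = np.zeros((n, max_path_length), dtype=np.float32)
    for i, traj in enumerate(trajs_obs):
        t = min(len(traj), max_path_length)
        obs[i, :t] = traj[:t]
        mask[i, :t] = 1.0  # valid timesteps; the padded tail stays masked out
    return obs, mask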
Example #10
class VPGMAML(MAMLAlgo):
    """
    Algorithm for VPG MAML

    Args:
        policy (Policy): policy object
        name (str): tf variable scope
        learning_rate (float): learning rate for the meta-objective
        inner_type (str): inner optimization objective - either log_likelihood or likelihood_ratio
        inner_lr (float) : gradient step size used for inner step
        meta_batch_size (int): number of meta-learning tasks
        num_inner_grad_steps (int) : number of gradient updates taken per maml iteration
        trainable_inner_step_size (boolean): whether to make the inner step size a trainable variable
    """
    def __init__(self,
                 *args,
                 name="vpg_maml",
                 learning_rate=1e-3,
                 inner_type='likelihood_ratio',
                 exploration=False,
                 **kwargs):
        super(VPGMAML, self).__init__(*args, **kwargs)
        assert inner_type in ["log_likelihood", "likelihood_ratio"]

        self.optimizer = MAMLFirstOrderOptimizer(learning_rate=learning_rate)
        self.inner_type = inner_type
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'agent_infos'
        ]
        self.name = name
        self.exploration = exploration

        self.build_graph()

    def _adapt_objective_sym(self, action_sym, adv_sym, dist_info_old_sym,
                             dist_info_new_sym):
        if self.inner_type == 'likelihood_ratio':
            with tf.variable_scope("likelihood_ratio"):
                likelihood_ratio_adapt = self.policy.distribution.likelihood_ratio_sym(
                    action_sym, dist_info_old_sym, dist_info_new_sym)
            with tf.variable_scope("surrogate_loss"):
                surr_obj_adapt = -tf.reduce_mean(
                    likelihood_ratio_adapt * adv_sym)

        elif self.inner_type == 'log_likelihood':
            with tf.variable_scope("log_likelihood"):
                log_likelihood_adapt = self.policy.distribution.log_likelihood_sym(
                    action_sym, dist_info_new_sym)
            with tf.variable_scope("surrogate_loss"):
                surr_obj_adapt = -tf.reduce_mean(
                    log_likelihood_adapt * adv_sym)

        else:
            raise NotImplementedError

        return surr_obj_adapt

    def build_graph(self):
        """
        Creates the computation graph
        """

        self.gradients = []
        """ Create Variables """
        with tf.variable_scope(self.name):
            self.step_sizes = self._create_step_size_vars()
            """ --- Build inner update graph for adapting the policy and sampling trajectories --- """
            # this graph is only used for adapting the policy and not computing the meta-updates
            self.adapted_policies_params, self.adapt_input_ph_dict = self._build_inner_adaption(
            )
            """ ----- Build graph for the meta-update ----- """
            self.meta_op_phs_dict = OrderedDict()
            obs_phs, action_phs, adv_phs, dist_info_old_phs, all_phs_dict = self._make_input_placeholders(
                'step0')
            self.meta_op_phs_dict.update(all_phs_dict)

            distribution_info_vars, current_policy_params = [], []
            all_surr_objs = []

        for i in range(self.meta_batch_size):
            dist_info_sym = self.policy.distribution_info_sym(obs_phs[i],
                                                              params=None)
            distribution_info_vars.append(dist_info_sym)  # step 0
            current_policy_params.append(
                self.policy.policy_params
            )  # set to real policy_params (tf.Variable)

        initial_distribution_info_vars = distribution_info_vars
        initial_action_phs = action_phs

        with tf.variable_scope(self.name):
            """ Inner updates"""
            for step_id in range(1, self.num_inner_grad_steps + 1):
                surr_objs, adapted_policy_params, gradient_vectors = [], [], []

                # inner adaptation step for each task
                for i in range(self.meta_batch_size):
                    surr_loss = self._adapt_objective_sym(
                        action_phs[i], adv_phs[i], dist_info_old_phs[i],
                        distribution_info_vars[i])

                    adapted_params_var, gradient_vector = self._adapt_sym(
                        surr_loss, current_policy_params[i])
                    gradient_vectors.append(gradient_vector)

                    adapted_policy_params.append(adapted_params_var)
                    surr_objs.append(surr_loss)

                all_surr_objs.append(surr_objs)
                # Create new placeholders for the next step
                obs_phs, action_phs, adv_phs, dist_info_old_phs, all_phs_dict = self._make_input_placeholders(
                    'step%i' % step_id)
                self.meta_op_phs_dict.update(all_phs_dict)

                # dist_info_vars_for_next_step
                distribution_info_vars = [
                    self.policy.distribution_info_sym(
                        obs_phs[i], params=adapted_policy_params[i])
                    for i in range(self.meta_batch_size)
                ]
                current_policy_params = adapted_policy_params
                self.gradients.append(gradient_vectors)
            """ Outer objective """
            surr_objs = []

            # meta-objective
            for i in range(self.meta_batch_size):
                log_likelihood = self.policy.distribution.log_likelihood_sym(
                    action_phs[i], distribution_info_vars[i])
                surr_obj = -tf.reduce_mean(log_likelihood * adv_phs[i])
                surr_objs.append(surr_obj)

                if self.exploration:
                    log_likelihood_inital = self.policy.distribution.log_likelihood_sym(
                        initial_action_phs[i],
                        initial_distribution_info_vars[i])
                    surr_obj += -tf.reduce_mean(
                        adv_phs[i]) * tf.reduce_sum(log_likelihood_inital)
            """ Mean over meta tasks """
            meta_objective = tf.reduce_mean(tf.stack(surr_objs, 0))

            # get meta gradients
            params_var = self.policy.get_params()
            meta_gradients = tf.gradients(
                meta_objective,
                [params_var[key] for key in sorted(params_var.keys())])
            meta_gradients = tf.concat(
                [tf.reshape(grad, shape=(-1, )) for grad in meta_gradients],
                axis=0)  # flatten and concatenate
            self.gradients.append(meta_gradients)

            self.optimizer.build_graph(
                loss=meta_objective,
                target=self.policy,
                input_ph_dict=self.meta_op_phs_dict,
            )

    def optimize_policy(self, all_samples_data, log=True):
        """
        Performs MAML outer step

        Args:
            all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        meta_op_input_dict = self._extract_input_dict_meta_op(
            all_samples_data, self._optimization_keys)

        if log: logger.log("Optimizing")
        loss_before = self.optimizer.optimize(
            input_val_dict=meta_op_input_dict)

        if log: logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=meta_op_input_dict)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)

    def compute_gradients(self, all_samples_data, log=True):
        meta_op_input_dict = self._extract_input_dict_meta_op(
            all_samples_data, self._optimization_keys)
        feed_dict = utils.create_feed_dict(
            placeholder_dict=self.meta_op_phs_dict,
            value_dict=meta_op_input_dict)
        if log: logger.log("compute gradients")
        gradients_values = tf.get_default_session().run(self.gradients,
                                                        feed_dict=feed_dict)
        return gradients_values

    def _build_inner_adaption(self):
        """
        Creates the symbolic graph for the one-step inner gradient update (called repeatedly when
        more gradient steps are needed)

        Returns:
            adapted_policies_params (list): list of OrderedDicts containing the symbolic post-update parameters
            adapt_input_ph_dict (OrderedDict): dict of placeholders for the adaptation inputs

        """
        obs_phs, action_phs, adv_phs, dist_info_old_phs, adapt_input_ph_dict = self._make_input_placeholders(
            'adapt')

        adapted_policies_params = []

        for i in range(self.meta_batch_size):
            with tf.variable_scope("adapt_task_%i" % i):
                with tf.variable_scope("adapt_objective"):
                    distribution_info_new = self.policy.distribution_info_sym(
                        obs_phs[i], params=self.policy.policies_params_phs[i])

                    # inner surrogate objective
                    surr_obj_adapt = self._adapt_objective_sym(
                        action_phs[i], adv_phs[i], dist_info_old_phs[i],
                        distribution_info_new)

                # get tf operation for adapted (post-update) policy
                with tf.variable_scope("adapt_step"):
                    adapted_policy_param, _ = self._adapt_sym(
                        surr_obj_adapt, self.policy.policies_params_phs[i])
                adapted_policies_params.append(adapted_policy_param)

        return adapted_policies_params, adapt_input_ph_dict

    def _adapt_sym(self, surr_obj, params_var):
        """
        Creates the symbolic representation of the tf policy after one gradient step towards the surr_obj

        Args:
            surr_obj (tf_op) : tensorflow op for task specific (inner) objective
            params_var (dict) : dict of tf.Tensors for current policy params

        Returns:
            (dict):  dict of tf.Tensors for adapted policy params
        """
        # TODO: Fix this if we want to learn the learning rate (it isn't supported right now).
        update_param_keys = list(params_var.keys())

        grads = tf.gradients(surr_obj,
                             [params_var[key] for key in update_param_keys])
        gradients = dict(zip(update_param_keys, grads))

        # gradient descent
        adapted_policy_params = [
            params_var[key] - tf.multiply(self.step_sizes[key], gradients[key])
            for key in update_param_keys
        ]

        adapted_policy_params_dict = OrderedDict(
            zip(update_param_keys, adapted_policy_params))

        # flattens and concatenates the gradients
        gradient_vector = tf.concat(
            [tf.reshape(grad, shape=(-1, )) for grad in grads], axis=0)
        return adapted_policy_params_dict, gradient_vector
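
Note: the update that _adapt_sym builds symbolically is an ordinary per-parameter gradient step, theta' = theta - step_size * grad (it additionally returns the flattened gradient vector). A plain-numpy illustration of the same update rule over a dict of parameters (inputs are hypothetical arrays; not part of the repo):

from collections import OrderedDict

def adapt_step_sketch(params, grads, step_sizes):
    # params, grads, step_sizes: dicts keyed by parameter name, holding numpy arrays;
    # matches the per-key update computed symbolically in _adapt_sym.
    return OrderedDict(
        (key, params[key] - step_sizes[key] * grads[key]) for key in params
    )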
Example #11
    def testGauss(self):
        np.random.seed(65)
        for optimizer in [MAMLFirstOrderOptimizer()]:
            tf.reset_default_graph()
            with tf.Session():
                input_phs = tf.placeholder(dtype=tf.float32, shape=[None, 100])
                target_mean_ph = tf.placeholder(dtype=tf.float32,
                                                shape=[None, 1])
                target_std_ph = tf.placeholder(dtype=tf.float32,
                                               shape=[None, 1])

                mean_network = Mlp(input_phs,
                                   1,
                                   hidden_size=(8, 8),
                                   name='mean')
                std_network = Mlp(input_phs, 1, hidden_size=(8, 8), name='std')

                target_std = tf.exp(target_std_ph)
                pred_std = tf.exp(std_network.output)

                numerator = tf.square(target_mean_ph -
                                      mean_network.output) + tf.square(
                                          target_std) - tf.square(pred_std)
                denominator = 2 * tf.square(pred_std) + 1e-8
                loss = tf.reduce_mean(
                    tf.reduce_sum(numerator / denominator +
                                  std_network.output - target_std_ph,
                                  axis=-1))

                joined_network = CombinedMlp([mean_network, std_network])
                input_ph_dict = OrderedDict({
                    'x': input_phs,
                    'y_mean': target_mean_ph,
                    'y_std': target_std_ph
                })

                optimizer.build_graph(loss, joined_network, input_ph_dict)

                sess = tf.get_default_session()
                sess.run(tf.global_variables_initializer())

                for i in range(2000):
                    means = np.random.random(size=(1000))
                    stds = np.random.random(size=(1000))
                    inputs = np.vstack([
                        np.random.normal(mean, np.exp(std), 100)
                        for mean, std in zip(means, stds)
                    ])
                    all_inputs = {
                        'x': inputs,
                        'y_mean': means.reshape(-1, 1),
                        'y_std': stds.reshape(-1, 1)
                    }
                    optimizer.optimize(all_inputs)
                    if i % 100 == 0:
                        print(optimizer.loss(all_inputs))

                means = np.random.random(size=(20))
                stds = np.random.random(size=(20))

                inputs = np.stack([
                    np.random.normal(mean, np.exp(std), 100)
                    for mean, std in zip(means, stds)
                ],
                                  axis=0)
                values_dict = OrderedDict({
                    'x': inputs,
                    'y_mean': means.reshape(-1, 1),
                    'y_std': stds.reshape(-1, 1)
                })

                mean_pred, std_pred = sess.run(
                    joined_network.output,
                    feed_dict=dict(
                        list(zip(input_ph_dict.values(),
                                 values_dict.values()))))

                self.assertTrue(np.mean(np.square(mean_pred - means)) < 0.2)
                self.assertTrue(np.mean(np.square(std_pred - stds)) < 0.2)
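
Note: the loss minimized in testGauss is the KL divergence KL(N(target_mean, target_std^2) || N(pred_mean, pred_std^2)), with the std network predicting the log-std (up to the 1e-8 stabilizer in the denominator). A numpy sketch of the same quantity (function name hypothetical; not part of the repo):

import numpy as np

def gaussian_kl_sketch(mu_target, log_std_target, mu_pred, log_std_pred):
    # Elementwise KL( N(mu_target, exp(log_std_target)^2) || N(mu_pred, exp(log_std_pred)^2) ),
    # written in the same form as the TensorFlow loss above.
    var_t = np.exp(2.0 * log_std_target)
    var_p = np.exp(2.0 * log_std_pred)
    return (((mu_target - mu_pred) ** 2 + var_t - var_p) / (2.0 * var_p)
            + log_std_pred - log_std_target)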