Example #1
    def __init__(self,
                 obs_dim,
                 act_dim,
                 *,
                 seed,
                 norm,
                 model_weights,
                 target_temp,
                 med_dist,
                 hidden_layers_q,
                 activation_q,
                 hidden_layers_w=[32, 32],
                 scope='mwl',
                 lr=5e-3,
                 reg_factor=0,
                 gamma=0.999):
        super().__init__(scope)

        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.gamma = gamma
        self.lr = lr
        self.reg_factor = reg_factor
        self.hidden_layers_w = hidden_layers_w
        self.hidden_layers_q = hidden_layers_q
        # import pdb; pdb.set_trace()
        self.act_encoded = np.linspace(-1, 1, act_dim)

        self.med_dist = med_dist
        self.seed = seed
        self.norm = norm
        assert self.norm['type'] is not None, \
            'data should already be processed before calling the algorithm'

        self._session = tf.compat.v1.Session()
        self._session.__enter__()
        with self._session.as_default():
            self.q_net = Q_Model_Tf(
                obs_dim, act_dim,
                hidden_layers=hidden_layers_q,
                temperature=target_temp,
                seed=self.seed,
                activation=activation_q)

        #XXX debug
        self.debug_w = {}

        self.trainable_vars = []

        self.build_graph()
        self.build_estimation_graph()
        self.create_loss_func()

        self._session.run(
            [tf.compat.v1.variables_initializer(self.trainable_vars)])
        self.q_net.load_weight(model_weights)
Example #2
    def __init__(self,
                 obs_dim,
                 act_dim,
                 *,
                 seed,
                 norm,
                 model_weights,
                 target_temp,
                 med_dist,
                 hidden_layers_p,
                 activation_p,
                 hidden_layers=[32, 32],
                 scope='mql',
                 lr=5e-3,
                 reg_factor=0,
                 gamma=0.999):
        super().__init__(scope)

        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.gamma = gamma
        self.lr = lr
        self.reg_factor = reg_factor
        self.hidden_layers = hidden_layers
        self.seed = seed

        self.med_dist = med_dist
        # self.q_net = q_net
        self.norm = norm

        self.act_encoded = np.linspace(-1, 1, act_dim)

        #XXX debug
        self.debug_q = {}

        self.trainable_vars = []
        self._session = tf.compat.v1.Session()
        self._session.__enter__()
        with self._session.as_default():
            self.q_net = Q_Model_Tf(
                obs_dim, act_dim,
                hidden_layers=hidden_layers_p,
                temperature=target_temp,
                seed=self.seed,
                activation=activation_p)

        self.build_graph()
        self.build_estimation_graph()
        self.create_loss_func()

        self._session.run(
            [tf.compat.v1.variables_initializer(self.trainable_vars)])
        self.q_net.load_weight(model_weights)
Example #3
    def __init__(self,
                 parameters,
                 target_policy_config,
                 solve_for_state_action_ratio=True,
                 average_next_nu=True,
                 average_samples=1,
                 function_exponent=1.5):
        """Initializes the solver.

        Args:
            parameters: An object holding the common neural network parameters.
            target_policy_config: Dict describing the target policy Q-model
                (obs_dim, act_dim, hidden_layers_p, target_temp, seed,
                activation_p, model_weights).
            solve_for_state_action_ratio: Whether to solve for state-action density
                ratio. Defaults to True, which is recommended, since solving for the
                state density ratio requires importance weights which can introduce
                training instability.
            average_next_nu: Whether to take an empirical expectation over next nu.
                This can improve stability of training.
            average_samples: Number of empirical samples to average over for next nu
                computation (only relevant in continuous environments).
            function_exponent: The form of the function f(x). We use a polynomial
                f(x)=|x|^p / p where p is function_exponent.

        Raises:
            ValueError: If function_exponent is less than or equal to 1.
            NotImplementedError: If actions are continuous.
        """
        self._parameters = parameters
        self._solve_for_state_action_ratio = solve_for_state_action_ratio
        self._average_next_nu = average_next_nu
        self._average_samples = average_samples

        if not self._parameters.discrete_actions:
            raise NotImplementedError('Continuous actions are not fully supported.')

        if function_exponent <= 1:
            raise ValueError('Exponent for f must be greater than 1.')

        # Conjugate of f(x) = |x|^p / p is f*(x) = |x|^q / q where q = p / (p - 1).
        conjugate_exponent = function_exponent / (function_exponent - 1)
        self._f = lambda x: tf.abs(x) ** function_exponent / function_exponent
        self._fstar = lambda x: tf.abs(x) ** conjugate_exponent / conjugate_exponent

        # Build and initialize graph.
        self._build_graph()
        self._session = tf.compat.v1.Session()
        self._session.__enter__()
        obs_dim = target_policy_config['obs_dim']
        act_dim = target_policy_config['act_dim']
        hidden_layers_p = target_policy_config['hidden_layers_p']
        target_temp = target_policy_config['target_temp']
        seed = target_policy_config['seed']
        activation_p = target_policy_config['activation_p']
        model_weights = target_policy_config['model_weights']

        with self._session.as_default():
            self.target_policy = Q_Model_Tf(
                obs_dim, act_dim,
                hidden_layers=hidden_layers_p,
                temperature=target_temp,
                seed=seed,
                activation=activation_p)
            

            self._session.run(tf.compat.v1.global_variables_initializer())
            
            self.target_policy.load_weight(model_weights)
Example #4
class NeuralDualDice(base_algo.BaseAlgo):
    """Approximate the density ratio using neural networks."""

    def __init__(self,
                 parameters,
                 target_policy_config,
                 solve_for_state_action_ratio=True,
                 average_next_nu=True,
                 average_samples=1,
                 function_exponent=1.5):
        """Initializes the solver.

        Args:
            parameters: An object holding the common neural network parameters.
            target_policy_config: Dict describing the target policy Q-model
                (obs_dim, act_dim, hidden_layers_p, target_temp, seed,
                activation_p, model_weights).
            solve_for_state_action_ratio: Whether to solve for state-action density
                ratio. Defaults to True, which is recommended, since solving for the
                state density ratio requires importance weights which can introduce
                training instability.
            average_next_nu: Whether to take an empirical expectation over next nu.
                This can improve stability of training.
            average_samples: Number of empirical samples to average over for next nu
                computation (only relevant in continuous environments).
            function_exponent: The form of the function f(x). We use a polynomial
                f(x)=|x|^p / p where p is function_exponent.

        Raises:
            ValueError: If function_exponent is less than or equal to 1.
            NotImplementedError: If actions are continuous.
        """
        self._parameters = parameters
        self._solve_for_state_action_ratio = solve_for_state_action_ratio
        self._average_next_nu = average_next_nu
        self._average_samples = average_samples

        if not self._parameters.discrete_actions:
            raise NotImplementedError('Continuous actions are not fully supported.')

        if function_exponent <= 1:
            raise ValueError('Exponent for f must be greater than 1.')

        # Conjugate of f(x) = |x|^p / p is f*(x) = |x|^q / q where q = p / (p - 1).
        conjugate_exponent = function_exponent / (function_exponent - 1)
        self._f = lambda x: tf.abs(x) ** function_exponent / function_exponent
        self._fstar = lambda x: tf.abs(x) ** conjugate_exponent / conjugate_exponent
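        # For example, function_exponent p = 1.5 gives conjugate exponent
        # q = 1.5 / 0.5 = 3, so f*(x) = |x|**3 / 3.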

        # Build and initialize graph.
        self._build_graph()
        self._session = tf.compat.v1.Session()
        self._session.__enter__()
        obs_dim = target_policy_config['obs_dim']
        act_dim = target_policy_config['act_dim']
        hidden_layers_p = target_policy_config['hidden_layers_p']
        target_temp = target_policy_config['target_temp']
        seed = target_policy_config['seed']
        activation_p = target_policy_config['activation_p']
        model_weights = target_policy_config['model_weights']

        with self._session.as_default():
            self.target_policy = Q_Model_Tf(
                obs_dim, act_dim,
                hidden_layers=hidden_layers_p,
                temperature=target_temp,
                seed=seed,
                activation=activation_p)
            

            self._session.run(tf.compat.v1.global_variables_initializer())
            
            self.target_policy.load_weight(model_weights)

    def _build_graph(self):
        self._create_placeholders()

        # Convert discrete actions to one-hot vectors.
        action = tf.one_hot(self._action, self._parameters.action_dim)
        next_action = tf.one_hot(self._next_action, self._parameters.action_dim)
        initial_action = tf.one_hot(self._initial_action,
                                    self._parameters.action_dim)

        nu, next_nu, initial_nu, zeta = self._compute_values(
                action, next_action, initial_action)

        # Density ratio given by approximated zeta values.
        self._density_ratio = zeta

        delta_nu = nu - next_nu * self._parameters.gamma
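        # delta_nu is the discounted Bellman residual nu(s, a) - gamma * nu(s', a'),
        # which enters the saddle-point objective below.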

        unweighted_zeta_loss = (delta_nu * zeta - self._fstar(zeta) -
                                (1 - self._parameters.gamma) * initial_nu)
        self._zeta_loss = -(tf.reduce_sum(input_tensor=self._weights * unweighted_zeta_loss) /
                            tf.reduce_sum(input_tensor=self._weights))

        # TODO, deterministic env ???
        if self._parameters.deterministic_env and self._average_next_nu:
            # Don't use the Fenchel conjugate trick; optimize the primal directly.
            unweighted_nu_loss = (self._f(delta_nu) -
                                  (1 - self._parameters.gamma) * initial_nu)
            self._nu_loss = (tf.reduce_sum(input_tensor=self._weights * unweighted_nu_loss) /
                             tf.reduce_sum(input_tensor=self._weights))
        else:
            self._nu_loss = -self._zeta_loss

        self._train_nu_op = tf.compat.v1.train.AdamOptimizer(
                self._parameters.nu_learning_rate).minimize(
                        self._nu_loss, var_list=tf.compat.v1.trainable_variables('nu'))
        self._train_zeta_op = tf.compat.v1.train.AdamOptimizer(
                self._parameters.zeta_learning_rate).minimize(
                        self._zeta_loss, var_list=tf.compat.v1.trainable_variables('zeta'))
        self._train_op = tf.group(self._train_nu_op, self._train_zeta_op)

        # Debug quantity (should be close to 1).
        self._debug = (
                tf.reduce_sum(input_tensor=self._weights * self._density_ratio) /
                tf.reduce_sum(input_tensor=self._weights))

    def _create_placeholders(self):
        self._state = tf.compat.v1.placeholder(
                tf.float32, [None, self._parameters.state_dim], 'state')
        self._next_state = tf.compat.v1.placeholder(
                tf.float32, [None, self._parameters.state_dim], 'next_state')
        self._initial_state = tf.compat.v1.placeholder(
                tf.float32, [None, self._parameters.state_dim], 'initial_state')

        self._action = tf.compat.v1.placeholder(tf.int32, [None], 'action')
        self._next_action = tf.compat.v1.placeholder(tf.int32, [None], 'next_action')
        self._initial_action = tf.compat.v1.placeholder(tf.int32, [None], 'initial_action')


        # Policy sampling probabilities associated with next state.
        self._target_policy_next_probs = tf.compat.v1.placeholder(
                tf.float32, [None, self._parameters.action_dim])

        self._weights = tf.compat.v1.placeholder(tf.float32, [None], 'weights')

    def _compute_values(self, action, next_action, initial_action):
        self.act_w = 1.0
        nu = self._nu_network(self._state, action)
        initial_nu = self._nu_network(self._initial_state, initial_action)

        if self._average_next_nu:
            # Average next nu over all actions weighted by target policy
            # probabilities.
            all_next_actions = [
                    tf.one_hot(act * tf.ones_like(self._next_action),
                                         self._parameters.action_dim)
                    for act in range(self._parameters.action_dim)]
            all_next_nu = [self._nu_network(self._next_state, next_action_i)
                                         for next_action_i in all_next_actions]
            next_nu = sum(
                    self._target_policy_next_probs[:, act_index] * all_next_nu[act_index]
                    for act_index in range(self._parameters.action_dim))
        else:
            next_nu = self._nu_network(self._next_state, next_action)

        zeta = self._zeta_network(self._state, action)

        return nu, next_nu, initial_nu, zeta

    def _nu_network(self, state, action):
        with tf.compat.v1.variable_scope('nu', reuse=tf.compat.v1.AUTO_REUSE):
            inputs = tf.concat([state, action * self.act_w], -1)        
            outputs = self._network(inputs)
        return outputs

    def _zeta_network(self, state, action, act_w=1.0):
        with tf.compat.v1.variable_scope('zeta', reuse=tf.compat.v1.AUTO_REUSE):
            inputs = tf.concat([state, action * self.act_w], -1)
            outputs = self._network(inputs)
        return outputs

    def _network(self, inputs):
        with tf.compat.v1.variable_scope('network', reuse=tf.compat.v1.AUTO_REUSE):
            input_dim = int(inputs.shape[-1])
            prev_dim = input_dim
            prev_outputs = inputs
            # Hidden layers.
            #XXX, for fair comparison, change the network build method? (tf.layers.dense???)
            for layer in range(self._parameters.hidden_layers):
                with tf.compat.v1.variable_scope('layer%d' % layer, reuse=tf.compat.v1.AUTO_REUSE):
                    weight = tf.compat.v1.get_variable(
                            'weight', [prev_dim, self._parameters.hidden_dim],
                            initializer=tf.compat.v1.glorot_uniform_initializer())
                    bias = tf.compat.v1.get_variable(
                            'bias', initializer=tf.zeros([self._parameters.hidden_dim]))
                    pre_activation = tf.matmul(prev_outputs, weight) + bias
                    post_activation = self._parameters.activation(pre_activation)
                prev_dim = self._parameters.hidden_dim
                prev_outputs = post_activation

            # Final layer.
            weight = tf.compat.v1.get_variable(
                    'weight_final', [prev_dim, 1],
                    initializer=tf.compat.v1.glorot_uniform_initializer())
            bias = tf.compat.v1.get_variable(
                    'bias_final', [1], initializer=tf.compat.v1.zeros_initializer())
            output = tf.matmul(prev_outputs, weight) + bias
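            # output has shape [batch, 1]; indexing with Ellipsis drops the trailing
            # dimension so the network returns a [batch]-shaped tensor.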
            return output[Ellipsis, 0]

    def _sample_data(self, dataset, sample_num):
        data_size = dataset['obs'].shape[0]
        init_size = dataset['init_obs'].shape[0]

        index = np.random.choice(data_size, sample_num)
        init_index = np.random.choice(init_size, sample_num)

        return {
            'obs': dataset['obs'][index],
            'acts': dataset['acts'][index],
            'next_obs': dataset['next_obs'][index],
            'next_acts': dataset['next_acts'][index],
            'init_obs': dataset['init_obs'][init_index],
            'time_step': dataset['time_step'][index],
            'target_prob_next_obs': dataset['target_prob_next_obs'][index]
        }

    # def solve(self, dataset, target_policy, norm, logger):
    def solve(self, dataset, norm, logger):
        """Solves for density ratios and then approximates target policy value.

        Args:
            dataset: The transition data store to use.
            norm: Normalization info applied to observations, passed to the
                target policy when sampling actions.
            logger: Optional logger for recording intermediate value estimates.

        Returns:
            Estimated average per-step reward of the target policy.

        Raises:
            ValueError: If NaNs encountered in policy ratio computation.
        """

        value_estimates = []
        for step in range(self._parameters.num_steps):
            batch = self._sample_data(dataset, self._parameters.batch_size)
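            # Each transition is weighted by gamma ** batch['time_step'] below, so the
            # loss follows the discounted occupancy of the data.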
            feed_dict = {
                    self._state: batch['obs'],
                    self._action: batch['acts'],
                    self._next_state: batch['next_obs'],
                    self._initial_state: batch['init_obs'],
                    self._weights: self._parameters.gamma ** batch['time_step'],
            }
            
            # On-policy next action and initial action.
            feed_dict[self._next_action] = self.target_policy.sample_action(
                    batch['next_obs'], norm)
            feed_dict[self._initial_action] = self.target_policy.sample_action(
                    batch['init_obs'], norm)
            
            if self._average_next_nu:
                # The next-state action probabilities are read directly from the data
                # rather than recomputed from the target policy, i.e. instead of:
                # next_probabilities = self.target_policy.get_probabilities(batch['next_obs'], norm)
                # feed_dict[self._target_policy_next_probs] = next_probabilities
                feed_dict[self._target_policy_next_probs] = batch['target_prob_next_obs']
            self._session.run(self._train_op, feed_dict=feed_dict)

            if step % self._parameters.log_every == 0:
                debug = self._session.run(self._debug, feed_dict=feed_dict)
                value_estimate = self.estimate_average_reward(dataset)
                value_estimates.append(value_estimate)
                # if step % 1000 == 0:
                #     print('Iter: {}. DualDICE Estimate: {:.2f}'.format(step, np.mean(value_estimates[-self._parameters.smooth_over:])))

                # logger.info('At step %d' % step)
                # logger.info('Debug: %s' % debug)
                # value_estimate = self.estimate_average_reward(dataset)
                # logger.info('Estimated value: %s' % value_estimate)
                # value_estimates.append(value_estimate)
                # logger.info(
                #         'Estimated smoothed value: %s' %
                #         np.mean(value_estimates[-self._parameters.smooth_over:]))

                # if self._parameters.summary_writer:
                #     summary = tf.compat.v1.Summary(value=[
                #             tf.compat.v1.Summary.Value(
                #                     tag='%sdebug' % self._parameters.summary_prefix,
                #                     simple_value=debug),
                #             tf.compat.v1.Summary.Value(
                #                     tag='%svalue_estimate' % self._parameters.summary_prefix,
                #                     simple_value=value_estimate)])
                #     self._parameters.summary_writer.add_summary(summary, step)

                # logger.add(step, value_estimate, np.mean(value_estimates[-self._parameters.smooth_over:]))
                # logger.dump()


            # if step % self._parameters.log_every == 0 and (logger is not None):
            #     debug = self._session.run(self._debug, feed_dict=feed_dict)
            #     # tf.logging.info('At step %d' % step)
            #     # tf.logging.info('Debug: %s' % debug)
            #     # value_estimate = self.estimate_average_reward(dataset)
            #     # tf.logging.info('Estimated value: %s' % value_estimate)
            #     # value_estimates.append(value_estimate)
            #     # tf.logging.info(
            #     #     'Estimated smoothed value: %s' %
            #     #     np.mean(value_estimates[-self._parameters.smooth_over:]))

            #     logger.info('At step %d' % step)
            #     logger.info('Debug: %s' % debug)
            #     value_estimate = self.estimate_average_reward(dataset)
            #     logger.info('Estimated value: %s' % value_estimate)
            #     value_estimates.append(value_estimate)
            #     logger.info(
            #             'Estimated smoothed value: %s' %
            #             np.mean(value_estimates[-self._parameters.smooth_over:]))

            #     if self._parameters.summary_writer:
            #         summary = tf.compat.v1.Summary(value=[
            #                 tf.compat.v1.Summary.Value(
            #                         tag='%sdebug' % self._parameters.summary_prefix,
            #                         simple_value=debug),
            #                 tf.compat.v1.Summary.Value(
            #                         tag='%svalue_estimate' % self._parameters.summary_prefix,
            #                         simple_value=value_estimate)])
            #         self._parameters.summary_writer.add_summary(summary, step)

            #     logger.add(step, value_estimate, np.mean(value_estimates[-self._parameters.smooth_over:]))
            #     logger.dump()

        value_estimate = self.estimate_average_reward(dataset)
        tf.compat.v1.logging.info('Estimated value: %s' % value_estimate)
        value_estimates.append(value_estimate)
        tf.compat.v1.logging.info('Estimated smoothed value: %s' %
                                        np.mean(value_estimates[-self._parameters.smooth_over:]))
        if logger is not None:
            logger.add(step, value_estimate, np.mean(value_estimates[-self._parameters.smooth_over:]))
            logger.dump()

        # Return estimate that is smoothed over last few iterates.
        return np.mean(value_estimates[-self._parameters.smooth_over:])

    def _state_action_density_ratio(self, state, action):
        batched = len(np.shape(state)) > 1
        if not batched:
            state = np.expand_dims(state, 0)
            action = np.expand_dims(action, 0)
        density_ratio = self._session.run(
                self._density_ratio,
                feed_dict={
                        self._state: state,
                        self._action: action
                })
        if not batched:
            return density_ratio[0]
        return density_ratio

    def estimate_average_reward(self, dataset):
        """Estimates value (average per-step reward) of policy.

        The estimation is based on solved values of zeta, so one should call
        solve() before calling this function.

        Returns:
            Estimated average per-step reward of the target policy.
        """
        return estimate_value_from_state_action_ratios(
                dataset, self._parameters.gamma, self._state_action_density_ratio)

    def close(self):
        self._session.close()
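The comments in the code above note that the conjugate of f(x) = |x|^p / p is f*(x) = |x|^q / q with q = p / (p - 1). A small self-contained NumPy check of this against the definition f*(y) = max_x (x*y - f(x)), added as an illustration and not part of the original example:

import numpy as np

p = 1.5                                   # function_exponent
q = p / (p - 1)                           # conjugate exponent, here 3.0
f = lambda x: np.abs(x) ** p / p
fstar = lambda y: np.abs(y) ** q / q

xs = np.linspace(-10.0, 10.0, 100001)     # grid over which to take the max
for y in (-2.0, -0.5, 0.3, 1.7):
    numeric = np.max(xs * y - f(xs))      # f*(y) = max_x (x * y - f(x))
    assert abs(numeric - fstar(y)) < 1e-3, (y, numeric, fstar(y))
print('Fenchel conjugate identity verified on sample points')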
Example #5
class MQL(Basic_Alg):
    def __init__(self,
                 obs_dim,
                 act_dim,
                 *,
                 seed,
                 norm,
                 model_weights,
                 target_temp,
                 med_dist,
                 hidden_layers_p,
                 activation_p,
                 hidden_layers=[32, 32],
                 scope='mql',
                 lr=5e-3,
                 reg_factor=0,
                 gamma=0.999):
        super().__init__(scope)

        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.gamma = gamma
        self.lr = lr
        self.reg_factor = reg_factor
        self.hidden_layers = hidden_layers
        self.seed = seed

        self.med_dist = med_dist
        # self.q_net = q_net
        self.norm = norm

        #XXX debug
        self.debug_q = {}

        self.trainable_vars = []
        self._session = tf.compat.v1.Session()
        self._session.__enter__()
        with self._session.as_default():
            self.q_net = Q_Model_Tf(
                obs_dim, act_dim,
                hidden_layers=hidden_layers_p,
                temperature=target_temp,
                seed=self.seed,
                activation=activation_p)

        self.build_graph()
        self.build_estimation_graph()
        self.create_loss_func()

        self._session.run(
            [tf.compat.v1.variables_initializer(self.trainable_vars)])
        self.q_net.load_weight(model_weights)
        # self._session.run(tf.global_variables_initializer())
        # tf.get_default_session().run(
        #     [tf.variables_initializer(self.trainable_vars)]
        # )

    def build_graph(self):
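        # Two independent mini-batches (the *_2 placeholders below) are used so that
        # the kernel loss can couple Bellman errors from two samples.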
        ''' first sample '''
        self.rew_ph = tf.compat.v1.placeholder(dtype=tf.float32,
                                               shape=[None, 1])
        self.obs_ph = tf.compat.v1.placeholder(dtype=tf.float32,
                                               shape=[None, self.obs_dim])
        self.act_ph = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None, 1])
        self.obs_act = tf.concat(
            [self.obs_ph, tf.cast(self.act_ph, tf.float32)], axis=1)
        self.q = self.create_value_func(self.obs_ph,
                                        self.act_ph,
                                        func_type='q',
                                        reuse=False)

        self.next_obs_ph = tf.compat.v1.placeholder(dtype=tf.float32,
                                                    shape=[None, self.obs_dim])
        self.v_next = self.create_value_func(self.next_obs_ph,
                                             None,
                                             func_type='v',
                                             reuse=True)
        ''' second sample '''
        self.rew_ph_2 = tf.compat.v1.placeholder(dtype=tf.float32,
                                                 shape=[None, 1])
        self.obs_ph_2 = tf.compat.v1.placeholder(dtype=tf.float32,
                                                 shape=[None, self.obs_dim])
        self.act_ph_2 = tf.compat.v1.placeholder(dtype=tf.int32,
                                                 shape=[None, 1])
        self.obs_act_2 = tf.concat(
            [self.obs_ph_2, tf.cast(self.act_ph_2, tf.float32)], axis=1)
        self.q_2 = self.create_value_func(self.obs_ph_2,
                                          self.act_ph_2,
                                          func_type='q',
                                          reuse=True)

        self.next_obs_ph_2 = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, self.obs_dim])
        self.v_next_2 = self.create_value_func(self.next_obs_ph_2,
                                               None,
                                               func_type='v',
                                               reuse=True)

    def create_value_func(self,
                          obs_tf,
                          act_tf,
                          *,
                          func_type,
                          reuse=False,
                          normalize=True):
        if func_type == 'v':
            if self.norm['type'] is not None:
                org_obs = obs_tf * self.norm['scale'] + self.norm['shift']
            else:
                org_obs = obs_tf
            prob_mask = self.q_net.build_prob(org_obs, split=True)
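            # prob_mask holds the target policy's probabilities for the two discrete
            # actions; V(s) below is the probability-weighted average of Q(s, 0) and Q(s, 1).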

            with tf.compat.v1.variable_scope(self.scope, reuse=reuse):
                x = tf.concat(
                    [obs_tf, tf.zeros([tf.shape(input=obs_tf)[0], 1])], axis=1)
                for h in self.hidden_layers:
                    x = tf.compat.v1.layers.dense(x, h, activation=tf.nn.relu)
                q0 = tf.compat.v1.layers.dense(
                    x,
                    1,
                    activation=None,
                    kernel_regularizer=tf.keras.regularizers.l2(0.5 * (1.)),
                    bias_regularizer=tf.keras.regularizers.l2(0.5 * (1.)))

            with tf.compat.v1.variable_scope(self.scope, reuse=True):
                x = tf.concat(
                    [obs_tf, tf.ones([tf.shape(input=obs_tf)[0], 1])], axis=1)
                for h in self.hidden_layers:
                    x = tf.compat.v1.layers.dense(x, h, activation=tf.nn.relu)
                q1 = tf.compat.v1.layers.dense(
                    x,
                    1,
                    activation=None,
                    kernel_regularizer=tf.keras.regularizers.l2(0.5 * (1.)),
                    bias_regularizer=tf.keras.regularizers.l2(0.5 * (1.)))
            value = q0 * prob_mask[0] + q1 * prob_mask[1]
        else:

            with tf.compat.v1.variable_scope(self.scope, reuse=reuse):
                x = tf.concat([obs_tf, tf.cast(act_tf, tf.float32)], axis=1)
                for h in self.hidden_layers:
                    x = tf.compat.v1.layers.dense(x, h, activation=tf.nn.relu)
                q = tf.compat.v1.layers.dense(
                    x,
                    1,
                    activation=None,
                    kernel_regularizer=tf.keras.regularizers.l2(0.5 * (1.)),
                    bias_regularizer=tf.keras.regularizers.l2(0.5 * (1.)))

                if not reuse:
                    self.trainable_vars += tf.compat.v1.get_collection(
                        tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES,
                        scope=self.scope)
                value = q
        return value

    def build_estimation_graph(self):
        self.init_obs_ph = tf.compat.v1.placeholder(dtype=tf.float32,
                                                    shape=[None, self.obs_dim])
        self.value_estimation = tf.reduce_mean(
            input_tensor=self.create_value_func(
                self.init_obs_ph, None, func_type='v', reuse=True))

    def create_loss_func(self):
        error = self.rew_ph + self.gamma * self.v_next - self.q
        error_2 = self.rew_ph_2 + self.gamma * self.v_next_2 - self.q_2

        diff = tf.expand_dims(self.obs_act, 1) - tf.expand_dims(
            self.obs_act_2, 0)
        K = tf.exp(-tf.reduce_sum(input_tensor=tf.square(diff), axis=-1) /
                   2.0 / self.med_dist**2)
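        # K is the pairwise Gaussian (RBF) kernel between the (s, a) pairs of the two
        # mini-batches, with bandwidth med_dist.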

        all_vars = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope)

        self.loss = tf.matmul(tf.matmul(tf.transpose(a=error), K), error_2)
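        # Kernel Bellman loss: couples the TD errors of the two samples through K,
        # i.e. error^T K error_2, normalized by the number of sample pairs below.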

        self.reg_loss = self.reg_factor * tf.reduce_sum(
            input_tensor=tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES, self.scope))
        self.debug_q.update({'reg loss': self.reg_loss})
        self.loss += self.reg_loss
        self.loss = tf.squeeze(self.loss)

        sample_num = tf.cast(
            tf.shape(input=K)[0] * tf.shape(input=K)[1], tf.float32)
        self.loss /= sample_num

        self.opt = tf.compat.v1.train.AdamOptimizer(self.lr)
        self.train_op = self.opt.minimize(self.loss, var_list=all_vars)

        self.trainable_vars += self.opt.variables()

    def train(self, data):
        # debug, loss, _ = tf.get_default_session().run(
        debug, loss, _ = self._session.run(
            [self.debug_q, self.loss, self.train_op],
            feed_dict={
                self.obs_ph: data['obs_1'],
                self.obs_ph_2: data['obs_2'],
                self.next_obs_ph: data['next_obs_1'],
                self.next_obs_ph_2: data['next_obs_2'],
                self.act_ph: data['act_1'],
                self.act_ph_2: data['act_2'],
                self.rew_ph: data['rew_1'],
                self.rew_ph_2: data['rew_2'],
                self.q_net.tau_ph: self.q_net.temperature,
            })

        return debug, loss

    def evaluation(self, init_obs):
        # value = tf.get_default_session().run(
        value = self._session.run(self.value_estimation,
                                  feed_dict={
                                      self.init_obs_ph: init_obs,
                                      self.q_net.tau_ph:
                                      self.q_net.temperature,
                                  })
        return value

    def close(self):
        self._session.close()
Example #6
class MWL(Basic_Alg):
    def __init__(self, obs_dim, act_dim, *, seed,
                       norm, model_weights, target_temp, med_dist,
                       hidden_layers_q, activation_q, hidden_layers_w=[32, 32], scope='mwl',
                       lr=5e-3, reg_factor=0, gamma=0.999):
        super().__init__(scope)

        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.gamma = gamma
        self.lr = lr
        self.reg_factor = reg_factor
        self.hidden_layers_w = hidden_layers_w
        self.hidden_layers_q = hidden_layers_q
        # import pdb; pdb.set_trace()

        self.med_dist = med_dist
        self.seed = seed
        self.norm = norm
        assert self.norm['type'] is not None, 'data should already be processed before calling the algorithm'

        self._session = tf.compat.v1.Session()
        self._session.__enter__()
        with self._session.as_default():
            self.q_net = Q_Model_Tf(
                obs_dim, act_dim,
                hidden_layers=hidden_layers_q,
                temperature=target_temp,
                seed=self.seed,
                activation=activation_q)

        #XXX debug
        self.debug_w = {}

        self.trainable_vars = []

        self.build_graph()
        self.build_estimation_graph()
        self.create_loss_func()
        
        self._session.run([tf.compat.v1.variables_initializer(self.trainable_vars)])
        self.q_net.load_weight(model_weights)
        # tf.get_default_session().run(
        #     [tf.variables_initializer(self.trainable_vars)]
        # )
    
    def build_graph(self):
        self.rew_ph = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 1])
        self.done_ph = tf.compat.v1.placeholder(dtype=tf.bool, shape=[None, 1])
        ''' Initial Part '''
        # ''' first sample '''
        self.init_obs_ph = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, self.obs_dim])
        self.init_act_b = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None, 1])
        self.init_act_e = self.build_action(self.init_obs_ph)
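        # Suffix _b marks actions from the behavior data (fed via placeholders);
        # suffix _e marks actions sampled from the evaluation (target) policy q_net.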

        self.init_obs_act_b = tf.concat([self.init_obs_ph, tf.cast(self.init_act_b, tf.float32)], axis=1)     
        self.init_obs_act_e = tf.concat([self.init_obs_ph, tf.cast(self.init_act_e, tf.float32)], axis=1)           
        
        # ''' second sample '''
        self.init_obs_ph_2 = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, self.obs_dim])
        self.init_act_b_2 = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None, 1])
        self.init_act_e_2 = self.build_action(self.init_obs_ph_2)

        self.init_obs_act_b_2 = tf.concat([self.init_obs_ph_2, tf.cast(self.init_act_b_2, tf.float32)], axis=1)     
        self.init_obs_act_e_2 = tf.concat([self.init_obs_ph_2, tf.cast(self.init_act_e_2, tf.float32)], axis=1)           
        
        ''' Current Part '''
        # first sample
        self.obs_ph = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, self.obs_dim])
        self.act_ph = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None, 1])
        self.obs_act = tf.concat([self.obs_ph, tf.cast(self.act_ph, tf.float32)], axis=1)
        self.factor = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 1])
        
        # second sample
        self.obs_ph_2 = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, self.obs_dim])
        self.act_ph_2 = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None, 1])
        self.obs_act_2 = tf.concat([self.obs_ph_2, tf.cast(self.act_ph_2, tf.float32)], axis=1)
        self.factor_2 = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 1])

        ''' Next Part '''
        # first sample
        self.next_obs_ph = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, self.obs_dim])  
        self.next_act_b = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None, 1])    
        self.next_obs_act_b = tf.concat([self.next_obs_ph, \
                tf.cast(self.next_act_b, tf.float32)], axis=1)
        self.next_act_e = [
            tf.zeros([tf.shape(input=self.next_obs_ph)[0], 1], dtype=tf.int32),
            tf.ones([tf.shape(input=self.next_obs_ph)[0], 1], dtype=tf.int32),
        ]
        self.next_obs_act_e = [
            tf.concat([self.next_obs_ph, tf.cast(self.next_act_e[0], tf.float32)], axis=1),
            tf.concat([self.next_obs_ph, tf.cast(self.next_act_e[1], tf.float32)], axis=1),
        ]
        self.prob_next = self.build_prob(self.next_obs_ph)

        # second part
        self.next_obs_ph_2 = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, self.obs_dim])
        self.next_act_b_2 = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None, 1])    
        self.next_obs_act_b_2 = tf.concat([self.next_obs_ph_2, \
                tf.cast(self.next_act_b_2, tf.float32)], axis=1)
        self.next_act_e_2 = [
            tf.zeros([tf.shape(input=self.next_obs_ph_2)[0], 1], dtype=tf.int32),
            tf.ones([tf.shape(input=self.next_obs_ph_2)[0], 1], dtype=tf.int32),
        ]
        self.next_obs_act_e_2 = [
            tf.concat([self.next_obs_ph_2, tf.cast(self.next_act_e_2[0], tf.float32)], axis=1),
            tf.concat([self.next_obs_ph_2, tf.cast(self.next_act_e_2[1], tf.float32)], axis=1),
        ]
        self.prob_next_2 = self.build_prob(self.next_obs_ph_2)


        ''' Density Ratio '''
        # first part   
        self.w = self.create_density_ratio(self.obs_ph, self.act_ph, factor=self.factor, reuse=False)
        self.w_next = self.create_density_ratio(self.next_obs_ph, self.next_act_b, factor=self.factor, reuse=True)
        
        # second part     
        self.w_2 = self.create_density_ratio(self.obs_ph_2, self.act_ph_2, factor=self.factor_2, reuse=True)
        self.w_next_2 = self.create_density_ratio(self.next_obs_ph_2, self.next_act_b_2, factor=self.factor_2, reuse=True)

        self.w_init = self.create_density_ratio(self.init_obs_ph, self.init_act_b, reuse=True, normalize=True)
        self.w_init_2 = self.create_density_ratio(self.init_obs_ph_2, self.init_act_b_2, reuse=True, normalize=True)

    def build_action(self, obs_ph):
        # recover the original obs, in order to get the correct action prob
        if self.norm['type'] is not None:
            org_obs = obs_ph * self.norm['scale'] + self.norm['shift']
        else:
            org_obs = obs_ph
        act = self.q_net.build_random_policy(org_obs, reuse=True)
        return tf.stop_gradient(act)

    def build_prob(self, obs_ph):
        # recover the original obs, in order to get the correct action prob
        if self.norm['type'] is not None:
            org_obs = obs_ph * self.norm['scale'] + self.norm['shift']
        else:
            org_obs = obs_ph

        return self.q_net.build_prob(org_obs, reuse=True)

    def create_density_ratio(self, obs_tf, act_tf, *, factor=None, reuse=False, normalize=True):
        with tf.compat.v1.variable_scope(self.scope, reuse=reuse):
            x = tf.concat([obs_tf, tf.cast(act_tf, tf.float32)], axis=1)
            for h in self.hidden_layers_w:
                x = tf.compat.v1.layers.dense(x, h, activation=tf.nn.relu)

            w = tf.compat.v1.layers.dense(x, 1, activation=None, kernel_regularizer=tf.keras.regularizers.l2(l=0.5 * (1.0)))

            if not reuse:
                self.trainable_vars += tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope)

            w = tf.math.log(1 + tf.exp(w))
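            # log(1 + exp(w)) is the softplus transform, keeping the density ratio
            # non-negative; the optional normalization below rescales its batch mean to 1.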

            if factor is not None:
                w = w * factor

            if normalize:
                w = w / tf.reduce_mean(input_tensor=w)
                
            return w

    def build_estimation_graph(self):
        rew = self.rew_ph
        w = self.create_density_ratio(self.obs_ph, self.act_ph, factor=self.factor, reuse=True, normalize=True)

        assert self.gamma < 1.0
        self.value_estimation = tf.reduce_mean(input_tensor=w * rew) / (1 - self.gamma)
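        # The estimate averages w(s, a) * r over the data and rescales by 1 / (1 - gamma),
        # the usual weighted estimator for the normalized discounted return.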
        
    '''
        Create the loss function, dropping terms that do not depend on w(s, a).
    '''
    def create_loss_func(self):
        coeff = [self.gamma ** 2] * 4 + \
                [self.gamma ** 2] * 1 + \
                [(1-self.gamma) ** 2] * 1 + \
                [(1-self.gamma) ** 2] * 1 + \
                [-self.gamma ** 2] * 4 + \
                [-self.gamma * (1 - self.gamma)] * 4 + \
                [self.gamma * (1 - self.gamma)] * 4 + \
                [self.gamma * (1 - self.gamma)] * 2 + \
                [-self.gamma * (1 - self.gamma)] * 2 + \
                [-(1 - self.gamma)**2] * 2
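        # Each coefficient above lines up, by index, with the corresponding entries in
        # Kernel, prob_mask and weights below (25 kernel terms in total).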

        Kernel = [
            # Term 1
            (self.next_obs_act_e[0], self.next_obs_act_e_2[0]),
            (self.next_obs_act_e[0], self.next_obs_act_e_2[1]),
            (self.next_obs_act_e[1], self.next_obs_act_e_2[0]),
            (self.next_obs_act_e[1], self.next_obs_act_e_2[1]),
            # Term 2
            (self.next_obs_act_b, self.next_obs_act_b_2),
            # Term 3
            (self.init_obs_act_b, self.init_obs_act_b_2),
            # Term 4
            (self.init_obs_act_e, self.init_obs_act_e_2),
            # Term 5
            (self.next_obs_act_e[0], self.next_obs_act_b_2),
            (self.next_obs_act_e[1], self.next_obs_act_b_2),
            (self.next_obs_act_b, self.next_obs_act_e_2[0]),
            (self.next_obs_act_b, self.next_obs_act_e_2[1]),
            # Term 6
            (self.next_obs_act_e[0], self.init_obs_act_b_2),
            (self.next_obs_act_e[1], self.init_obs_act_b_2),
            (self.init_obs_act_b, self.next_obs_act_e_2[0]),
            (self.init_obs_act_b, self.next_obs_act_e_2[1]),
            # Term 7
            (self.next_obs_act_e[0], self.init_obs_act_e_2),
            (self.next_obs_act_e[1], self.init_obs_act_e_2),
            (self.init_obs_act_e, self.next_obs_act_e_2[0]),
            (self.init_obs_act_e, self.next_obs_act_e_2[1]),
            # Term 8
            (self.next_obs_act_b, self.init_obs_act_b_2),
            (self.init_obs_act_b, self.next_obs_act_b_2),
            # Term 9
            (self.next_obs_act_b, self.init_obs_act_e_2),
            (self.init_obs_act_e, self.next_obs_act_b_2),
            # Term 10
            (self.init_obs_act_b, self.init_obs_act_e_2),
            (self.init_obs_act_e, self.init_obs_act_b_2),
        ]

        prob_mask = [
                # Term 1
                self.prob_next[0] * tf.reshape(self.prob_next_2[0], [1, -1]),
                self.prob_next[0] * tf.reshape(self.prob_next_2[1], [1, -1]),
                self.prob_next[1] * tf.reshape(self.prob_next_2[0], [1, -1]),
                self.prob_next[1] * tf.reshape(self.prob_next_2[1], [1, -1]),
            ] + \
            [ 
                # Term 2, 3, 4
                None
            ] * 3 + \
            [ 
                # Term 5
                self.prob_next[0] * tf.ones([1, tf.shape(input=self.prob_next_2[0])[0]]),
                self.prob_next[1] * tf.ones([1, tf.shape(input=self.prob_next_2[0])[0]]),
                tf.ones(tf.shape(input=self.prob_next[0])) * tf.reshape(self.prob_next_2[0], [1, -1]),
                tf.ones(tf.shape(input=self.prob_next[0])) * tf.reshape(self.prob_next_2[1], [1, -1]),
            ] + \
            [ 
                # Terms 6, 7
                self.prob_next[0] * tf.ones([1, tf.shape(input=self.w_init_2)[0]]),
                self.prob_next[1] * tf.ones([1, tf.shape(input=self.w_init_2)[0]]),
                tf.ones(tf.shape(input=self.w_init)) * tf.reshape(self.prob_next_2[0], [1, -1]),
                tf.ones(tf.shape(input=self.w_init)) * tf.reshape(self.prob_next_2[1], [1, -1]),
            ] * 2 + \
            [
                # Term 8, 9, 10
                None
            ] * 6

        w_ones = tf.ones(tf.shape(input=self.w_init))
        w_2_ones = tf.ones(tf.shape(input=self.w_init_2))
        weights = [
            # Term 1
            (self.w, self.w_2),
            (self.w, self.w_2),
            (self.w, self.w_2),
            (self.w, self.w_2),
            # Term 2
            (self.w_next, self.w_next_2),
            # Term 3
            (self.w_init, self.w_init_2),
            # Term 4
            None,
            # Term 5
            (self.w, self.w_next_2),
            (self.w, self.w_next_2),
            (self.w_next, self.w_2),
            (self.w_next, self.w_2),
            # Term 6
            (self.w, self.w_init_2),
            (self.w, self.w_init_2),
            (self.w_init, self.w_2),
            (self.w_init, self.w_2),
            # Term 7
            (self.w, w_2_ones),
            (self.w, w_2_ones),
            (w_ones, self.w_2),
            (w_ones, self.w_2),
            # Term 8
            (self.w_next, self.w_init_2),
            (self.w_init, self.w_next_2),
            # Term 9
            (self.w_next, w_2_ones),
            (w_ones, self.w_next_2),
            # Term 10
            (self.w_init, w_2_ones),
            (w_ones, self.w_init_2),
        ]

        self.reg_loss = self.reg_factor * tf.reduce_sum(input_tensor=tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES, self.scope))
        self.debug_w.update({'reg loss': self.reg_loss})
        self.loss = self.reg_loss

        K = 0
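        # Accumulate the loss term by term: each contribution is
        # coeff * w1^T K w2 averaged over sample pairs; entries whose weights are
        # None do not depend on w(s, a) and are dropped.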
        for index in range(len(Kernel)):
            if weights[index] is None:
                continue

            k1, k2 = Kernel[index]
            c = coeff[index]
            w1, w2 = weights[index]

            diff = tf.expand_dims(k1, 1) - tf.expand_dims(k2, 0)
            K = tf.exp(-tf.reduce_sum(input_tensor=tf.square(diff), axis=-1)/2.0/self.med_dist[index] ** 2)
            if prob_mask[index] is not None:
                K = K * prob_mask[index]
            sample_num = tf.cast(tf.shape(input=K)[0] * tf.shape(input=K)[1], tf.float32)
            loss = c * tf.matmul(tf.matmul(tf.transpose(a=w1), K), w2) / sample_num

            self.loss += tf.squeeze(loss)

        self.opt = tf.compat.v1.train.AdamOptimizer(self.lr)
        self.train_op = self.opt.minimize(self.loss)

        self.trainable_vars += self.opt.variables()


    def train(self, data):
        # debug, loss, _ = tf.get_default_session().run(
        debug, loss, _ = self._session.run(            
            [self.debug_w, self.loss, self.train_op],
            feed_dict={
                self.obs_ph: data['obs_1'],  
                self.act_ph: data['acts_1'],  
                self.next_obs_ph: data['next_obs_1'],
                self.next_act_b: data['next_acts_1'],
                self.init_obs_ph: data['init_obs_1'],
                self.init_act_b: data['init_acts_1'],
                self.factor: data['factor_1'],
                self.obs_ph_2: data['obs_2'],
                self.act_ph_2: data['acts_2'],
                self.next_obs_ph_2: data['next_obs_2'],
                self.next_act_b_2: data['next_acts_2'],
                self.init_obs_ph_2: data['init_obs_2'],
                self.init_act_b_2: data['init_acts_2'],
                self.factor_2: data['factor_2'],
                self.q_net.tau_ph: self.q_net.temperature,
            }
        )
        return debug, loss

    def evaluation(self, obs, acts, factor, rew):
        # value = tf.get_default_session().run(
        value = self._session.run(
            self.value_estimation,
            feed_dict={
                self.obs_ph: obs,
                self.act_ph: acts,
                self.rew_ph: rew,
                self.factor: factor,
                self.q_net.tau_ph: self.q_net.temperature,
            }
        )
        return value

    def get_w(self, obs, acts, factor, rew):
        # w = tf.get_default_session().run(
        w = self._session.run(
            self.w,
            feed_dict={
                self.obs_ph: obs,
                self.act_ph: acts,
                self.rew_ph: rew,
                self.factor: factor,
                self.q_net.tau_ph: self.q_net.temperature,
            }
        )
        return w

    def close(self):
        self._session.close()