Example #1
    def unroll2(self, seed_states):
        assert seed_states.shape.as_list() == [None, self.state_dim]
        no_samples = self.no_samples
        unroll_steps = self.unroll_steps
        #self.reward_model = real_env_pendulum_reward()#Use true model.
        self.reward_model = ANN(self.state_dim + self.action_dim, 1)
        self.placeholders_reward = [
            tf.placeholder(shape=v.shape, dtype=tf.float64)
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.reward_model.scope)
        ]
        self.assign_ops = [
            v.assign(pl) for v, pl in zip(
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  self.reward_model.scope),
                self.placeholders_reward)
        ]

        # Replicate each seed state no_samples times so that several
        # trajectories are unrolled per seed state.
        states = tf.expand_dims(seed_states, axis=1)
        states = tf.tile(states, [1, no_samples, 1])
        states = tf.reshape(states, shape=[-1, self.state_dim])

        costs = []
        self.next_states = []
        for unroll_step in range(unroll_steps):
            actions = self.build_policy(states)

            rewards = ((self.discount_factor ** unroll_step) *
                       self.reward_model.build(states, actions))
            rewards = tf.reshape(tf.squeeze(rewards, axis=-1),
                                 shape=[-1, no_samples])
            costs.append(-rewards)

            states_actions = tf.concat([states, actions], axis=-1)

            next_states = self.get_next_states2(states_actions)
            self.next_states.append(next_states)
            states = next_states

        costs = tf.stack(costs, axis=-1)
        self.loss = tf.reduce_mean(
            tf.reduce_sum(tf.reduce_mean(costs, axis=1), axis=-1))
        self.opt = tf.train.AdamOptimizer().minimize(
            self.loss,
            var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       'policy_scope'))
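
A minimal usage sketch of the unroll2 graph above (not part of the original listing); `agent`, `reward_weights` (a list of NumPy arrays matching the reward ANN's trainable variables), and `seed_state_batch` are hypothetical names:

import tensorflow as tf

# Build the unrolled graph from a batch of seed states.
seed_states = tf.placeholder(shape=[None, agent.state_dim], dtype=tf.float64)
agent.unroll2(seed_states)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Copy pretrained reward-model weights into the graph.
    sess.run(agent.assign_ops,
             feed_dict=dict(zip(agent.placeholders_reward, reward_weights)))
    # One optimization step of the policy on the model-based objective.
    loss, _ = sess.run([agent.loss, agent.opt],
                       feed_dict={seed_states: seed_state_batch})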
Example #2
    def __init__(self,
                 environment,
                 x_dim,
                 y_dim,
                 state_dim,
                 action_dim,
                 observation_space_low,
                 observation_space_high,
                 action_space_low,
                 action_space_high,
                 unroll_steps,
                 no_samples,
                 discount_factor,
                 random_matrices,
                 biases,
                 basis_dims,
                 hidden_dim=32,
                 learn_reward=0,
                 use_mean_reward=0,
                 update_hyperstate=1,
                 policy_use_hyperstate=1,
                 learn_diff=0):
        #assert environment in ['Pendulum-v0', 'MountainCarContinuous-v0']
        assert x_dim == state_dim + action_dim
        assert len(action_space_low.shape) == 1
        np.testing.assert_equal(-action_space_low, action_space_high)
        self.environment = environment
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.observation_space_low = observation_space_low
        self.observation_space_high = observation_space_high
        self.action_space_low = action_space_low
        self.action_space_high = action_space_high

        self.unroll_steps = unroll_steps
        self.no_samples = no_samples
        self.discount_factor = discount_factor
        self.random_matrices = random_matrices
        self.biases = biases
        self.basis_dims = basis_dims
        self.hidden_dim = hidden_dim
        self.learn_reward = learn_reward
        self.use_mean_reward = use_mean_reward
        self.update_hyperstate = update_hyperstate
        self.policy_use_hyperstate = policy_use_hyperstate
        self.learn_diff = learn_diff

        if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
            #self.reward_function = real_env_pendulum_reward()
            self.reward_function = ANN(self.state_dim + self.action_dim, 1)
            self.placeholders_reward = [
                tf.placeholder(shape=v.shape, dtype=tf.float64)
                for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           self.reward_function.scope)
            ]
            self.assign_ops0 = [
                v.assign(pl) for v, pl in zip(
                    tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      self.reward_function.scope),
                    self.placeholders_reward)
            ]
        elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
            self.reward_function = mountain_car_continuous_reward_function()

        #self.hyperstate_dim = sum([(basis_dim*(basis_dim+1))/2 + basis_dim for basis_dim in self.basis_dims])
        self.hyperstate_dim = sum(
            [basis_dim * (basis_dim + 1) for basis_dim in self.basis_dims])

        self.random_projection_matrix = np.random.normal(
            loc=0.,
            scale=1. / np.sqrt(self.state_dim),
            size=[self.hyperstate_dim, self.state_dim])

        input_dim = self.state_dim
        if self.policy_use_hyperstate == 1:
            input_dim *= 2

        self.w1 = np.concatenate([
            np.random.normal(size=[input_dim, self.hidden_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
        ], axis=0)
        self.w2 = np.concatenate([
            np.random.normal(size=[self.hidden_dim, self.hidden_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
        ], axis=0)
        self.w3 = np.concatenate([
            np.random.normal(size=[self.hidden_dim, self.action_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.action_dim])
        ], axis=0)

        self.thetas = self._pack([self.w1, self.w2, self.w3])

        self.sizes = [[input_dim + 1, self.hidden_dim],
                      [self.hidden_dim + 1, self.hidden_dim],
                      [self.hidden_dim + 1, self.action_dim]]

        w1, w2, w3 = self._unpack(self.thetas, self.sizes)
        np.testing.assert_equal(w1, self.w1)
        np.testing.assert_equal(w2, self.w2)
        np.testing.assert_equal(w3, self.w3)
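
The constructor relies on _pack and _unpack helpers that are not shown in this listing. One plausible implementation, consistent with the round-trip check at the end (an assumption, not the original code; NumPy imported as np):

    def _pack(self, matrices):
        # Flatten each weight matrix and concatenate into one parameter vector.
        return np.concatenate([m.flatten() for m in matrices])

    def _unpack(self, thetas, sizes):
        # Slice the flat vector back into matrices of the recorded [rows, cols] shapes.
        matrices, offset = [], 0
        for rows, cols in sizes:
            matrices.append(thetas[offset:offset + rows * cols].reshape(rows, cols))
            offset += rows * cols
        return matrices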
Example #3
    def unroll(self, seed_states):
        assert seed_states.shape.as_list() == [None, self.state_dim]
        no_samples = self.no_samples
        unroll_steps = self.unroll_steps
        #self.reward_model = real_env_pendulum_reward()#Use true model.
        self.reward_model = ANN(self.state_dim + self.action_dim, 1)
        self.placeholders_reward = [
            tf.placeholder(shape=v.shape, dtype=tf.float64)
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.reward_model.scope)
        ]
        self.assign_ops = [
            v.assign(pl) for v, pl in zip(
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  self.reward_model.scope),
                self.placeholders_reward)
        ]

        states = tf.expand_dims(seed_states, axis=1)
        states = tf.tile(states, [1, no_samples, 1])
        states = tf.reshape(states, shape=[-1, self.state_dim])

        self.mus0 = []
        self.sigmas0 = []
        self.mus1 = []
        self.sigmas1 = []
        self.mus2 = []
        self.sigmas2 = []

        costs = []
        self.next_states = []
        #ns = []
        #bs = []
        for unroll_step in range(unroll_steps):
            print('unrolling:', unroll_step)
            if self.debugging_plot == True:
                actions = self.build_policy2(states)
            else:
                actions = self.build_policy(states)

            # Reward
            rewards = ((self.discount_factor ** unroll_step) *
                       self.reward_model.build(states, actions))
            rewards = tf.reshape(tf.squeeze(rewards, axis=-1),
                                 shape=[-1, no_samples])
            costs.append(-rewards)

            states_actions = tf.concat([states, actions], axis=-1)
            mus, sigmas = zip(*[
                self.mu_sigma(self.cum_xx[y], self.cum_xy[y], self.models[y].s,
                              self.models[y].noise_sd)
                for y in range(self.y_dim)
            ])

            bases = [
                model.approx_rbf_kern_basis(states_actions)
                for model in self.models
            ]
            #bs.append(bases)
            mu_pred, sigma_pred = [
                tf.concat(e, axis=-1) for e in zip(*[
                    self.prediction(mu, sigma, basis, model.noise_sd)
                    for mu, sigma, basis, model in zip(mus, sigmas, bases,
                                                       self.models)
                ])
            ]

            self.mus0.append(mu_pred)
            self.sigmas0.append(sigma_pred)
            self.get_next_states(states_actions)
            self.get_next_states2(states_actions)

            next_states = tfd.MultivariateNormalDiag(
                loc=mu_pred, scale_diag=tf.sqrt(sigma_pred)).sample()
            #ns.append(tf.split(next_states, self.y_dim, axis=-1))

            self.next_states.append(
                tf.reshape(next_states, shape=[-1, no_samples,
                                               self.state_dim]))

            for y in range(self.y_dim):
                self.update_posterior(bases[y], next_states[..., y:y + 1], y)

            states = next_states

        if self.debugging_plot == False:
            print('here1')
            costs = tf.stack(costs, axis=-1)
            print('here2')
            self.loss = tf.reduce_mean(
                tf.reduce_sum(tf.reduce_mean(costs, axis=1), axis=-1))
            print('here3')
            self.opt = tf.train.AdamOptimizer().minimize(
                self.loss,
                var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           'policy_scope'))
            print('here4')
        self.string = 'unroll'
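
For clarity, the reduction that unroll (and unroll2 above) applies to the stacked costs can be restated in NumPy; the shapes below are illustrative only:

import numpy as np

# Stand-in with the same layout as the stacked TF tensor:
# [batch_size, no_samples, unroll_steps], each entry holding -gamma**t * r_t.
costs = np.random.randn(4, 3, 5)

# Average over the sampled trajectories, sum the discounted costs over the
# unroll horizon, then average over the seed states in the batch.
loss = np.mean(np.sum(np.mean(costs, axis=1), axis=-1))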
Example #4
    def __init__(self, state_dim, action_dim, action_bound_high,
                 action_bound_low, unroll_length, discount_factor,
                 gradient_descent_steps, scope):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound_high = action_bound_high
        self.action_bound_low = action_bound_low
        self.unroll_length = unroll_length
        self.discount_factor = discount_factor
        self.gradient_descent_steps = gradient_descent_steps
        self.scope = scope

        #Make sure bounds are same (assumption can be relaxed later)
        np.testing.assert_array_equal(-self.action_bound_low,
                                      self.action_bound_high)

        #Flags
        self.policy_reuse_vars = None
        '''
        self.reward_model = ANN(self.state_dim+self.action_dim, 1)
        self.placeholders_reward = [tf.placeholder(shape=v.shape, dtype=tf.float64)
                                    for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.reward_model.scope)]
        self.assign_ops0 = [v.assign(pl) for v, pl in zip(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.reward_model.scope),
                            self.placeholders_reward)]
        '''
        #self.reward_model = real_env_pendulum_reward()
        self.reward_model = mountain_car_continuous_reward_function()

        #self.state_model = real_env_pendulum_state()
        #self.state_model = mountain_car_continuous_state_function()
        self.state_model = ANN(self.state_dim + self.action_dim,
                               self.state_dim)
        self.placeholders_state = [
            tf.placeholder(shape=v.shape, dtype=tf.float64)
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.state_model.scope)
        ]
        self.assign_ops1 = [
            v.assign(pl) for v, pl in zip(
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  self.state_model.scope),
                self.placeholders_state)
        ]

        #Build computational graph (i.e., unroll policy)
        #self.states = tf.placeholder(shape=[None, self.state_dim], dtype=tf.float32)
        self.states = tf.placeholder(shape=[None, self.state_dim],
                                     dtype=tf.float64)

        self.action = self.build_policy(self.states)
        state = self.states
        action = self.build_policy(state)
        rewards = []
        for i in range(self.unroll_length):
            print(i)
            #reward = pow(self.discount_factor, i) * self.reward_model.build(state, action)
            #reward = pow(self.discount_factor, i) * self.reward_model.step_tf(state, action)
            reward = (pow(self.discount_factor, i) *
                      self.reward_model.sigmoid_approx(state, action))
            rewards.append(reward)
            state = self.state_model.build(state, action)
            #state = self.state_model.step_tf(state, action)
            action = self.build_policy(state)

        rewards = tf.reduce_sum(tf.stack(rewards, axis=-1), axis=-1)
        print('here0')
        self.loss = -tf.reduce_mean(tf.reduce_sum(rewards, axis=-1))
        print('here1')
        self.opt = tf.train.AdamOptimizer().minimize(
            self.loss,
            var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.scope))
        print('here2')
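
A hypothetical outer training loop for the class above (assumed names: `policy`, `sess`, `state_model_weights`, `start_states`): the learned dynamics-model weights are pushed into the unrolled graph through assign_ops1, then the configured number of gradient steps is taken on the policy:

# Load the trained dynamics-model weights into the unrolled graph.
sess.run(policy.assign_ops1,
         feed_dict=dict(zip(policy.placeholders_state, state_model_weights)))

# Optimize the policy parameters against the model-based return.
for _ in range(policy.gradient_descent_steps):
    loss, _ = sess.run([policy.loss, policy.opt],
                       feed_dict={policy.states: start_states})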