Example #1
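    # Monte Carlo policy unroll: each seed state is replicated `no_samples`
    # times, the policy is rolled forward for `unroll_steps` steps through
    # `get_next_states`, and the discounted rewards from the true pendulum
    # reward model are negated and accumulated into the loss minimized over
    # the policy variables.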
    def unroll(self, seed_states):
        assert seed_states.shape.as_list() == [None, self.state_dim]
        no_samples = self.no_samples
        unroll_steps = self.unroll_steps
        self.reward_model = real_env_pendulum_reward()  # Use true model.

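        # Replicate each seed state `no_samples` times:
        # [batch, state_dim] -> [batch * no_samples, state_dim].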
        states = tf.expand_dims(seed_states, axis=1)
        states = tf.tile(states, [1, no_samples, 1])
        states = tf.reshape(states, shape=[-1, self.state_dim])

        costs = []
        for unroll_step in range(unroll_steps):
            actions = self.build_policy(states)

            rewards = (self.discount_factor ** unroll_step) * self.reward_model.build(states, actions)
            rewards = tf.reshape(tf.squeeze(rewards, axis=-1), shape=[-1, no_samples])
            costs.append(-rewards)

            states_actions = tf.concat([states, actions], axis=-1)

            next_states = self.get_next_states(states_actions)
            states = next_states

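        # Average the cost over samples, sum over time steps, then average
        # over the batch.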
        costs = tf.stack(costs, axis=-1)
        self.loss = tf.reduce_mean(tf.reduce_sum(tf.reduce_mean(costs, axis=1), axis=-1))
        self.opt = tf.train.AdamOptimizer().minimize(
            self.loss,
            var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       'policy_scope'))
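
    # Builds a Monte Carlo loss from per-step `trajectories`, assuming each
    # entry stores a per-dimension mean in channel 0 and variance in channel
    # 1: `no_samples` Gaussian samples are drawn per state, scored with the
    # true pendulum reward model, and their discounted negative rewards are
    # summed over time.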
    def build_loss(self, trajectories):
        no_samples = 2
        self.reward_model = real_env_pendulum_reward()  # Use true model.

        costs = []
        for i in range(len(trajectories)):
            samples_standard_normal = tf.random_normal(
                shape=([self.batch_size] +
                       trajectories[i].shape.as_list()[1:-1] + [no_samples]),
                dtype=tf.float64)
            #samples_standard_normal = tf.random_normal(shape=tf.shape(tf.placeholder(shape=(trajectories[i].shape.as_list()[:-1] + [no_samples]), dtype=tf.float64)), dtype=tf.float64)

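            # Reparameterization: sample = mean + sqrt(variance) * eps, with
            # eps drawn from a standard normal (assumes channel 0 = mean,
            # channel 1 = variance).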
            samples = trajectories[i][..., 0:1] + tf.sqrt(
                trajectories[i][..., 1:2]) * samples_standard_normal
            samples_transposed = tf.transpose(samples, perm=[0, 2, 1])
            samples_transposed_reshaped = tf.reshape(
                samples_transposed, shape=[-1, self.state_dim])

            rewards = (self.discount_factor**i) * self.reward_model.build(
                samples_transposed_reshaped,
                self.build_policy(samples_transposed_reshaped))
            rewards_reshaped = tf.reshape(rewards, shape=[-1, no_samples, 1])
            costs.append(-tf.reduce_mean(tf.squeeze(rewards_reshaped, axis=-1),
                                         axis=-1))

        loss = tf.reduce_mean(tf.reduce_sum(tf.stack(costs, axis=-1), axis=-1))
        return loss
Example #3
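# Fits a small neural network (ANN) to the analytic pendulum reward on
# randomly sampled (state, action) pairs, then pickles the trained weights.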
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default='MountainCarContinuous-v0')
    parser.add_argument("--data-size", type=int, default=10000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--iterations", type=int, default=5000)
    #parser.add_argument("--goal-position", type=float, default=-.4)
    args = parser.parse_args()

    print(args)

    env = gym.make(args.env)
    ann = ANN(env.observation_space.shape[0] + env.action_space.shape[0],
              1,
              train_weights=True)

    reward_function = real_env_pendulum_reward()
    #state_function = real_env_pendulum_state()

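    # Sample (theta, theta_dot) uniformly and convert to the
    # (cos(theta), sin(theta), theta_dot) observation used by Pendulum-v0.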
    high = np.array([np.pi, 1.])
    states = np.random.uniform(low=-high,
                               high=high,
                               size=[args.data_size, len(high)])
    states = np.stack(
        [np.cos(states[:, 0]),
         np.sin(states[:, 0]), states[:, 1]], axis=-1)
    actions = np.random.uniform(
        env.action_space.low,
        env.action_space.high,
        size=[args.data_size, env.action_space.shape[0]])

    rewards = reward_function.step_np(states, actions)

    #saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

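        # Mini-batch regression of the ANN onto the analytic rewards.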
        for it in range(args.iterations):
            for i in range(0, args.data_size, args.batch_size):
                inputs = np.concatenate([
                    states[i:i + args.batch_size, ...],
                    actions[i:i + args.batch_size, ...]
                ],
                                        axis=-1)
                targets = rewards[i:i + args.batch_size, ...]
                loss, _ = sess.run([ann.loss, ann.opt],
                                   feed_dict={
                                       ann.inputs: inputs,
                                       ann.targets: targets
                                   })
                if it % 1000 == 0:
                    print('iterations:', it, 'i:', i, 'loss:', loss)

        #print sess.run(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
        #saver.save(sess, './weights/pendulum_reward.ckpt')
        pickle.dump(
            sess.run(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)),
            open('./weights/pendulum_reward.p', 'wb'))
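
    # Stores the environment configuration and initializes random policy
    # weights (input-to-hidden, hidden-to-hidden with a bias row, and
    # hidden-to-output with a bias row), flattened into a single parameter
    # vector `thetas`.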
    def __init__(self, env, state_dim, action_dim, action_space_high,
                 action_space_low, batch_size, unroll_steps, discount_factor):

        #self.X = np.linspace(-2., 2., self.batch_size)
        #self.y = np.sin(self.X) + 5e-5 * np.random.randn(self.batch_size)

        #self.Xin = np.concatenate([self.X[..., np.newaxis], np.ones([self.batch_size, 1])], axis=-1)
        assert len(action_space_low.shape) == 1
        np.testing.assert_equal(-action_space_low, action_space_high)

        self.action_space_high = action_space_high
        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden_dim = 32

        self.h1 = np.random.normal(size=[self.state_dim, self.hidden_dim])
        self.h2 = np.concatenate([
            np.random.normal(size=[self.hidden_dim, self.hidden_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
        ],
                                 axis=0)
        self.o = np.concatenate([
            np.random.normal(size=[self.hidden_dim, self.action_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.action_dim])
        ],
                                axis=0)

        self.thetas = np.concatenate(
            [self.h1.flatten(),
             self.h2.flatten(),
             self.o.flatten()])

        self.uuid = str(uuid.uuid4())
        self.batch_size = batch_size
        self.unroll_steps = unroll_steps
        self.discount_factor = discount_factor
        if self.env == 'MountainCarContinuous-v0':
            self.reward_function = mountain_car_continuous_reward_function(
                goal_position=.45)
            self.state_function = mountain_car_continuous_state_function()
        elif self.env == 'Pendulum-v0':
            self.reward_function = real_env_pendulum_reward()
            self.state_function = real_env_pendulum_state()
        self.it = 0
Example #5
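    # Deterministic policy unroll: the policy is rolled forward through the
    # true pendulum state and reward models for `unroll_length` steps, and
    # the negative discounted return is minimized with Adam.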
    def __init__(self, state_dim, action_dim, action_bound_high, \
                 action_bound_low, unroll_length, discount_factor, \
                 gradient_descent_steps, scope):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound_high = action_bound_high
        self.action_bound_low = action_bound_low
        self.unroll_length = unroll_length
        self.discount_factor = discount_factor
        self.gradient_descent_steps = gradient_descent_steps
        self.scope = scope

        #Make sure bounds are same (assumption can be relaxed later)
        np.testing.assert_array_equal(-self.action_bound_low,
                                      self.action_bound_high)

        #Flags
        self.policy_reuse_vars = None

        #Build computational graph (i.e., unroll policy)
        self.states = tf.placeholder(shape=[None, self.state_dim],
                                     dtype=tf.float32)
        self.policy = self.build_policy(self.states)
        self.state_model = real_env_pendulum_state()
        self.reward_model = real_env_pendulum_reward()

        self.action = self.build_policy(self.states)
        state = self.states
        action = self.build_policy(state)
        rewards = []
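        # Roll the policy forward through the true dynamics, accumulating
        # discounted rewards at each step.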
        for i in range(self.unroll_length):
            reward = pow(self.discount_factor, i) * self.reward_model.build(
                state, action)
            rewards.append(reward)
            state = self.state_model.build(state, action)
            action = self.build_policy(state)

        rewards = tf.reduce_sum(tf.stack(rewards, axis=-1), axis=-1)
        self.loss = -tf.reduce_mean(tf.reduce_sum(rewards, axis=-1))
        self.opt = tf.train.AdamOptimizer().minimize(self.loss)
Example #6
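    # Particle-based policy optimization: K particles are propagated for T
    # steps through a Bayesian dynamics model (one weight sample W_k per
    # particle), moment-matched to a multivariate Gaussian after every step,
    # and scored with the true pendulum reward model; the policy is trained
    # to maximize the discounted sum of these rewards.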
    def __init__(self,
                 environment,
                 state_size,
                 action_size,
                 hidden_size,
                 it_tloop,
                 it_dyn,
                 bs_dyn,
                 it_policy,
                 bs_policy,
                 K,
                 T,
                 action_bound_high,
                 action_bound_low,
                 discount_factor,
                 moment_matching=True,
                 scope='pai'):
        self.environment = environment
        self.state_size = state_size
        self.action_size = action_size
        self.hidden_size = hidden_size

        self.it_tloop = it_tloop
        self.it_dyn = it_dyn
        self.bs_dyn = bs_dyn
        self.it_policy = it_policy
        self.bs_policy = bs_policy

        self.K = K  #Number of particles
        assert self.bs_policy == self.K  #Does this have to be true?
        self.T = T  #Time horizon

        self.action_bound_high = action_bound_high
        self.action_bound_low = action_bound_low
        self.discount_factor = discount_factor

        self.moment_matching = moment_matching
        self.scope = scope

        self.policy_reuse_vars = None

        # Assertion
        np.testing.assert_array_equal(-self.action_bound_low,
                                      self.action_bound_high)

        # Initialize the Bayesian neural network.
        self.bnn = bayesian_dynamics_model(self.state_size + self.action_size,
                                           self.state_size)
        self.bnn.initialize_inference(n_iter=self.it_tloop * self.it_dyn * 300,
                                      n_samples=10)

        # Declare variables and assignment operators for each W_k.
        self.assign_op = []
        for k in range(K):
            self.declare_vars_and_assign_op(scope='W_' + str(k) + '_')

        # True reward model
        self.reward_model = real_env_pendulum_reward()
        rewards = []

        # Predict x_t for t = 1,...,T.
        self.particles = tf.placeholder(shape=[self.K, self.state_size],
                                        dtype=tf.float32)
        self.action = self.build_policy(self.particles)
        particles = self.particles
        for t in range(T):
            actions = self.build_policy(particles)
            rewards.append((self.discount_factor**t) *
                           self.reward_model.build(particles, actions))
            states_actions = tf.concat([particles, actions], axis=-1)
            next_states = []
            for k in range(K):
                W_k = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        'W_' + str(k) + '_')
                next_state = self.bnn.build(
                    *([tf.expand_dims(states_actions[k, :], axis=0)] + W_k))
                next_states.append(next_state)
            next_states = tf.concat(next_states, axis=0)

            # Perform moment matching.
            mu, cov = self.mu_and_cov(next_states)
            cov = cov + 5e-5 * np.eye(self.state_size)  # To prevent singular matrix
            particles = tfd.MultivariateNormalFullCovariance(
                loc=mu, covariance_matrix=cov).sample(self.K)

        # Maximize cumulative rewards in horizon T.
        rewards = tf.reduce_sum(tf.stack(rewards, axis=-1), axis=-1)
        self.loss = -tf.reduce_mean(tf.reduce_sum(rewards, axis=-1))
        self.opt = tf.train.AdamOptimizer().minimize(self.loss)
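
    # Stochastic unroll: next-state particles are sampled from the models'
    # posterior predictive distributions (apparently one model per state
    # dimension, judging by how `ppd` is stacked), with the `no_samples`
    # particles multinomially resampled at every step; returns the per-step
    # state trajectories and the mean cumulative cost.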
    def unroll2(self, states):
        assert states.shape.as_list() == [None, self.state_dim]
        self.reward_model = real_env_pendulum_reward()  # Use true model.
        trajectories = [
            tf.tile(tf.expand_dims(states, axis=1), [1, self.no_samples, 1])
        ]
        costs = []

        # Action
        self.actions = self.build_policy(states)

        # Posterior predictive distributions
        rewards = self.reward_model.build(states, self.actions)
        costs.append(-rewards)
        states_actions = tf.concat([states, self.actions], axis=-1)
        ppd = tf.stack([
            self.model[i].posterior_predictive_distribution(states_actions, i)
            for i in range(len(self.model))
        ],
                       axis=1)
        particles = tfd.MultivariateNormalDiag(
            loc=ppd[..., 0],
            scale_diag=tf.sqrt(ppd[..., 1])).sample(self.no_samples)
        '''
        particles = self.get_next_states(states_actions)# For testing purposes!!
        '''

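        # Remaining steps: transpose particles to [batch, no_samples,
        # state_dim], record the trajectory, score with the true reward
        # model, and propagate through the predictive distributions.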
        for unroll_step in range(self.unroll_steps - 1):
            print('unrolling step:', unroll_step)
            particles_transposed = tf.transpose(particles, perm=[1, 0, 2])
            trajectories.append(particles_transposed)

            particles_transposed_flattened = tf.reshape(
                particles_transposed, shape=[-1, self.state_dim])
            actions = self.build_policy(particles_transposed_flattened)

            rewards = self.reward_model.build(particles_transposed_flattened,
                                              actions)
            rewards = tf.reshape(rewards, shape=[-1, self.no_samples, 1])
            rewards = tf.reduce_mean(
                pow(self.discount_factor, unroll_step + 1) * rewards, axis=1)
            costs.append(-rewards)

            states_actions = tf.concat(
                [particles_transposed_flattened, actions], axis=-1)
            ppd = tf.stack([
                self.model[i].posterior_predictive_distribution(
                    states_actions, i) for i in range(len(self.model))
            ],
                           axis=1)
            ppd = tf.reshape(ppd,
                             shape=[-1, self.no_samples, self.state_dim, 2])

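            # Multinomial resampling: redistribute the no_samples draws
            # across the current particles, then sample the allotted number
            # of new particles from each selected particle's predictive
            # distribution.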
            random_selections = np.random.multinomial(
                self.no_samples, [1. / self.no_samples] * self.no_samples)
            particles = []
            for i in range(len(random_selections)):
                if random_selections[i] > 0:
                    particles.append(
                        tfd.MultivariateNormalDiag(
                            loc=ppd[:, i, :, 0],
                            scale_diag=tf.sqrt(ppd[:, i, :, 1])).sample(
                                random_selections[i]))
            particles = tf.concat(particles, axis=0)
            '''
            particles = self.get_next_states(tf.reshape(states_actions, shape=[-1, self.no_samples, self.state_dim + self.action_dim])[:, 0, :])# For testing purposes!!
            '''

        particles_transposed = tf.transpose(particles, perm=[1, 0, 2])
        trajectories.append(particles_transposed)

        costs = tf.reduce_sum(tf.concat(costs, axis=-1), axis=-1)
        loss = tf.reduce_mean(costs)

        return trajectories, loss