Example #1
    def unroll2(self, seed_states):
        assert seed_states.shape.as_list() == [None, self.state_dim]
        no_samples = self.no_samples
        unroll_steps = self.unroll_steps
        #self.reward_model = real_env_pendulum_reward()#Use true model.
        self.reward_model = ANN(self.state_dim + self.action_dim, 1)
        self.placeholders_reward = [
            tf.placeholder(shape=v.shape, dtype=tf.float64)
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.reward_model.scope)
        ]
        self.assign_ops = [
            v.assign(pl) for v, pl in zip(
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  self.reward_model.scope),
                self.placeholders_reward)
        ]

        states = tf.expand_dims(seed_states, axis=1)
        states = tf.tile(states, [1, no_samples, 1])
        states = tf.reshape(states, shape=[-1, self.state_dim])

        costs = []
        self.next_states = []
        for unroll_step in range(unroll_steps):
            actions = self.build_policy(states)

            rewards = (self.discount_factor**
                       unroll_step) * self.reward_model.build(states, actions)
            rewards = tf.reshape(tf.squeeze(rewards, axis=-1),
                                 shape=[-1, no_samples])
            costs.append(-rewards)

            states_actions = tf.concat([states, actions], axis=-1)

            next_states = self.get_next_states2(states_actions)
            self.next_states.append(next_states)
            states = next_states

        costs = tf.stack(costs, axis=-1)
        self.loss = tf.reduce_mean(
            tf.reduce_sum(tf.reduce_mean(costs, axis=1), axis=-1))
        self.opt = tf.train.AdamOptimizer().minimize(
            self.loss,
            var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       'policy_scope'))
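
Note: the placeholder/assign pattern above (one tf.placeholder plus one assign op per trainable variable of the reward model) is the usual TF1 way of pushing externally trained weights into a graph. Below is a minimal, self-contained sketch of the same pattern, assuming TensorFlow 1.x; the variable name and the stand-in weight array are illustrative, not from the source.

import numpy as np
import tensorflow as tf

v = tf.get_variable('reward_w', shape=[3, 1], dtype=tf.float64)
pl = tf.placeholder(shape=v.shape, dtype=tf.float64)
assign_op = v.assign(pl)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    pretrained = np.ones([3, 1])  #Stand-in for weights loaded from disk.
    sess.run(assign_op, feed_dict={pl: pretrained})
    print(sess.run(v))

In the examples this is presumably driven the same way: zip self.placeholders_reward with the pretrained weight arrays in the feed_dict and run self.assign_ops once.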
Example #2
    def __init__(self,
                 environment,
                 x_dim,
                 y_dim,
                 state_dim,
                 action_dim,
                 observation_space_low,
                 observation_space_high,
                 action_space_low,
                 action_space_high,
                 unroll_steps,
                 no_samples,
                 discount_factor,
                 random_matrices,
                 biases,
                 basis_dims,
                 hidden_dim=32,
                 learn_reward=0,
                 use_mean_reward=0,
                 update_hyperstate=1,
                 policy_use_hyperstate=1,
                 learn_diff=0):
        #assert environment in ['Pendulum-v0', 'MountainCarContinuous-v0']
        assert x_dim == state_dim + action_dim
        assert len(action_space_low.shape) == 1
        np.testing.assert_equal(-action_space_low, action_space_high)
        self.environment = environment
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.observation_space_low = observation_space_low
        self.observation_space_high = observation_space_high
        self.action_space_low = action_space_low
        self.action_space_high = action_space_high

        self.unroll_steps = unroll_steps
        self.no_samples = no_samples
        self.discount_factor = discount_factor
        self.random_matrices = random_matrices
        self.biases = biases
        self.basis_dims = basis_dims
        self.hidden_dim = hidden_dim
        self.learn_reward = learn_reward
        self.use_mean_reward = use_mean_reward
        self.update_hyperstate = update_hyperstate
        self.policy_use_hyperstate = policy_use_hyperstate
        self.learn_diff = learn_diff

        if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
            #self.reward_function = real_env_pendulum_reward()
            self.reward_function = ANN(self.state_dim + self.action_dim, 1)
            self.placeholders_reward = [
                tf.placeholder(shape=v.shape, dtype=tf.float64)
                for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           self.reward_function.scope)
            ]
            self.assign_ops0 = [
                v.assign(pl) for v, pl in zip(
                    tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      self.reward_function.scope),
                    self.placeholders_reward)
            ]
        elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
            self.reward_function = mountain_car_continuous_reward_function()

        #self.hyperstate_dim = sum([(basis_dim*(basis_dim+1))/2 + basis_dim for basis_dim in self.basis_dims])
        self.hyperstate_dim = sum(
            [basis_dim * (basis_dim + 1) for basis_dim in self.basis_dims])

        self.random_projection_matrix = np.random.normal(
            loc=0.,
            scale=1. / np.sqrt(self.state_dim),
            size=[self.hyperstate_dim, self.state_dim])

        input_dim = self.state_dim
        if self.policy_use_hyperstate == 1:
            input_dim *= 2

        self.w1 = np.concatenate([
            np.random.normal(size=[input_dim, self.hidden_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
        ],
                                 axis=0)
        self.w2 = np.concatenate([
            np.random.normal(size=[self.hidden_dim, self.hidden_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
        ],
                                 axis=0)
        self.w3 = np.concatenate([
            np.random.normal(size=[self.hidden_dim, self.action_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.action_dim])
        ],
                                 axis=0)

        self.thetas = self._pack([self.w1, self.w2, self.w3])

        self.sizes = [[input_dim + 1, self.hidden_dim],
                      [self.hidden_dim + 1, self.hidden_dim],
                      [self.hidden_dim + 1, self.action_dim]]

        w1, w2, w3 = self._unpack(self.thetas, self.sizes)
        np.testing.assert_equal(w1, self.w1)
        np.testing.assert_equal(w2, self.w2)
        np.testing.assert_equal(w3, self.w3)
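
The asserts at the end check that _pack/_unpack (defined in the fuller examples below) round-trip the three policy weight matrices, which is what lets CMA-ES treat them as a single flat parameter vector. A standalone NumPy sketch of that round trip, with illustrative sizes:

import numpy as np

def pack(thetas):
    #Flatten a list of weight matrices into one parameter vector (as _pack does).
    return np.concatenate([theta.flatten() for theta in thetas])

def unpack(flat, sizes):
    #Recover the matrices from the flat vector given their [rows, cols] sizes (as _unpack does).
    sidx, weights = 0, []
    for i, j in sizes:
        weights.append(flat[sidx:sidx + i * j].reshape([i, j]))
        sidx += i * j
    return weights

sizes = [[4, 32], [33, 32], [33, 1]]
ws = [np.random.normal(size=s) for s in sizes]
for w, w2 in zip(ws, unpack(pack(ws), sizes)):
    np.testing.assert_equal(w, w2)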
Example #3
class Agent:
    def __init__(self,
                 environment,
                 x_dim,
                 y_dim,
                 state_dim,
                 action_dim,
                 observation_space_low,
                 observation_space_high,
                 action_space_low,
                 action_space_high,
                 unroll_steps,
                 no_samples,
                 discount_factor,
                 random_matrices,
                 biases,
                 basis_dims,
                 hidden_dim=32,
                 learn_reward=0,
                 use_mean_reward=0,
                 update_hyperstate=1,
                 policy_use_hyperstate=1,
                 learn_diff=0):
        #assert environment in ['Pendulum-v0', 'MountainCarContinuous-v0']
        assert x_dim == state_dim + action_dim
        assert len(action_space_low.shape) == 1
        np.testing.assert_equal(-action_space_low, action_space_high)
        self.environment = environment
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.observation_space_low = observation_space_low
        self.observation_space_high = observation_space_high
        self.action_space_low = action_space_low
        self.action_space_high = action_space_high

        self.unroll_steps = unroll_steps
        self.no_samples = no_samples
        self.discount_factor = discount_factor
        self.random_matrices = random_matrices
        self.biases = biases
        self.basis_dims = basis_dims
        self.hidden_dim = hidden_dim
        self.learn_reward = learn_reward
        self.use_mean_reward = use_mean_reward
        self.update_hyperstate = update_hyperstate
        self.policy_use_hyperstate = policy_use_hyperstate
        self.learn_diff = learn_diff

        if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
            #self.reward_function = real_env_pendulum_reward()
            self.reward_function = ANN(self.state_dim + self.action_dim, 1)
            self.placeholders_reward = [
                tf.placeholder(shape=v.shape, dtype=tf.float64)
                for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           self.reward_function.scope)
            ]
            self.assign_ops0 = [
                v.assign(pl) for v, pl in zip(
                    tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      self.reward_function.scope),
                    self.placeholders_reward)
            ]
        elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
            self.reward_function = mountain_car_continuous_reward_function()

        #self.hyperstate_dim = sum([(basis_dim*(basis_dim+1))/2 + basis_dim for basis_dim in self.basis_dims])
        self.hyperstate_dim = sum(
            [basis_dim * (basis_dim + 1) for basis_dim in self.basis_dims])

        self.random_projection_matrix = np.random.normal(
            loc=0.,
            scale=1. / np.sqrt(self.state_dim),
            size=[self.hyperstate_dim, self.state_dim])

        input_dim = self.state_dim
        if self.policy_use_hyperstate == 1:
            input_dim *= 2

        self.w1 = np.concatenate([
            np.random.normal(size=[input_dim, self.hidden_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
        ],
                                 axis=0)
        self.w2 = np.concatenate([
            np.random.normal(size=[self.hidden_dim, self.hidden_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
        ],
                                 axis=0)
        self.w3 = np.concatenate([
            np.random.normal(size=[self.hidden_dim, self.action_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.action_dim])
        ],
                                 axis=0)

        self.thetas = self._pack([self.w1, self.w2, self.w3])

        self.sizes = [[input_dim + 1, self.hidden_dim],
                      [self.hidden_dim + 1, self.hidden_dim],
                      [self.hidden_dim + 1, self.action_dim]]

        w1, w2, w3 = self._unpack(self.thetas, self.sizes)
        np.testing.assert_equal(w1, self.w1)
        np.testing.assert_equal(w2, self.w2)
        np.testing.assert_equal(w3, self.w3)

    def _pack(self, thetas):
        return np.concatenate([theta.flatten() for theta in thetas])

    def _unpack(self, thetas, sizes):
        sidx = 0
        weights = []
        for size in sizes:
            i, j = size
            w = thetas[sidx:sidx + i * j].reshape([i, j])
            sidx += i * j
            weights.append(w)
        return weights

    def _forward(self, thetas, X, hyperstate):
        #"Old" method of including hyperstate into policy network.
        '''
        w0, w1, w2, w3 = self._unpack(thetas, self.sizes)
        XXtr, Xytr = hyperstate

        A = [xx + noise for xx, noise in zip(XXtr, self.noises)]
        wn = [solve(a, xy) for a, xy in zip(A, Xytr)]

        indices = [np.triu_indices(basis_dim, 1) for basis_dim in self.basis_dims]
        hyperstate = []
        for i in range(len(X)):
            tmp0 = []
            for j in range(len(A)):
                A[j][i][indices[j]] = np.nan
                tmp1 = A[j][i]
                tmp0.append(tmp1[~np.isnan(tmp1)])
                tmp0.append(np.squeeze(wn[j][i]))
            tmp0 = np.concatenate(tmp0)
            hyperstate.append(tmp0)
        hyperstate = np.stack(hyperstate, axis=0)

        hyperstate = self._add_bias(hyperstate)
        hyperstate_embedding = np.tanh(np.matmul(hyperstate, w0))
        '''

        w1, w2, w3 = self._unpack(thetas, self.sizes)

        #Perform a simple random projection on the hyperstate.
        if self.policy_use_hyperstate == 1:
            hyperstate = np.concatenate([
                np.concatenate([
                    np.reshape(XXtr, [len(XXtr), -1]),
                    np.reshape(Xytr, [len(Xytr), -1])
                ],
                               axis=-1) for XXtr, Xytr in zip(*hyperstate)
            ],
                                        axis=-1)
            hyperstate = np.tanh(hyperstate / 50000.)
            hyperstate_embedding = np.matmul(hyperstate,
                                             self.random_projection_matrix)
            hyperstate_embedding = np.tanh(hyperstate_embedding)

            state_hyperstate = np.concatenate([X, hyperstate_embedding],
                                              axis=-1)
            policy_net_input = self._add_bias(state_hyperstate)
        else:
            policy_net_input = self._add_bias(X)

        h1 = np.tanh(np.matmul(policy_net_input, w1))
        h1 = self._add_bias(h1)

        h2 = np.tanh(np.matmul(h1, w2))
        h2 = self._add_bias(h2)

        out = np.tanh(np.matmul(h2, w3))
        out = out * self.action_space_high  #action bounds.

        return out

    def _add_bias(self, X):
        assert len(X.shape) == 2
        return np.concatenate([X, np.ones([len(X), 1])], axis=-1)

    def _relu(self, X):
        return np.maximum(X, 0.)

    def _fit(self, cma_maxiter, X, XXtr, Xytr, hyperparameters, sess):
        warnings.filterwarnings(
            'ignore',
            message=
            '.*scipy.linalg.solve\nIll-conditioned matrix detected. Result is not guaranteed to be accurate.\nReciprocal.*'
        )
        assert len(XXtr) == self.state_dim + self.learn_reward
        assert len(Xytr) == self.state_dim + self.learn_reward
        assert len(hyperparameters) == self.state_dim + self.learn_reward

        if self.use_mean_reward == 1:
            print 'Warning: use_mean_reward is set to True but this flag is not used by this function.'

        X = np.copy(X)
        XXtr = [np.copy(ele) for ele in XXtr]
        Xytr = [np.copy(ele) for ele in Xytr]
        hyperparameters = [np.copy(ele) for ele in hyperparameters]

        X = np.expand_dims(X, axis=1)
        X = np.tile(X, [1, self.no_samples, 1])
        X = np.reshape(X, [-1, self.state_dim])

        Llowers = [
            scipy.linalg.cholesky(
                (hp[-2] / hp[-1])**2 * np.eye(basis_dim) + XX, lower=True) for
            hp, basis_dim, XX in zip(hyperparameters, self.basis_dims, XXtr)
        ]
        Llowers = [
            np.tile(ele[np.newaxis, ...], [len(X), 1, 1]) for ele in Llowers
        ]
        XXtr = [np.tile(ele[np.newaxis, ...], [len(X), 1, 1]) for ele in XXtr]
        Xytr = [np.tile(ele[np.newaxis, ...], [len(X), 1, 1]) for ele in Xytr]

        self.noises = [
            (hp[2] / hp[3])**2 * np.eye(basis_dim)
            for hp, basis_dim in zip(hyperparameters, self.basis_dims)
        ]

        import cma
        options = {'maxiter': cma_maxiter, 'verb_disp': 1, 'verb_log': 0}
        print 'Before calling cma.fmin'
        res = cma.fmin(self._loss,
                       self.thetas,
                       2.,
                       args=(np.copy(X), [np.copy(ele) for ele in Llowers
                                          ], [np.copy(ele) for ele in XXtr],
                             [np.copy(ele) for ele in Xytr], None,
                             [np.copy(ele) for ele in hyperparameters], sess),
                       options=options)
        self.thetas = np.copy(res[0])

    def _predict(self, Llower, Xytr, basis, noise_sd):
        '''
        Llower = Llower[0]
        Xytr = Xytr[0]
        basis = np.squeeze(basis, axis=1)
        LinvXT = scipy.linalg.solve_triangular(Llower, basis.T, lower=True)
        pred_sigma = np.sum(np.square(LinvXT), axis=0)*noise_sd**2+noise_sd**2
        pred_sigma = pred_sigma[..., np.newaxis]
        tmp0 = scipy.linalg.solve_triangular(Llower, basis.T, lower=True).T
        tmp1 = scipy.linalg.solve_triangular(Llower, Xytr, lower=True)
        pred_mu = np.matmul(tmp0, tmp1)
        return pred_mu, pred_sigma
        '''

        #TODO:fix this.
        LinvXT = solve_triangular(Llower, np.transpose(basis, [0, 2, 1]))
        pred_sigma = np.sum(np.square(LinvXT),
                            axis=1) * noise_sd**2 + noise_sd**2
        tmp0 = np.transpose(
            solve_triangular(Llower, np.transpose(basis, [0, 2, 1])),
            [0, 2, 1])
        tmp1 = solve_triangular(Llower, Xytr)
        pred_mu = np.matmul(tmp0, tmp1)
        pred_mu = np.squeeze(pred_mu, axis=-1)
        return pred_mu, pred_sigma

    def _loss(self,
              thetas,
              X,
              Llowers,
              XXtr,
              Xytr,
              A=[],
              hyperparameters=None,
              sess=None):
        rng_state = np.random.get_state()
        X = np.copy(X)
        Llowers = [np.copy(ele) for ele in Llowers]
        XXtr = [np.copy(ele) for ele in XXtr]
        Xytr = [np.copy(ele) for ele in Xytr]
        hyperparameters = [np.copy(ele) for ele in hyperparameters]
        try:
            np.random.seed(2)

            rewards = []
            state = X
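            #Monte-Carlo rollout: each of the no_samples particles carries its own copy of the hyperstate.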
            for unroll_step in xrange(self.unroll_steps):
                action = self._forward(thetas,
                                       state,
                                       hyperstate=[Llowers, Xytr])
                reward, basis_reward = self._reward(state, action, sess,
                                                    Llowers[-1], Xytr[-1],
                                                    hyperparameters[-1])
                rewards.append((self.discount_factor**unroll_step) * reward)
                state_action = np.concatenate([state, action], axis=-1)

                means = []
                covs = []
                bases = []
                for i in xrange(self.state_dim):
                    length_scale, signal_sd, noise_sd, prior_sd = hyperparameters[
                        i]
                    basis = _basis(state_action, self.random_matrices[i],
                                   self.biases[i], self.basis_dims[i],
                                   length_scale, signal_sd)
                    basis = np.expand_dims(basis, axis=1)
                    bases.append(basis)
                    pred_mu, pred_sigma = self._predict(
                        Llowers[i], Xytr[i], basis, noise_sd)
                    means.append(pred_mu)
                    covs.append(pred_sigma)
                means = np.concatenate(means, axis=-1)
                covs = np.concatenate(covs, axis=-1)

                bases.append(basis_reward)

                state_ = np.stack([
                    np.random.multivariate_normal(mean=mean, cov=np.diag(cov))
                    for mean, cov in zip(means, covs)
                ],
                                  axis=0)
                state = state + state_ if self.learn_diff else state_
                if self.learn_diff == 0:
                    state_ = np.clip(state_, self.observation_space_low,
                                     self.observation_space_high)
                state = np.clip(state, self.observation_space_low,
                                self.observation_space_high)

                #                #Removable
                #                import copy
                #                Llowers2 = copy.deepcopy(Llowers)
                #                Xytr2 = copy.deepcopy(Xytr)
                #                XXtr2 = copy.deepcopy(XXtr)
                #                #Removable -END-

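                #Rank-one update of each model's Cholesky factor and Xy statistics with the sampled transition.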
                if self.update_hyperstate == 1 or self.policy_use_hyperstate == 1:
                    y = np.concatenate([state_, reward],
                                       axis=-1)[..., :self.state_dim +
                                                self.learn_reward]
                    y = y[..., np.newaxis, np.newaxis]
                    for i in xrange(self.state_dim + self.learn_reward):
                        Llowers[i] = Llowers[i].transpose([0, 2, 1])
                    for i in xrange(self.state_dim + self.learn_reward):
                        for j in xrange(len(Llowers[i])):
                            cholupdate(Llowers[i][j], bases[i][j, 0].copy())
                        Xytr[i] += np.matmul(bases[i].transpose([0, 2, 1]),
                                             y[:, i, ...])


#                        #Removable
#                        _, _, noise_sd, prior_sd = hyperparameters[i]
#                        XXtr2[i], Xytr2[i], Llowers2[i] = self._update_hyperstate(XXtr2[i], XXtr2[i] + np.matmul(np.transpose(bases[i], [0, 2, 1]), bases[i]), Xytr2[i], Xytr2[i] + np.matmul(np.transpose(bases[i], [0, 2, 1]), y[:, i, ...]), Llowers2[i], (noise_sd/prior_sd)**2)
#                        print i
#                        print np.allclose(Llowers[i], Llowers2[i].transpose([0, 2, 1]))
#                        print np.allclose(Xytr[i], Xytr2[i])
#                        #Removable -END-

                    for i in xrange(self.state_dim + self.learn_reward):
                        Llowers[i] = Llowers[i].transpose([0, 2, 1])

            rewards = np.concatenate(rewards, axis=-1)
            rewards = np.sum(rewards, axis=-1)
            loss = -np.mean(rewards)
            np.random.set_state(rng_state)
            return loss
        except Exception as e:
            np.random.set_state(rng_state)
            print e, 'Returning 10e100'
            return 10e100

    def _update_hyperstate(self, XXold, XXnew, Xyold, Xynew, Llowerold,
                           var_ratio):
        var_diag = var_ratio * np.eye(XXnew.shape[-1])
        XX = []
        Xy = []
        Llower = []
        for i in range(len(XXnew)):
            try:
                tmp = scipy.linalg.cholesky(XXnew[i] + var_diag, lower=True)
                XX.append(XXnew[i].copy())
                Xy.append(Xynew[i].copy())
                Llower.append(tmp.copy())
            except Exception as e:
                XX.append(XXold[i].copy())
                Xy.append(Xyold[i].copy())
                Llower.append(Llowerold[i].copy())
        XX = np.stack(XX, axis=0)
        Xy = np.stack(Xy, axis=0)
        Llower = np.stack(Llower, axis=0)
        return XX, Xy, Llower

    def _reward(self, state, action, sess, Llower, Xy, hyperparameters):
        basis = None
        if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
            reward = self.reward_function.build_np(sess, state, action)
        elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
            reward = self.reward_function.build_np(state, action)
        else:
            state_action = np.concatenate([state, action], axis=-1)
            length_scale, signal_sd, noise_sd, prior_sd = hyperparameters
            basis = _basis(state_action, self.random_matrices[-1],
                           self.biases[-1], self.basis_dims[-1], length_scale,
                           signal_sd)
            basis = np.expand_dims(basis, axis=1)
            pred_mu, pred_sigma = self._predict(Llower, Xy, basis, noise_sd)
            if self.use_mean_reward == 1:
                pred_sigma = np.zeros_like(pred_sigma)
            reward = np.stack([
                np.random.normal(loc=loc, scale=scale)
                for loc, scale in zip(pred_mu, pred_sigma)
            ],
                              axis=0)
        return reward, basis
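
For reference, _predict implements the standard Bayesian-linear-regression predictive distribution through the Cholesky factor Llower of A = Phi^T Phi + (noise_sd / prior_sd)^2 I, with Xytr = Phi^T y (the batched version of the single-sample code left commented out inside it). A minimal, non-batched NumPy/SciPy sketch of the same equations; the function and argument names are illustrative:

import numpy as np
import scipy.linalg

def blr_predict(Phi, y, phi_star, noise_sd, prior_sd):
    #Posterior over weights: mean = A^-1 Phi^T y, covariance = noise_sd^2 A^-1,
    #with A = Phi^T Phi + (noise_sd / prior_sd)^2 I and a zero-mean Gaussian prior.
    D = Phi.shape[1]
    A = np.matmul(Phi.T, Phi) + (noise_sd / prior_sd) ** 2 * np.eye(D)
    Llower = scipy.linalg.cholesky(A, lower=True)
    v = scipy.linalg.solve_triangular(Llower, phi_star.T, lower=True)        #D x M
    w = scipy.linalg.solve_triangular(Llower, np.matmul(Phi.T, y), lower=True)
    pred_mu = np.matmul(v.T, w)                                              #M x 1
    pred_sigma = noise_sd ** 2 + noise_sd ** 2 * np.sum(v ** 2, axis=0)      #M
    return pred_mu, pred_sigma

Phi = np.random.normal(size=[50, 10])
w_true = np.random.normal(size=[10, 1])
y = np.matmul(Phi, w_true) + 0.1 * np.random.normal(size=[50, 1])
mu, var = blr_predict(Phi, y, Phi[:5], noise_sd=0.1, prior_sd=1.0)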
Example #4
class Agent:
    def __init__(self,
                 environment,
                 x_dim,
                 y_dim,
                 state_dim,
                 action_dim,
                 observation_space_low,
                 observation_space_high,
                 action_space_low,
                 action_space_high,
                 unroll_steps,
                 no_samples,
                 discount_factor,
                 random_matrix_state,
                 bias_state,
                 basis_dim_state,
                 random_matrix_reward,
                 bias_reward,
                 basis_dim_reward,
                 hidden_dim=32,
                 learn_reward=0,
                 use_mean_reward=0,
                 update_hyperstate=1,
                 policy_use_hyperstate=1,
                 learn_diff=0,
                 dump_model=0):
        #assert environment in ['Pendulum-v0', 'MountainCarContinuous-v0']
        assert x_dim == state_dim + action_dim
        assert len(action_space_low.shape) == 1
        np.testing.assert_equal(-action_space_low, action_space_high)
        self.environment = environment
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.observation_space_low = observation_space_low
        self.observation_space_high = observation_space_high
        self.action_space_low = action_space_low
        self.action_space_high = action_space_high

        self.unroll_steps = unroll_steps
        self.no_samples = no_samples
        self.discount_factor = discount_factor

        self.random_matrix_state = random_matrix_state
        self.bias_state = bias_state
        self.basis_dim_state = basis_dim_state
        self.random_matrix_reward = random_matrix_reward
        self.bias_reward = bias_reward
        self.basis_dim_reward = basis_dim_reward

        self.hidden_dim = hidden_dim
        self.learn_reward = learn_reward
        self.use_mean_reward = use_mean_reward
        self.update_hyperstate = update_hyperstate
        self.policy_use_hyperstate = policy_use_hyperstate
        self.learn_diff = learn_diff

        self.dump_model = dump_model

        self.uid = str(uuid.uuid4())
        self.epoch = 0

        if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
            #self.reward_function = real_env_pendulum_reward()
            self.reward_function = ANN(self.state_dim + self.action_dim, 1)
            self.placeholders_reward = [
                tf.placeholder(shape=v.shape, dtype=tf.float64)
                for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           self.reward_function.scope)
            ]
            self.assign_ops0 = [
                v.assign(pl) for v, pl in zip(
                    tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      self.reward_function.scope),
                    self.placeholders_reward)
            ]
        elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
            self.reward_function = mountain_car_continuous_reward_function()

        self.hyperstate_dim = self.basis_dim_state * (self.basis_dim_state +
                                                      self.state_dim)
        if self.learn_reward == 1:
            self.hyperstate_dim += self.basis_dim_reward * (
                self.basis_dim_reward + 1)

        self.random_projection_matrix = np.random.normal(
            loc=0.,
            scale=1. / np.sqrt(self.state_dim),
            size=[self.hyperstate_dim, self.state_dim])

        input_dim = self.state_dim
        if self.policy_use_hyperstate == 1:
            input_dim *= 2

        self.w1 = np.concatenate([
            np.random.normal(size=[input_dim, self.hidden_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
        ],
                                 axis=0)
        self.w2 = np.concatenate([
            np.random.normal(size=[self.hidden_dim, self.hidden_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
        ],
                                 axis=0)
        self.w3 = np.concatenate([
            np.random.normal(size=[self.hidden_dim, self.action_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.action_dim])
        ],
                                 axis=0)

        self.thetas = self._pack([self.w1, self.w2, self.w3])

        self.sizes = [[input_dim + 1, self.hidden_dim],
                      [self.hidden_dim + 1, self.hidden_dim],
                      [self.hidden_dim + 1, self.action_dim]]

        w1, w2, w3 = self._unpack(self.thetas, self.sizes)
        np.testing.assert_equal(w1, self.w1)
        np.testing.assert_equal(w2, self.w2)
        np.testing.assert_equal(w3, self.w3)

    def _pack(self, thetas):
        return np.concatenate([theta.flatten() for theta in thetas])

    def _unpack(self, thetas, sizes):
        sidx = 0
        weights = []
        for size in sizes:
            i, j = size
            w = thetas[sidx:sidx + i * j].reshape([i, j])
            sidx += i * j
            weights.append(w)
        return weights

    def _forward(self, thetas, X, hyperstate_params):
        w1, w2, w3 = self._unpack(thetas, self.sizes)

        #Perform a simple random projection on the hyperstate.
        if self.policy_use_hyperstate == 1:
            Llower_state, Xytr_state, Llower_reward, Xytr_reward = hyperstate_params
            hyperstate = np.concatenate([
                Llower_state.reshape([len(Llower_state), -1]),
                Xytr_state.reshape([len(Xytr_state), -1]),
                Llower_reward.reshape([len(Llower_reward), -1]),
                Xytr_reward.reshape([len(Xytr_reward), -1])
            ],
                                        axis=-1)
            hyperstate = np.tanh(hyperstate / 50000.)
            hyperstate_embedding = np.matmul(hyperstate,
                                             self.random_projection_matrix)
            hyperstate_embedding = np.tanh(hyperstate_embedding)

            state_hyperstate = np.concatenate([X, hyperstate_embedding],
                                              axis=-1)
            policy_net_input = self._add_bias(state_hyperstate)
        else:
            policy_net_input = self._add_bias(X)

        h1 = np.tanh(np.matmul(policy_net_input, w1))
        h1 = self._add_bias(h1)

        h2 = np.tanh(np.matmul(h1, w2))
        h2 = self._add_bias(h2)

        out = np.tanh(np.matmul(h2, w3))
        out = out * self.action_space_high  #action bounds.

        return out

    def _add_bias(self, X):
        assert len(X.shape) == 2
        return np.concatenate([X, np.ones([len(X), 1])], axis=-1)

    def _relu(self, X):
        return np.maximum(X, 0.)

    def _fit(self, cma_maxiter, X, XXtr_state, Xytr_state,
             hyperparameters_state, XXtr_reward, Xytr_reward,
             hyperparameters_reward, sess):
        warnings.filterwarnings(
            'ignore',
            message=
            '.*scipy.linalg.solve\nIll-conditioned matrix detected. Result is not guaranteed to be accurate.\nReciprocal.*'
        )
        assert XXtr_state.shape == (self.basis_dim_state, self.basis_dim_state)
        assert Xytr_state.shape == (self.basis_dim_state, self.state_dim)
        assert XXtr_reward.shape == (self.basis_dim_reward,
                                     self.basis_dim_reward)
        assert Xytr_reward.shape == (self.basis_dim_reward, 1)
        assert hyperparameters_state.shape == hyperparameters_reward.shape

        if self.use_mean_reward == 1:
            print(
                'Warning: use_mean_reward is set to True but this flag is not used by this function.'
            )

        #Copy the arrays (just to be safe no overwriting occurs).
        X = X.copy()
        XXtr_state = XXtr_state.copy()
        Xytr_state = Xytr_state.copy()
        hyperparameters_state = hyperparameters_state.copy()
        XXtr_reward = XXtr_reward.copy()
        Xytr_reward = Xytr_reward.copy()
        hyperparameters_reward = hyperparameters_reward.copy()

        X = np.expand_dims(X, axis=1)
        X = np.tile(X, [1, self.no_samples, 1])
        X = np.reshape(X, [-1, self.state_dim])

        #State
        Llower_state = spla.cholesky(
            (hyperparameters_state[-2] / hyperparameters_state[-1])**2 *
            np.eye(self.basis_dim_state) + XXtr_state,
            lower=True)
        Llower_state = np.tile(Llower_state, [len(X), 1, 1])

        XXtr_state = np.tile(XXtr_state, [len(X), 1, 1])
        Xytr_state = np.tile(Xytr_state, [len(X), 1, 1])

        #Reward
        if self.learn_reward:
            Llower_reward = spla.cholesky(
                (hyperparameters_reward[-2] / hyperparameters_reward[-1])**2 *
                np.eye(self.basis_dim_reward) + XXtr_reward,
                lower=True)
            Llower_reward = np.tile(Llower_reward, [len(X), 1, 1])

            XXtr_reward = np.tile(XXtr_reward, [len(X), 1, 1])
            Xytr_reward = np.tile(Xytr_reward, [len(X), 1, 1])

        import cma
        options = {'maxiter': cma_maxiter, 'verb_disp': 1, 'verb_log': 0}
        print('Before calling cma.fmin')
        res = cma.fmin(
            self._loss,
            self.thetas,
            2.,
            args=(X.copy(), Llower_state.copy(), XXtr_state.copy(),
                  Xytr_state.copy(), hyperparameters_state,
                  Llower_reward.copy() if self.learn_reward else None,
                  XXtr_reward.copy() if self.learn_reward else None,
                  Xytr_reward.copy() if self.learn_reward else None,
                  hyperparameters_reward if self.learn_reward else None, sess),
            options=options)
        self.thetas = np.copy(res[0])
        if self.dump_model:
            print('Unique identifier:', self.uid)
            directory = './models/'
            if not os.path.exists(directory):
                os.makedirs(directory)
            with open(
                    directory + self.uid + '_epoch:' + str(self.epoch) + '.p',
                    'wb') as fp:
                pickle.dump(self.thetas, fp)
            self.epoch += 1

    def _predict(self, Llower, Xytr, basis, noise_sd):
        #TODO: fix this.
        LinvXT = solve_triangular(Llower, basis.transpose([0, 2, 1]))
        sigma = np.sum(np.square(LinvXT), axis=1) * noise_sd**2 + noise_sd**2
        tmp0 = solve_triangular(Llower,
                                basis.transpose([0, 2,
                                                 1])).transpose([0, 2, 1])
        tmp1 = solve_triangular(Llower, Xytr)
        mu = np.matmul(tmp0, tmp1).squeeze(axis=1)
        return mu, sigma

    def _loss(self,
              thetas,
              X,
              Llower_state,
              XXtr_state,
              Xytr_state,
              hyperparameters_state,
              Llower_reward,
              XXtr_reward,
              Xytr_reward,
              hyperparameters_reward,
              sess=None):
        X = X.copy()
        Llower_state = Llower_state.copy()
        XXtr_state = XXtr_state.copy()
        Xytr_state = Xytr_state.copy()
        hyperparameters_state = hyperparameters_state.copy()
        if self.learn_reward:
            Llower_reward = Llower_reward.copy()
            XXtr_reward = XXtr_reward.copy()
            Xytr_reward = Xytr_reward.copy()
            hyperparameters_reward = hyperparameters_reward.copy()
        rng_state = np.random.get_state()
        #try:
        np.random.seed(2)

        rewards = []
        state = X
        for unroll_step in xrange(self.unroll_steps):
            action = self._forward(thetas,
                                   state,
                                   hyperstate_params=[
                                       Llower_state, Xytr_state, Llower_reward,
                                       Xytr_reward
                                   ])
            state_action = np.concatenate([state, action], axis=-1)

            reward, basis_reward = self._reward(state, action, state_action,
                                                sess, Llower_reward,
                                                Xytr_reward,
                                                hyperparameters_reward)
            rewards.append((self.discount_factor**unroll_step) * reward)

            length_scale, signal_sd, noise_sd, prior_sd = hyperparameters_state
            basis_state = _basis(state_action, self.random_matrix_state,
                                 self.bias_state, self.basis_dim_state,
                                 length_scale, signal_sd)
            basis_state = basis_state[:, None, ...]
            mu, sigma = self._predict(Llower_state, Xytr_state, basis_state,
                                      noise_sd)
            state_ = mu + np.sqrt(sigma) * np.random.standard_normal(
                size=mu.shape)

            if self.learn_diff:
                state_tmp = state.copy()
                state = np.clip(state + state_, self.observation_space_low,
                                self.observation_space_high)
                state_ = state - state_tmp
            else:
                state_ = np.clip(state_, self.observation_space_low,
                                 self.observation_space_high)
                state = state_.copy()

            if self.update_hyperstate == 1 or self.policy_use_hyperstate == 1:
                #Update state hyperstate
                Llower_state = Llower_state.transpose([0, 2, 1])
                for i in range(len(Llower_state)):
                    cholupdate(Llower_state[i], basis_state[i, 0].copy())
                Llower_state = Llower_state.transpose([0, 2, 1])
                Xytr_state += np.matmul(basis_state.transpose([0, 2, 1]),
                                        state_[..., None, :])

                #Update reward hyperstate
                if self.learn_reward:
                    Llower_reward = Llower_reward.transpose([0, 2, 1])
                    for i in range(len(Llower_reward)):
                        cholupdate(Llower_reward[i], basis_reward[i, 0].copy())
                    Llower_reward = Llower_reward.transpose([0, 2, 1])
                    Xytr_reward += np.matmul(basis_reward.transpose([0, 2, 1]),
                                             reward[..., None, :])

        rewards = np.concatenate(rewards, axis=-1)
        rewards = np.sum(rewards, axis=-1)
        loss = -np.mean(rewards)
        np.random.set_state(rng_state)
        return loss
        #except Exception as e:
        #np.random.set_state(rng_state)
        #print e, 'Returning 10e100'
        #return 10e100

    def _reward(self, state, action, state_action, sess, Llower, Xy,
                hyperparameters):
        basis = None
        if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
            reward = self.reward_function.build_np(sess, state, action)
        elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
            reward = self.reward_function.build_np(state, action)
        else:
            #state_action = np.concatenate([state, action], axis=-1)
            length_scale, signal_sd, noise_sd, prior_sd = hyperparameters
            basis = _basis(state_action, self.random_matrix_reward,
                           self.bias_reward, self.basis_dim_reward,
                           length_scale, signal_sd)
            basis = basis[:, None, ...]
            mu, sigma = self._predict(Llower, Xy, basis, noise_sd)
            if self.use_mean_reward == 1: sigma = np.zeros_like(sigma)
            reward = mu + np.sqrt(sigma) * np.random.standard_normal(
                size=mu.shape)
        return reward, basis
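
cholupdate is imported from elsewhere in the repository and is presumably a rank-one Cholesky update acting on an upper-triangular factor, which would explain why the code transposes Llower before and after each call. A plain NumPy sketch of the equivalent update on the lower-triangular factor, with a small self-check; the function name is illustrative:

import numpy as np

def chol_rank1_update(L, x):
    #After the call, np.matmul(L, L.T) equals the old L L^T + np.outer(x, x).
    #L is lower triangular and modified in place; x is consumed as workspace.
    n = x.shape[0]
    for k in range(n):
        r = np.hypot(L[k, k], x[k])
        c, s = r / L[k, k], x[k] / L[k, k]
        L[k, k] = r
        if k + 1 < n:
            L[k + 1:, k] = (L[k + 1:, k] + s * x[k + 1:]) / c
            x[k + 1:] = c * x[k + 1:] - s * L[k + 1:, k]

A = np.random.normal(size=[5, 5])
A = np.matmul(A, A.T) + 5. * np.eye(5)
x = np.random.normal(size=5)
L = np.linalg.cholesky(A)
chol_rank1_update(L, x.copy())
np.testing.assert_allclose(np.matmul(L, L.T), A + np.outer(x, x), atol=1e-8)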
Example #5
class blr_model:
    def __init__(self, x_dim, y_dim, state_dim, action_dim,
                 observation_space_low, observation_space_high,
                 action_bound_low, action_bound_high, unroll_steps, no_samples,
                 no_basis, discount_factor, train_policy_batch_size,
                 train_policy_iterations, hyperparameters, debugging_plot):

        assert x_dim == state_dim + action_dim
        assert len(hyperparameters) == y_dim
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.observation_space_low = observation_space_low
        self.observation_space_high = observation_space_high
        self.action_bound_low = action_bound_low
        self.action_bound_high = action_bound_high

        self.unroll_steps = unroll_steps
        self.no_samples = no_samples
        self.no_basis = no_basis
        self.discount_factor = discount_factor

        self.train_policy_batch_size = train_policy_batch_size
        self.train_policy_iterations = train_policy_iterations

        self.hyperparameters = hyperparameters
        self.debugging_plot = debugging_plot

        self.policy_scope = 'policy_scope'
        self.policy_reuse_vars = None

        self.models = [
            bayesian_model(self.x_dim, self.observation_space_low,
                           self.observation_space_high, self.action_bound_low,
                           self.action_bound_high, self.no_basis,
                           *self.hyperparameters[i]) for i in range(self.y_dim)
        ]

        self.states = tf.placeholder(shape=[None, self.state_dim],
                                     dtype=tf.float64)
        self.batch_size = tf.shape(self.states)[0]
        #self.batch_size = 3
        self.actions = self.build_policy(self.states)

        self.cum_xx = [
            tf.tile(tf.expand_dims(model.cum_xx_pl, axis=0),
                    [self.batch_size * self.no_samples, 1, 1])
            for model in self.models
        ]
        self.cum_xy = [
            tf.tile(tf.expand_dims(model.cum_xy_pl, axis=0),
                    [self.batch_size * self.no_samples, 1, 1])
            for model in self.models
        ]
        self.unroll(self.states)
        #self.unroll2(self.states)

    #TODO: for debugging purposes
    def unroll2(self, seed_states):
        assert seed_states.shape.as_list() == [None, self.state_dim]
        no_samples = self.no_samples
        unroll_steps = self.unroll_steps
        #self.reward_model = real_env_pendulum_reward()#Use true model.
        self.reward_model = ANN(self.state_dim + self.action_dim, 1)
        self.placeholders_reward = [
            tf.placeholder(shape=v.shape, dtype=tf.float64)
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.reward_model.scope)
        ]
        self.assign_ops = [
            v.assign(pl) for v, pl in zip(
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  self.reward_model.scope),
                self.placeholders_reward)
        ]

        states = tf.expand_dims(seed_states, axis=1)
        states = tf.tile(states, [1, no_samples, 1])
        states = tf.reshape(states, shape=[-1, self.state_dim])

        costs = []
        self.next_states = []
        for unroll_step in range(unroll_steps):
            actions = self.build_policy(states)

            rewards = (self.discount_factor**
                       unroll_step) * self.reward_model.build(states, actions)
            rewards = tf.reshape(tf.squeeze(rewards, axis=-1),
                                 shape=[-1, no_samples])
            costs.append(-rewards)

            states_actions = tf.concat([states, actions], axis=-1)

            next_states = self.get_next_states2(states_actions)
            self.next_states.append(next_states)
            states = next_states

        costs = tf.stack(costs, axis=-1)
        self.loss = tf.reduce_mean(
            tf.reduce_sum(tf.reduce_mean(costs, axis=1), axis=-1))
        self.opt = tf.train.AdamOptimizer().minimize(
            self.loss,
            var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       'policy_scope'))

    #TODO: for debugging purposes
    def get_next_states(self, states_actions):
        self.string = 'unroll2_gns'
        mu, sigma = [
            tf.concat(e, axis=-1) for e in zip(*[
                model.posterior_predictive_distribution(states_actions, None)
                for model in self.models
            ])
        ]
        self.mus1.append(mu)
        self.sigmas1.append(sigma)
        #print mu.shape
        #print sigma.shape
        next_state = tfd.MultivariateNormalDiag(
            loc=mu, scale_diag=tf.sqrt(sigma)).sample()
        return next_state

    #TODO: for debugging purposes
    def get_next_states2(self, states_actions):
        self.string = 'unroll2_gns2'
        mus = []
        sigmas = []
        for model in self.models:
            mu, sigma = model.mu_sigma(model.cum_xx_pl, model.cum_xy_pl)
            post_pred_mu, post_pred_sigma = model.post_pred2(
                states_actions, mu, sigma)

            mus.append(post_pred_mu)
            sigmas.append(post_pred_sigma)
        mus = tf.concat(mus, axis=-1)
        sigmas = tf.concat(sigmas, axis=-1)
        self.mus2.append(mus)
        self.sigmas2.append(sigmas)
        #print mus.shape
        #print sigmas.shape
        next_state = tfd.MultivariateNormalDiag(
            loc=mus, scale_diag=tf.sqrt(sigmas)).sample()
        return next_state

    def unroll(self, seed_states):
        assert seed_states.shape.as_list() == [None, self.state_dim]
        no_samples = self.no_samples
        unroll_steps = self.unroll_steps
        #self.reward_model = real_env_pendulum_reward()#Use true model.
        self.reward_model = ANN(self.state_dim + self.action_dim, 1)
        self.placeholders_reward = [
            tf.placeholder(shape=v.shape, dtype=tf.float64)
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.reward_model.scope)
        ]
        self.assign_ops = [
            v.assign(pl) for v, pl in zip(
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  self.reward_model.scope),
                self.placeholders_reward)
        ]

        states = tf.expand_dims(seed_states, axis=1)
        states = tf.tile(states, [1, no_samples, 1])
        states = tf.reshape(states, shape=[-1, self.state_dim])

        self.mus0 = []
        self.sigmas0 = []
        self.mus1 = []
        self.sigmas1 = []
        self.mus2 = []
        self.sigmas2 = []

        costs = []
        self.next_states = []
        #ns = []
        #bs = []
        for unroll_step in range(unroll_steps):
            print 'unrolling:', unroll_step
            if self.debugging_plot == True:
                actions = self.build_policy2(states)
            else:
                actions = self.build_policy(states)

            # Reward
            rewards = (self.discount_factor**
                       unroll_step) * self.reward_model.build(states, actions)
            rewards = tf.reshape(tf.squeeze(rewards, axis=-1),
                                 shape=[-1, no_samples])
            costs.append(-rewards)

            states_actions = tf.concat([states, actions], axis=-1)
            mus, sigmas = zip(*[
                self.mu_sigma(self.cum_xx[y], self.cum_xy[y], self.models[y].s,
                              self.models[y].noise_sd)
                for y in range(self.y_dim)
            ])

            bases = [
                model.approx_rbf_kern_basis(states_actions)
                for model in self.models
            ]
            #bs.append(bases)
            mu_pred, sigma_pred = [
                tf.concat(e, axis=-1) for e in zip(*[
                    self.prediction(mu, sigma, basis, model.noise_sd) for mu,
                    sigma, basis, model in zip(mus, sigmas, bases, self.models)
                ])
            ]

            self.mus0.append(mu_pred)
            self.sigmas0.append(sigma_pred)
            self.get_next_states(states_actions)
            self.get_next_states2(states_actions)

            next_states = tfd.MultivariateNormalDiag(
                loc=mu_pred, scale_diag=tf.sqrt(sigma_pred)).sample()
            #ns.append(tf.split(next_states, self.y_dim, axis=-1))

            self.next_states.append(
                tf.reshape(next_states, shape=[-1, no_samples,
                                               self.state_dim]))

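            #Condition each output model's running posterior statistics on the sampled next state.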
            for y in range(self.y_dim):
                self.update_posterior(bases[y], next_states[..., y:y + 1], y)

            states = next_states

        if self.debugging_plot == False:
            print 'here1'
            costs = tf.stack(costs, axis=-1)
            print 'here2'
            self.loss = tf.reduce_mean(
                tf.reduce_sum(tf.reduce_mean(costs, axis=1), axis=-1))
            print 'here3'
            self.opt = tf.train.AdamOptimizer().minimize(
                self.loss,
                var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           'policy_scope'))
            print 'here4'
        self.string = 'unroll'

    def update_posterior(self, X, y, i):
        X_expanded_dims = tf.expand_dims(X, axis=-1)
        y_expanded_dims = tf.expand_dims(y, axis=-1)
        self.cum_xx[i] += tf.matmul(
            X_expanded_dims, tf.transpose(X_expanded_dims, perm=[0, 2, 1]))
        self.cum_xy[i] += tf.matmul(X_expanded_dims, y_expanded_dims)

    def prediction(self, mu, sigma, basis, noise_sd):
        basis_expanded_dims = tf.expand_dims(basis, axis=-1)
        mu_pred = tf.matmul(tf.transpose(mu, perm=[0, 2, 1]),
                            basis_expanded_dims)
        sigma_pred = tf.square(noise_sd) + tf.matmul(
            tf.matmul(tf.transpose(basis_expanded_dims, perm=[0, 2, 1]),
                      sigma), basis_expanded_dims)

        return tf.squeeze(mu_pred, axis=-1), tf.squeeze(sigma_pred, axis=-1)

    def mu_sigma(self, xx, xy, s, noise_sd):
        noise_sd_sq = tf.square(noise_sd)
        prior_sigma_inv = tf.matrix_inverse(
            tf.tile(
                tf.expand_dims(s * tf.eye(self.no_basis, dtype=tf.float64),
                               axis=0),
                [self.batch_size * self.no_samples, 1, 1]))
        A = tf.matrix_inverse(tf.multiply(noise_sd_sq, prior_sigma_inv) + xx)
        sigma = tf.multiply(noise_sd_sq, A)
        # Assuming that prior mean is zero vector
        mu = tf.matmul(A, xy)
        return mu, sigma

    def mu_sigma2(self, xx, xy, s, noise_sd, bs, ns, idx):
        if bs and ns:
            assert len(zip(*bs)) == self.y_dim
            assert len(zip(*ns)) == self.y_dim
            X = zip(*bs)[idx]
            y = zip(*ns)[idx]

            X = tf.expand_dims(tf.stack(X, axis=0), axis=-1)
            XX = tf.matmul(X, tf.transpose(X, perm=[0, 1, 3, 2]))

            y = tf.expand_dims(tf.stack(y, axis=0), axis=-1)
            Xy = tf.matmul(X, y)

            XX_ = tf.reduce_sum(XX, axis=0)
            Xy_ = tf.reduce_sum(Xy, axis=0)

        else:
            XX_ = 0.
            Xy_ = 0.

        noise_sd_sq = tf.square(noise_sd)
        prior_sigma_inv = tf.matrix_inverse(
            tf.tile(
                tf.expand_dims(s * tf.eye(self.no_basis, dtype=tf.float64),
                               axis=0),
                [self.batch_size * self.no_samples, 1, 1]))
        A = tf.matrix_inverse(
            tf.multiply(noise_sd_sq, prior_sigma_inv) + xx + XX_)
        sigma = tf.multiply(noise_sd_sq, A)
        # Assuming that prior mean is zero vector
        mu = tf.matmul(A, xy + Xy_)
        return mu, sigma

    def update(self, sess, X=None, y=None, memory=None):
        if memory is not None:
            states = np.stack([e[0] for e in memory], axis=0)
            actions = np.stack([e[1] for e in memory], axis=0)
            y = np.stack([e[3] for e in memory], axis=0)
            X = np.concatenate([states, actions], axis=-1)

        for i in range(self.y_dim):
            self.models[i].update(sess, X, y[..., i])

    def act(self, sess, state):
        state = np.atleast_2d(state)
        action = sess.run(self.actions, feed_dict={self.states: state})
        return action[0]

    def train(self, sess, memory):
        feed_dict = {}
        #TODO: for debugging purposes
        if self.string == 'unroll':
            for model in self.models:
                feed_dict[model.cum_xx_pl] = model.cum_xx
                feed_dict[model.cum_xy_pl] = model.cum_xy
                feed_dict[model.mu_placeholder] = model.mu  #for testing
                feed_dict[model.sigma_placeholder] = model.sigma  #for testing
                feed_dict[
                    model.sigma_prior_pl] = model.sigma_prior  #for testing
                feed_dict[model.mu_prior_pl] = model.mu_prior  #for testing
        elif self.string == 'unroll2_gns':
            for model in self.models:
                feed_dict[model.mu_placeholder] = model.mu
                feed_dict[model.sigma_placeholder] = model.sigma
        elif self.string == 'unroll2_gns2':
            for model in self.models:
                feed_dict[model.cum_xx_pl] = model.cum_xx
                feed_dict[model.cum_xy_pl] = model.cum_xy
                feed_dict[model.sigma_prior_pl] = model.sigma_prior
                feed_dict[model.mu_prior_pl] = model.mu_prior

        for it in range(self.train_policy_iterations):
            batch = memory.sample(self.train_policy_batch_size)
            states = np.stack([b[0] for b in batch], axis=0)
            feed_dict[self.states] = states

            mus0, sigmas0, mus1, sigmas1, mus2, sigmas2, next_states, loss, _ = sess.run(
                [
                    self.mus0, self.sigmas0, self.mus1, self.sigmas1,
                    self.mus2, self.sigmas2, self.next_states, self.loss,
                    self.opt
                ],
                feed_dict=feed_dict)
            if loss > 1000.:
                print next_states
            '''
            assert len(mus0) == len(sigmas0)
            assert len(mus0) == len(mus1)
            assert len(mus0) == len(sigmas1)
            assert len(mus0) == len(mus2)
            assert len(mus0) == len(sigmas2)
            '''
            '''
            for mu0, sigma0, mu1, sigma1, mu2, sigma2, ii in zip(mus0, sigmas0, mus1, sigmas1, mus2, sigmas2, range(len(mus0))):
                try:
                    np.testing.assert_almost_equal(sigma1, sigma2, decimal=4)
                except:
                    print ii, 'here0'
                    for i in range(len(sigma1)):
                        for j in range(len(sigma1[i])):
                            print sigma1[i, j], sigma2[i, j]
                    exit()
                try:
                    np.testing.assert_almost_equal(mu1, mu2, decimal=4)
                except:
                    print ii, 'here3',
                    for i in range(len(mu1)):
                        print mu1[i], mu2[i]
                    exit()
                try:
                    np.testing.assert_almost_equal(mu0, mu1, decimal=4)
                except:
                    print ii, 'here1',
                    for i in range(len(mu0)):
                        print mu0[i], mu1[i]
                    exit()
                try:
                    np.testing.assert_almost_equal(mu0, mu2, decimal=4)
                except:
                    print ii, 'here2',
                    for i in range(len(mu0)):
                        print mu0[i], mu2[i]
                    exit()
                try:
                    np.testing.assert_almost_equal(sigma0, sigma1, decimal=4)
                except:
                    print ii, 'here4',
                    for i in range(len(sigma0)):
                        for j in range(len(sigma0[i])):
                            print sigma0[i, j], sigma1[i, j]
                    exit()
                try:
                    np.testing.assert_almost_equal(sigma0, sigma2, decimal=4)
                except:
                    print ii, 'here5',
                    for i in range(len(sigma0)):
                        for j in range(len(sigma0[i])):
                            print sigma0[i, j], sigma2[i, j]
                    exit()
            '''
            print 'iteration:', it, 'loss:', loss, self.string, len(mus0)
            '''
            try:
                mus0, sigmas0, mus1, sigmas1, mus2, sigmas2, next_states, loss, _ = sess.run([self.mus0, self.sigmas0, self.mus1, self.sigmas1, self.mus2, self.sigmas2, self.next_states, self.loss, self.opt], feed_dict=feed_dict)
                assert len(mus0) == len(sigmas0)
                assert len(mus0) == len(mus1)
                assert len(mus0) == len(sigmas1)
                assert len(mus0) == len(mus2)
                assert len(mus0) == len(sigmas2)
                for mu0, sigma0, mu1, sigma1, mu2, sigma2 in zip(mus0, sigmas0, mus1, sigmas1, mus2, sigmas2):
                    np.testing.assert_almost_equal(mu0, mu1)
                    np.testing.assert_almost_equal(mu0, mu2)
                    np.testing.assert_almost_equal(mu1, mu2)
                    np.testing.assert_almost_equal(sigma0, sigma1)
                    np.testing.assert_almost_equal(sigma0, sigma2)
                    np.testing.assert_almost_equal(sigma1, sigma2)
                if loss > 1000.:
                    print next_states
                print 'iteration:', it, 'loss:', loss, self.string
            except:
                print 'training step failed.'
            '''

    def build_policy(self, states):
        assert states.shape.as_list() == [None, self.state_dim]

        #Fully connected layer 1
        fc1 = slim.fully_connected(states,
                                   256,
                                   activation_fn=tf.nn.relu,
                                   scope=self.policy_scope + '/fc1',
                                   reuse=self.policy_reuse_vars)

        #Fully connected layer 2
        fc2 = slim.fully_connected(fc1,
                                   256,
                                   activation_fn=tf.nn.relu,
                                   scope=self.policy_scope + '/fc2',
                                   reuse=self.policy_reuse_vars)

        #Output layer
        output = slim.fully_connected(fc2,
                                      self.action_dim,
                                      activation_fn=tf.nn.tanh,
                                      scope=self.policy_scope + '/output',
                                      reuse=self.policy_reuse_vars)

        #Apply action bounds
        np.testing.assert_array_equal(-self.action_bound_low,
                                      self.action_bound_high)
        action_bound = tf.constant(self.action_bound_high, dtype=tf.float64)
        policy = tf.multiply(output, action_bound)

        #Change flag
        self.policy_reuse_vars = True

        return policy

    def build_policy2(self, states):
        try:
            self.policy
        except AttributeError:
            self.idx = 0
            self.policy = tf.placeholder(shape=[self.unroll_steps, 1],
                                         dtype=tf.float64)

        action = self.policy[self.idx:self.idx + 1, ...]
        tile_size = tf.shape(states)[0]

        action_tiled = tf.tile(action, [tile_size, 1])
        self.idx += 1

        return action_tiled
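
For the debugging-plot mode, build_policy2 swaps the learned policy for a placeholder holding one action per unroll step and tiles it across all particles in the batch. A minimal usage sketch for feeding that placeholder (the names agent, sess and seed_states, and the single-action-dimension assumption, are illustrative and not part of the original code; any model-statistic placeholders would be fed the same way as in the training loop above):

# Hypothetical open-loop rollout for the debugging plot (assumes action_dim == 1).
import numpy as np

open_loop_actions = np.zeros((agent.unroll_steps, 1))      # one action per unroll step
feed_dict = {agent.states: seed_states,                    # shape [batch, state_dim]
             agent.policy: open_loop_actions}              # consumed by build_policy2
sampled_next_states = sess.run(agent.next_states, feed_dict=feed_dict)
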
Ejemplo n.º 6
0
    def unroll(self, seed_states):
        assert seed_states.shape.as_list() == [None, self.state_dim]
        no_samples = self.no_samples
        unroll_steps = self.unroll_steps
        #self.reward_model = real_env_pendulum_reward()#Use true model.
        self.reward_model = ANN(self.state_dim + self.action_dim, 1)
        self.placeholders_reward = [
            tf.placeholder(shape=v.shape, dtype=tf.float64)
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.reward_model.scope)
        ]
        self.assign_ops = [
            v.assign(pl) for v, pl in zip(
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  self.reward_model.scope),
                self.placeholders_reward)
        ]

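        # Replicate each seed state no_samples times so the unroll propagates a
        # batch of sampled trajectories (particles) per seed state.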
        states = tf.expand_dims(seed_states, axis=1)
        states = tf.tile(states, [1, no_samples, 1])
        states = tf.reshape(states, shape=[-1, self.state_dim])

        self.mus0 = []
        self.sigmas0 = []
        self.mus1 = []
        self.sigmas1 = []
        self.mus2 = []
        self.sigmas2 = []

        costs = []
        self.next_states = []
        #ns = []
        #bs = []
        for unroll_step in range(unroll_steps):
            print('unrolling:', unroll_step)
            if self.debugging_plot:
                actions = self.build_policy2(states)
            else:
                actions = self.build_policy(states)

            # Reward
            rewards = (self.discount_factor**
                       unroll_step) * self.reward_model.build(states, actions)
            rewards = tf.reshape(tf.squeeze(rewards, axis=-1),
                                 shape=[-1, no_samples])
            costs.append(-rewards)

            states_actions = tf.concat([states, actions], axis=-1)
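            # Per-output-dimension weight posterior (mu, sigma) computed from the
            # accumulated statistics cum_xx / cum_xy of each Bayesian linear model.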
            mus, sigmas = zip(*[
                self.mu_sigma(self.cum_xx[y], self.cum_xy[y], self.models[y].s,
                              self.models[y].noise_sd)
                for y in range(self.y_dim)
            ])

            bases = [
                model.approx_rbf_kern_basis(states_actions)
                for model in self.models
            ]
            #bs.append(bases)
            mu_pred, sigma_pred = [
                tf.concat(e, axis=-1) for e in zip(*[
                    self.prediction(mu, sigma, basis, model.noise_sd) for mu,
                    sigma, basis, model in zip(mus, sigmas, bases, self.models)
                ])
            ]

            self.mus0.append(mu_pred)
            self.sigmas0.append(sigma_pred)
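            # Alternative next-state prediction paths, kept alongside mus0/sigmas0
            # for the debugging/consistency checks in the training loop.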
            self.get_next_states(states_actions)
            self.get_next_states2(states_actions)

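            # Sample next states from the factorized Gaussian posterior predictive
            # (sigma_pred holds per-dimension variances, hence the sqrt for scale_diag).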
            next_states = tfd.MultivariateNormalDiag(
                loc=mu_pred, scale_diag=tf.sqrt(sigma_pred)).sample()
            #ns.append(tf.split(next_states, self.y_dim, axis=-1))

            self.next_states.append(
                tf.reshape(next_states, shape=[-1, no_samples,
                                               self.state_dim]))

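            # Condition each per-dimension model on the sampled transition so the
            # next unroll step uses the updated posterior (hyperstate update).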
            for y in range(self.y_dim):
                self.update_posterior(bases[y], next_states[..., y:y + 1], y)

            states = next_states

        if not self.debugging_plot:
            costs = tf.stack(costs, axis=-1)
            self.loss = tf.reduce_mean(
                tf.reduce_sum(tf.reduce_mean(costs, axis=1), axis=-1))
            self.opt = tf.train.AdamOptimizer().minimize(
                self.loss,
                var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           'policy_scope'))
        self.string = 'unroll'
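
The per-dimension model behind mu_sigma and prediction is a Bayesian linear regression on random RBF features (approx_rbf_kern_basis): the accumulated statistics cum_xx and cum_xy define a Gaussian posterior over the feature weights, and the basis vector of a new state-action pair turns that posterior into a predictive mean and variance. A minimal NumPy sketch of the standard formulas this presumably corresponds to (function names and the exact prior parameterization are assumptions, not taken from the original code):

import numpy as np

def weight_posterior(cum_xx, cum_xy, prior_sd, noise_sd):
    # Posterior over weights w for y = phi(x)^T w + eps, with prior w ~ N(0, prior_sd^2 I):
    #   Sigma_w = (Phi^T Phi / noise_sd^2 + I / prior_sd^2)^-1
    #   mu_w    = Sigma_w Phi^T y / noise_sd^2
    precision = cum_xx / noise_sd**2 + np.eye(cum_xx.shape[0]) / prior_sd**2
    sigma_w = np.linalg.inv(precision)
    mu_w = sigma_w.dot(cum_xy) / noise_sd**2
    return mu_w, sigma_w

def predictive(mu_w, sigma_w, basis, noise_sd):
    # Predictive mean and variance at a single feature vector `basis` (shape [basis_dim]).
    mean = basis.dot(mu_w)
    var = basis.dot(sigma_w).dot(basis) + noise_sd**2
    return mean, var

In the graph above, update_posterior then folds each sampled next state back into cum_xx / cum_xy, which is what makes later unroll steps depend on the updated (hyperstate) posterior.
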
Ejemplo n.º 7
0
    def __init__(self, state_dim, action_dim, action_bound_high, \
                 action_bound_low, unroll_length, discount_factor, \
                 gradient_descent_steps, scope):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound_high = action_bound_high
        self.action_bound_low = action_bound_low
        self.unroll_length = unroll_length
        self.discount_factor = discount_factor
        self.gradient_descent_steps = gradient_descent_steps
        self.scope = scope

        #Make sure bounds are same (assumption can be relaxed later)
        np.testing.assert_array_equal(-self.action_bound_low,
                                      self.action_bound_high)

        #Flags
        self.policy_reuse_vars = None
        '''
        self.reward_model = ANN(self.state_dim+self.action_dim, 1)
        self.placeholders_reward = [tf.placeholder(shape=v.shape, dtype=tf.float64)
                                    for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.reward_model.scope)]
        self.assign_ops0 = [v.assign(pl) for v, pl in zip(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.reward_model.scope),
                            self.placeholders_reward)]
        '''
        #self.reward_model = real_env_pendulum_reward()
        self.reward_model = mountain_car_continuous_reward_function()

        #self.state_model = real_env_pendulum_state()
        #self.state_model = mountain_car_continuous_state_function()
        self.state_model = ANN(self.state_dim + self.action_dim,
                               self.state_dim)
        self.placeholders_state = [
            tf.placeholder(shape=v.shape, dtype=tf.float64)
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.state_model.scope)
        ]
        self.assign_ops1 = [
            v.assign(pl) for v, pl in zip(
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.
                                  state_model.scope), self.placeholders_state)
        ]

        #Build computational graph (i.e., unroll policy)
        #self.states = tf.placeholder(shape=[None, self.state_dim], dtype=tf.float32)
        self.states = tf.placeholder(shape=[None, self.state_dim],
                                     dtype=tf.float64)

        self.action = self.build_policy(self.states)
        state = self.states
        action = self.build_policy(state)
        rewards = []
        for i in range(self.unroll_length):
            print(i)
            #reward = pow(self.discount_factor, i) * self.reward_model.build(state, action)
            #reward = pow(self.discount_factor, i) * self.reward_model.step_tf(state, action)
            reward = pow(self.discount_factor,
                         i) * self.reward_model.sigmoid_approx(state, action)
            rewards.append(reward)
            state = self.state_model.build(state, action)
            #state = self.state_model.step_tf(state, action)
            action = self.build_policy(state)

        rewards = tf.reduce_sum(tf.stack(rewards, axis=-1), axis=-1)
        self.loss = -tf.reduce_mean(tf.reduce_sum(rewards, axis=-1))
        self.opt = tf.train.AdamOptimizer().minimize(
            self.loss,
            var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.scope))
Ejemplo n.º 8
0
class direct_policy_search:
    def __init__(self, state_dim, action_dim, action_bound_high, \
                 action_bound_low, unroll_length, discount_factor, \
                 gradient_descent_steps, scope):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound_high = action_bound_high
        self.action_bound_low = action_bound_low
        self.unroll_length = unroll_length
        self.discount_factor = discount_factor
        self.gradient_descent_steps = gradient_descent_steps
        self.scope = scope

        #Make sure bounds are same (assumption can be relaxed later)
        np.testing.assert_array_equal(-self.action_bound_low,
                                      self.action_bound_high)

        #Flags
        self.policy_reuse_vars = None
        '''
        self.reward_model = ANN(self.state_dim+self.action_dim, 1)
        self.placeholders_reward = [tf.placeholder(shape=v.shape, dtype=tf.float64)
                                    for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.reward_model.scope)]
        self.assign_ops0 = [v.assign(pl) for v, pl in zip(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.reward_model.scope),
                            self.placeholders_reward)]
        '''
        #self.reward_model = real_env_pendulum_reward()
        self.reward_model = mountain_car_continuous_reward_function()

        #self.state_model = real_env_pendulum_state()
        #self.state_model = mountain_car_continuous_state_function()
        self.state_model = ANN(self.state_dim + self.action_dim,
                               self.state_dim)
        self.placeholders_state = [
            tf.placeholder(shape=v.shape, dtype=tf.float64)
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.state_model.scope)
        ]
        self.assign_ops1 = [
            v.assign(pl) for v, pl in zip(
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.
                                  state_model.scope), self.placeholders_state)
        ]

        #Build computational graph (i.e., unroll policy)
        #self.states = tf.placeholder(shape=[None, self.state_dim], dtype=tf.float32)
        self.states = tf.placeholder(shape=[None, self.state_dim],
                                     dtype=tf.float64)

        self.action = self.build_policy(self.states)
        state = self.states
        action = self.build_policy(state)
        rewards = []
        for i in range(self.unroll_length):
            print(i)
            #reward = pow(self.discount_factor, i) * self.reward_model.build(state, action)
            #reward = pow(self.discount_factor, i) * self.reward_model.step_tf(state, action)
            reward = pow(self.discount_factor,
                         i) * self.reward_model.sigmoid_approx(state, action)
            rewards.append(reward)
            state = self.state_model.build(state, action)
            #state = self.state_model.step_tf(state, action)
            action = self.build_policy(state)

        rewards = tf.reduce_sum(tf.stack(rewards, axis=-1), axis=-1)
        self.loss = -tf.reduce_mean(tf.reduce_sum(rewards, axis=-1))
        self.opt = tf.train.AdamOptimizer().minimize(
            self.loss,
            var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.scope))

    def act(self, sess, states):
        states = np.atleast_2d(states)
        #print sess.run(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
        action = sess.run(self.action, feed_dict={self.states: states})
        return action[0]

    def train(self, sess, states):
        for _ in range(self.gradient_descent_steps):
            loss, _ = sess.run([self.loss, self.opt],
                               feed_dict={self.states: states})
            #asin1, asin2, loss, _ = sess.run([self.asin1, self.asin2, self.loss, self.opt], feed_dict={self.states:states})

    def build_policy(self, states):
        assert states.shape.as_list() == [None, self.state_dim]

        #Fully connected layer 1
        fc1 = slim.fully_connected(states,
                                   256,
                                   activation_fn=tf.nn.relu,
                                   scope=self.scope + '/fc1',
                                   reuse=self.policy_reuse_vars)

        fc2 = slim.fully_connected(fc1,
                                   256,
                                   activation_fn=tf.nn.relu,
                                   scope=self.scope + '/fc2',
                                   reuse=self.policy_reuse_vars)

        #Output layer
        output = slim.fully_connected(fc2,
                                      self.action_dim,
                                      activation_fn=tf.nn.tanh,
                                      scope=self.scope + '/output',
                                      reuse=self.policy_reuse_vars)

        #Apply action bounds
        #action_bound = tf.constant(self.action_bound_high, dtype=tf.float32)
        action_bound = tf.constant(self.action_bound_high, dtype=tf.float64)
        policy = tf.multiply(output, action_bound)

        #Change flag
        self.policy_reuse_vars = True

        return policy
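
A minimal driver sketch for direct_policy_search (the episode loop, the hyperparameter values, and the choice of MountainCarContinuous-v0 are illustrative assumptions; fitting the ANN state model and loading its weights through assign_ops1 / placeholders_state is omitted):

# Hypothetical usage; assumes the ANN state model is fitted elsewhere and its
# weights are loaded via self.assign_ops1 / self.placeholders_state.
import gym
import numpy as np
import tensorflow as tf

env = gym.make('MountainCarContinuous-v0')
agent = direct_policy_search(state_dim=env.observation_space.shape[0],
                             action_dim=env.action_space.shape[0],
                             action_bound_high=env.action_space.high,
                             action_bound_low=env.action_space.low,
                             unroll_length=20,
                             discount_factor=.9,
                             gradient_descent_steps=10,
                             scope='direct_policy_search')

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    state = env.reset()
    for _ in range(200):
        action = agent.act(sess, state)
        next_state, reward, done, _ = env.step(action)
        agent.train(sess, np.atleast_2d(state))   # a few Adam steps on the unrolled return
        state = next_state
        if done:
            break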