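# Note: the classes below rely on helpers defined elsewhere in the repo
# (ANN, _basis, cholupdate, solve_triangular, bayesian_model, the
# *_reward_function classes, tfd, slim). The sketch below is NOT the repo's
# implementation; it is a minimal stand-alone illustration of what a random
# Fourier feature map such as `_basis` is assumed to compute (cosine features
# scaled by signal_sd), which may help when reading the Bayesian linear
# regression code that follows. The name and scaling here are assumptions.
import numpy as np


def _basis_sketch(X, random_matrix, bias, basis_dim, length_scale, signal_sd):
    """Random Fourier features approximating an RBF kernel (illustrative only)."""
    # X: [N, x_dim], random_matrix: [x_dim, basis_dim], bias: [basis_dim]
    proj = np.matmul(X / length_scale, random_matrix) + bias
    return signal_sd * np.sqrt(2. / basis_dim) * np.cos(proj)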
class Agent:
    def __init__(self, environment, x_dim, y_dim, state_dim, action_dim,
                 observation_space_low, observation_space_high,
                 action_space_low, action_space_high, unroll_steps,
                 no_samples, discount_factor, random_matrices, biases,
                 basis_dims, hidden_dim=32, learn_reward=0, use_mean_reward=0,
                 update_hyperstate=1, policy_use_hyperstate=1, learn_diff=0):
        #assert environment in ['Pendulum-v0', 'MountainCarContinuous-v0']
        assert x_dim == state_dim + action_dim
        assert len(action_space_low.shape) == 1
        np.testing.assert_equal(-action_space_low, action_space_high)

        self.environment = environment
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.observation_space_low = observation_space_low
        self.observation_space_high = observation_space_high
        self.action_space_low = action_space_low
        self.action_space_high = action_space_high
        self.unroll_steps = unroll_steps
        self.no_samples = no_samples
        self.discount_factor = discount_factor
        self.random_matrices = random_matrices
        self.biases = biases
        self.basis_dims = basis_dims
        self.hidden_dim = hidden_dim
        self.learn_reward = learn_reward
        self.use_mean_reward = use_mean_reward
        self.update_hyperstate = update_hyperstate
        self.policy_use_hyperstate = policy_use_hyperstate
        self.learn_diff = learn_diff

        if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
            #self.reward_function = real_env_pendulum_reward()
            self.reward_function = ANN(self.state_dim + self.action_dim, 1)
            self.placeholders_reward = [
                tf.placeholder(shape=v.shape, dtype=tf.float64)
                for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           self.reward_function.scope)
            ]
            self.assign_ops0 = [
                v.assign(pl) for v, pl in zip(
                    tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      self.reward_function.scope),
                    self.placeholders_reward)
            ]
        elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
            self.reward_function = mountain_car_continuous_reward_function()

        #self.hyperstate_dim = sum([(basis_dim*(basis_dim+1))/2 + basis_dim for basis_dim in self.basis_dims])
        self.hyperstate_dim = sum(
            [basis_dim * (basis_dim + 1) for basis_dim in self.basis_dims])

        self.random_projection_matrix = np.random.normal(
            loc=0.,
            scale=1. / np.sqrt(self.state_dim),
            size=[self.hyperstate_dim, self.state_dim])

        input_dim = self.state_dim
        if self.policy_use_hyperstate == 1:
            input_dim *= 2

        self.w1 = np.concatenate([
            np.random.normal(size=[input_dim, self.hidden_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
        ], axis=0)
        self.w2 = np.concatenate([
            np.random.normal(size=[self.hidden_dim, self.hidden_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
        ], axis=0)
        self.w3 = np.concatenate([
            np.random.normal(size=[self.hidden_dim, self.action_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.action_dim])
        ], axis=0)

        self.thetas = self._pack([self.w1, self.w2, self.w3])

        self.sizes = [[input_dim + 1, self.hidden_dim],
                      [self.hidden_dim + 1, self.hidden_dim],
                      [self.hidden_dim + 1, self.action_dim]]

        w1, w2, w3 = self._unpack(self.thetas, self.sizes)
        np.testing.assert_equal(w1, self.w1)
        np.testing.assert_equal(w2, self.w2)
        np.testing.assert_equal(w3, self.w3)

    def _pack(self, thetas):
        return np.concatenate([theta.flatten() for theta in thetas])

    def _unpack(self, thetas, sizes):
        sidx = 0
        weights = []
        for size in sizes:
            i, j = size
            w = thetas[sidx:sidx + i * j].reshape([i, j])
            sidx += i * j
            weights.append(w)
        return weights

    def _forward(self, thetas, X, hyperstate):
        #"Old" method of including hyperstate into policy network.
        '''
        w0, w1, w2, w3 = self._unpack(thetas, self.sizes)
        XXtr, Xytr = hyperstate
        A = [xx + noise for xx, noise in zip(XXtr, self.noises)]
        wn = [solve(a, xy) for a, xy in zip(A, Xytr)]
        indices = [np.triu_indices(basis_dim, 1) for basis_dim in self.basis_dims]
        hyperstate = []
        for i in range(len(X)):
            tmp0 = []
            for j in range(len(A)):
                A[j][i][indices[j]] = np.nan
                tmp1 = A[j][i]
                tmp0.append(tmp1[~np.isnan(tmp1)])
                tmp0.append(np.squeeze(wn[j][i]))
            tmp0 = np.concatenate(tmp0)
            hyperstate.append(tmp0)
        hyperstate = np.stack(hyperstate, axis=0)
        hyperstate = self._add_bias(hyperstate)
        hyperstate_embedding = np.tanh(np.matmul(hyperstate, w0))
        '''
        w1, w2, w3 = self._unpack(thetas, self.sizes)
        #Perform a simple random projection on the hyperstate.
        if self.policy_use_hyperstate == 1:
            hyperstate = np.concatenate([
                np.concatenate([
                    np.reshape(XXtr, [len(XXtr), -1]),
                    np.reshape(Xytr, [len(Xytr), -1])
                ], axis=-1) for XXtr, Xytr in zip(*hyperstate)
            ], axis=-1)
            hyperstate = np.tanh(hyperstate / 50000.)
            hyperstate_embedding = np.matmul(hyperstate,
                                             self.random_projection_matrix)
            hyperstate_embedding = np.tanh(hyperstate_embedding)
            state_hyperstate = np.concatenate([X, hyperstate_embedding],
                                              axis=-1)
            policy_net_input = self._add_bias(state_hyperstate)
        else:
            policy_net_input = self._add_bias(X)
        h1 = np.tanh(np.matmul(policy_net_input, w1))
        h1 = self._add_bias(h1)
        h2 = np.tanh(np.matmul(h1, w2))
        h2 = self._add_bias(h2)
        out = np.tanh(np.matmul(h2, w3))
        out = out * self.action_space_high  #action bounds.
        return out

    def _add_bias(self, X):
        assert len(X.shape) == 2
        return np.concatenate([X, np.ones([len(X), 1])], axis=-1)

    def _relu(self, X):
        return np.maximum(X, 0.)

    def _fit(self, cma_maxiter, X, XXtr, Xytr, hyperparameters, sess):
        warnings.filterwarnings(
            'ignore',
            message=
            '.*scipy.linalg.solve\nIll-conditioned matrix detected. Result is not guaranteed to be accurate.\nReciprocal.*'
        )
        assert len(XXtr) == self.state_dim + self.learn_reward
        assert len(Xytr) == self.state_dim + self.learn_reward
        assert len(hyperparameters) == self.state_dim + self.learn_reward
        if self.use_mean_reward == 1:
            print 'Warning: use_mean_reward is set to True but this flag is not used by this function.'

        X = np.copy(X)
        XXtr = [np.copy(ele) for ele in XXtr]
        Xytr = [np.copy(ele) for ele in Xytr]
        hyperparameters = [np.copy(ele) for ele in hyperparameters]

        X = np.expand_dims(X, axis=1)
        X = np.tile(X, [1, self.no_samples, 1])
        X = np.reshape(X, [-1, self.state_dim])

        Llowers = [
            scipy.linalg.cholesky(
                (hp[-2] / hp[-1])**2 * np.eye(basis_dim) + XX, lower=True)
            for hp, basis_dim, XX in zip(hyperparameters, self.basis_dims,
                                         XXtr)
        ]
        Llowers = [
            np.tile(ele[np.newaxis, ...], [len(X), 1, 1]) for ele in Llowers
        ]
        XXtr = [np.tile(ele[np.newaxis, ...], [len(X), 1, 1]) for ele in XXtr]
        Xytr = [np.tile(ele[np.newaxis, ...], [len(X), 1, 1]) for ele in Xytr]

        self.noises = [
            (hp[2] / hp[3])**2 * np.eye(basis_dim)
            for hp, basis_dim in zip(hyperparameters, self.basis_dims)
        ]

        import cma
        options = {'maxiter': cma_maxiter, 'verb_disp': 1, 'verb_log': 0}
        print 'Before calling cma.fmin'
        res = cma.fmin(self._loss,
                       self.thetas,
                       2.,
                       args=(np.copy(X),
                             [np.copy(ele) for ele in Llowers],
                             [np.copy(ele) for ele in XXtr],
                             [np.copy(ele) for ele in Xytr],
                             None,
                             [np.copy(ele) for ele in hyperparameters],
                             sess),
                       options=options)
        self.thetas = np.copy(res[0])

    def _predict(self, Llower, Xytr, basis, noise_sd):
        '''
        Llower = Llower[0]
        Xytr = Xytr[0]
        basis = np.squeeze(basis, axis=1)
        LinvXT = scipy.linalg.solve_triangular(Llower, basis.T, lower=True)
        pred_sigma = np.sum(np.square(LinvXT), axis=0)*noise_sd**2+noise_sd**2
        pred_sigma = pred_sigma[..., np.newaxis]
        tmp0 = scipy.linalg.solve_triangular(Llower, basis.T, lower=True).T
        tmp1 = scipy.linalg.solve_triangular(Llower, Xytr, lower=True)
        pred_mu = np.matmul(tmp0, tmp1)
        return pred_mu, pred_sigma
        '''
        #TODO: fix this.
        LinvXT = solve_triangular(Llower, np.transpose(basis, [0, 2, 1]))
        pred_sigma = np.sum(np.square(LinvXT),
                            axis=1) * noise_sd**2 + noise_sd**2
        tmp0 = np.transpose(
            solve_triangular(Llower, np.transpose(basis, [0, 2, 1])),
            [0, 2, 1])
        tmp1 = solve_triangular(Llower, Xytr)
        pred_mu = np.matmul(tmp0, tmp1)
        pred_mu = np.squeeze(pred_mu, axis=-1)
        return pred_mu, pred_sigma

    def _loss(self, thetas, X, Llowers, XXtr, Xytr, A=[],
              hyperparameters=None, sess=None):
        rng_state = np.random.get_state()
        X = np.copy(X)
        Llowers = [np.copy(ele) for ele in Llowers]
        XXtr = [np.copy(ele) for ele in XXtr]
        Xytr = [np.copy(ele) for ele in Xytr]
        hyperparameters = [np.copy(ele) for ele in hyperparameters]
        try:
            np.random.seed(2)
            rewards = []
            state = X
            for unroll_step in xrange(self.unroll_steps):
                action = self._forward(thetas, state,
                                       hyperstate=[Llowers, Xytr])
                reward, basis_reward = self._reward(state, action, sess,
                                                    Llowers[-1], Xytr[-1],
                                                    hyperparameters[-1])
                rewards.append((self.discount_factor**unroll_step) * reward)
                state_action = np.concatenate([state, action], axis=-1)

                means = []
                covs = []
                bases = []
                for i in xrange(self.state_dim):
                    length_scale, signal_sd, noise_sd, prior_sd = hyperparameters[i]
                    basis = _basis(state_action, self.random_matrices[i],
                                   self.biases[i], self.basis_dims[i],
                                   length_scale, signal_sd)
                    basis = np.expand_dims(basis, axis=1)
                    bases.append(basis)
                    pred_mu, pred_sigma = self._predict(Llowers[i], Xytr[i],
                                                        basis, noise_sd)
                    means.append(pred_mu)
                    covs.append(pred_sigma)
                means = np.concatenate(means, axis=-1)
                covs = np.concatenate(covs, axis=-1)
                bases.append(basis_reward)

                state_ = np.stack([
                    np.random.multivariate_normal(mean=mean, cov=np.diag(cov))
                    for mean, cov in zip(means, covs)
                ], axis=0)
                state = state + state_ if self.learn_diff else state_
                if self.learn_diff == 0:
                    state_ = np.clip(state_, self.observation_space_low,
                                     self.observation_space_high)
                state = np.clip(state, self.observation_space_low,
                                self.observation_space_high)

                # #Removable
                # import copy
                # Llowers2 = copy.deepcopy(Llowers)
                # Xytr2 = copy.deepcopy(Xytr)
                # XXtr2 = copy.deepcopy(XXtr)
                # #Removable -END-

                if self.update_hyperstate == 1 or self.policy_use_hyperstate == 1:
                    y = np.concatenate([state_, reward],
                                       axis=-1)[..., :self.state_dim +
                                                self.learn_reward]
                    y = y[..., np.newaxis, np.newaxis]
                    for i in xrange(self.state_dim + self.learn_reward):
                        Llowers[i] = Llowers[i].transpose([0, 2, 1])
                    for i in xrange(self.state_dim + self.learn_reward):
                        for j in xrange(len(Llowers[i])):
                            cholupdate(Llowers[i][j], bases[i][j, 0].copy())
                        Xytr[i] += np.matmul(bases[i].transpose([0, 2, 1]),
                                             y[:, i, ...])

                        # #Removable
                        # _, _, noise_sd, prior_sd = hyperparameters[i]
                        # XXtr2[i], Xytr2[i], Llowers2[i] = self._update_hyperstate(XXtr2[i], XXtr2[i] + np.matmul(np.transpose(bases[i], [0, 2, 1]), bases[i]), Xytr2[i], Xytr2[i] + np.matmul(np.transpose(bases[i], [0, 2, 1]), y[:, i, ...]), Llowers2[i], (noise_sd/prior_sd)**2)
                        # print i
                        # print np.allclose(Llowers[i], Llowers2[i].transpose([0, 2, 1]))
                        # print np.allclose(Xytr[i], Xytr2[i])
                        # #Removable -END-
                    for i in xrange(self.state_dim + self.learn_reward):
                        Llowers[i] = Llowers[i].transpose([0, 2, 1])

            rewards = np.concatenate(rewards, axis=-1)
            rewards = np.sum(rewards, axis=-1)
            loss = -np.mean(rewards)
            np.random.set_state(rng_state)
            return loss
        except Exception as e:
            np.random.set_state(rng_state)
            print e, 'Returning 10e100'
            return 10e100

    def _update_hyperstate(self, XXold, XXnew, Xyold, Xynew, Llowerold,
                           var_ratio):
        var_diag = var_ratio * np.eye(XXnew.shape[-1])
        XX = []
        Xy = []
        Llower = []
        for i in range(len(XXnew)):
            try:
                tmp = scipy.linalg.cholesky(XXnew[i] + var_diag, lower=True)
                XX.append(XXnew[i].copy())
                Xy.append(Xynew[i].copy())
                Llower.append(tmp.copy())
            except Exception as e:
                XX.append(XXold[i].copy())
                Xy.append(Xyold[i].copy())
                Llower.append(Llowerold[i].copy())
        XX = np.stack(XX, axis=0)
        Xy = np.stack(Xy, axis=0)
        Llower = np.stack(Llower, axis=0)
        return XX, Xy, Llower

    def _reward(self, state, action, sess, Llower, Xy, hyperparameters):
        basis = None
        if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
            reward = self.reward_function.build_np(sess, state, action)
        elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
            reward = self.reward_function.build_np(state, action)
        else:
            state_action = np.concatenate([state, action], axis=-1)
            length_scale, signal_sd, noise_sd, prior_sd = hyperparameters
            basis = _basis(state_action, self.random_matrices[-1],
                           self.biases[-1], self.basis_dims[-1], length_scale,
                           signal_sd)
            basis = np.expand_dims(basis, axis=1)
            pred_mu, pred_sigma = self._predict(Llower, Xy, basis, noise_sd)
            if self.use_mean_reward == 1:
                pred_sigma = np.zeros_like(pred_sigma)
            reward = np.stack([
                np.random.normal(loc=loc, scale=scale)
                for loc, scale in zip(pred_mu, pred_sigma)
            ], axis=0)
        return reward, basis
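# The hyperstate update in Agent._loss above transposes each lower-triangular
# factor before calling the external `cholupdate`, which suggests a classic
# MATLAB-style rank-1 update on an upper-triangular factor. The sketch below is
# an assumed, illustrative reimplementation of that convention, not the repo's
# own routine; it only assumes the factor is positive definite.
import numpy as np


def cholupdate_sketch(R, x):
    """In-place rank-1 update so that R_new.T @ R_new == R.T @ R + outer(x, x).

    R is upper triangular; x is modified internally (a copy is taken).
    """
    x = x.copy()
    n = len(x)
    for k in range(n):
        r = np.hypot(R[k, k], x[k])
        c = r / R[k, k]
        s = x[k] / R[k, k]
        R[k, k] = r
        if k + 1 < n:
            # Update the remainder of row k, then fold it back into x.
            R[k, k + 1:] = (R[k, k + 1:] + s * x[k + 1:]) / c
            x[k + 1:] = c * x[k + 1:] - s * R[k, k + 1:]
    return R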
class Agent:
    def __init__(self, environment, x_dim, y_dim, state_dim, action_dim,
                 observation_space_low, observation_space_high,
                 action_space_low, action_space_high, unroll_steps,
                 no_samples, discount_factor, random_matrix_state, bias_state,
                 basis_dim_state, random_matrix_reward, bias_reward,
                 basis_dim_reward, hidden_dim=32, learn_reward=0,
                 use_mean_reward=0, update_hyperstate=1,
                 policy_use_hyperstate=1, learn_diff=0, dump_model=0):
        #assert environment in ['Pendulum-v0', 'MountainCarContinuous-v0']
        assert x_dim == state_dim + action_dim
        assert len(action_space_low.shape) == 1
        np.testing.assert_equal(-action_space_low, action_space_high)

        self.environment = environment
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.observation_space_low = observation_space_low
        self.observation_space_high = observation_space_high
        self.action_space_low = action_space_low
        self.action_space_high = action_space_high
        self.unroll_steps = unroll_steps
        self.no_samples = no_samples
        self.discount_factor = discount_factor
        self.random_matrix_state = random_matrix_state
        self.bias_state = bias_state
        self.basis_dim_state = basis_dim_state
        self.random_matrix_reward = random_matrix_reward
        self.bias_reward = bias_reward
        self.basis_dim_reward = basis_dim_reward
        self.hidden_dim = hidden_dim
        self.learn_reward = learn_reward
        self.use_mean_reward = use_mean_reward
        self.update_hyperstate = update_hyperstate
        self.policy_use_hyperstate = policy_use_hyperstate
        self.learn_diff = learn_diff
        self.dump_model = dump_model

        self.uid = str(uuid.uuid4())
        self.epoch = 0

        if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
            #self.reward_function = real_env_pendulum_reward()
            self.reward_function = ANN(self.state_dim + self.action_dim, 1)
            self.placeholders_reward = [
                tf.placeholder(shape=v.shape, dtype=tf.float64)
                for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           self.reward_function.scope)
            ]
            self.assign_ops0 = [
                v.assign(pl) for v, pl in zip(
                    tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      self.reward_function.scope),
                    self.placeholders_reward)
            ]
        elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
            self.reward_function = mountain_car_continuous_reward_function()

        self.hyperstate_dim = self.basis_dim_state * (self.basis_dim_state +
                                                      self.state_dim)
        if self.learn_reward == 1:
            self.hyperstate_dim += self.basis_dim_reward * (
                self.basis_dim_reward + 1)

        self.random_projection_matrix = np.random.normal(
            loc=0.,
            scale=1. / np.sqrt(self.state_dim),
            size=[self.hyperstate_dim, self.state_dim])

        input_dim = self.state_dim
        if self.policy_use_hyperstate == 1:
            input_dim *= 2

        self.w1 = np.concatenate([
            np.random.normal(size=[input_dim, self.hidden_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
        ], axis=0)
        self.w2 = np.concatenate([
            np.random.normal(size=[self.hidden_dim, self.hidden_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
        ], axis=0)
        self.w3 = np.concatenate([
            np.random.normal(size=[self.hidden_dim, self.action_dim]),
            np.random.uniform(-3e-3, 3e-3, size=[1, self.action_dim])
        ], axis=0)

        self.thetas = self._pack([self.w1, self.w2, self.w3])

        self.sizes = [[input_dim + 1, self.hidden_dim],
                      [self.hidden_dim + 1, self.hidden_dim],
                      [self.hidden_dim + 1, self.action_dim]]

        w1, w2, w3 = self._unpack(self.thetas, self.sizes)
        np.testing.assert_equal(w1, self.w1)
        np.testing.assert_equal(w2, self.w2)
        np.testing.assert_equal(w3, self.w3)

    def _pack(self, thetas):
        return np.concatenate([theta.flatten() for theta in thetas])

    def _unpack(self, thetas, sizes):
        sidx = 0
        weights = []
        for size in sizes:
            i, j = size
            w = thetas[sidx:sidx + i * j].reshape([i, j])
            sidx += i * j
            weights.append(w)
        return weights

    def _forward(self, thetas, X, hyperstate_params):
        w1, w2, w3 = self._unpack(thetas, self.sizes)
        #Perform a simple random projection on the hyperstate.
        if self.policy_use_hyperstate == 1:
            Llower_state, Xytr_state, Llower_reward, Xytr_reward = hyperstate_params
            hyperstate = np.concatenate([
                Llower_state.reshape([len(Llower_state), -1]),
                Xytr_state.reshape([len(Xytr_state), -1]),
                Llower_reward.reshape([len(Llower_reward), -1]),
                Xytr_reward.reshape([len(Xytr_reward), -1])
            ], axis=-1)
            hyperstate = np.tanh(hyperstate / 50000.)
            hyperstate_embedding = np.matmul(hyperstate,
                                             self.random_projection_matrix)
            hyperstate_embedding = np.tanh(hyperstate_embedding)
            state_hyperstate = np.concatenate([X, hyperstate_embedding],
                                              axis=-1)
            policy_net_input = self._add_bias(state_hyperstate)
        else:
            policy_net_input = self._add_bias(X)
        h1 = np.tanh(np.matmul(policy_net_input, w1))
        h1 = self._add_bias(h1)
        h2 = np.tanh(np.matmul(h1, w2))
        h2 = self._add_bias(h2)
        out = np.tanh(np.matmul(h2, w3))
        out = out * self.action_space_high  #action bounds.
        return out

    def _add_bias(self, X):
        assert len(X.shape) == 2
        return np.concatenate([X, np.ones([len(X), 1])], axis=-1)

    def _relu(self, X):
        return np.maximum(X, 0.)

    def _fit(self, cma_maxiter, X, XXtr_state, Xytr_state,
             hyperparameters_state, XXtr_reward, Xytr_reward,
             hyperparameters_reward, sess):
        warnings.filterwarnings(
            'ignore',
            message=
            '.*scipy.linalg.solve\nIll-conditioned matrix detected. Result is not guaranteed to be accurate.\nReciprocal.*'
        )
        assert XXtr_state.shape == (self.basis_dim_state, self.basis_dim_state)
        assert Xytr_state.shape == (self.basis_dim_state, self.state_dim)
        assert XXtr_reward.shape == (self.basis_dim_reward,
                                     self.basis_dim_reward)
        assert Xytr_reward.shape == (self.basis_dim_reward, 1)
        assert hyperparameters_state.shape == hyperparameters_reward.shape
        if self.use_mean_reward == 1:
            print(
                'Warning: use_mean_reward is set to True but this flag is not used by this function.'
            )

        #Copy the arrays (just to be safe no overwriting occurs).
        X = X.copy()
        XXtr_state = XXtr_state.copy()
        Xytr_state = Xytr_state.copy()
        hyperparameters_state = hyperparameters_state.copy()
        XXtr_reward = XXtr_reward.copy()
        Xytr_reward = Xytr_reward.copy()
        hyperparameters_reward = hyperparameters_reward.copy()

        X = np.expand_dims(X, axis=1)
        X = np.tile(X, [1, self.no_samples, 1])
        X = np.reshape(X, [-1, self.state_dim])

        #State
        Llower_state = spla.cholesky(
            (hyperparameters_state[-2] / hyperparameters_state[-1])**2 *
            np.eye(self.basis_dim_state) + XXtr_state,
            lower=True)
        Llower_state = np.tile(Llower_state, [len(X), 1, 1])
        XXtr_state = np.tile(XXtr_state, [len(X), 1, 1])
        Xytr_state = np.tile(Xytr_state, [len(X), 1, 1])

        #Reward
        if self.learn_reward:
            Llower_reward = spla.cholesky(
                (hyperparameters_reward[-2] / hyperparameters_reward[-1])**2 *
                np.eye(self.basis_dim_reward) + XXtr_reward,
                lower=True)
            Llower_reward = np.tile(Llower_reward, [len(X), 1, 1])
            XXtr_reward = np.tile(XXtr_reward, [len(X), 1, 1])
            Xytr_reward = np.tile(Xytr_reward, [len(X), 1, 1])

        import cma
        options = {'maxiter': cma_maxiter, 'verb_disp': 1, 'verb_log': 0}
        print('Before calling cma.fmin')
        res = cma.fmin(
            self._loss,
            self.thetas,
            2.,
            args=(X.copy(), Llower_state.copy(), XXtr_state.copy(),
                  Xytr_state.copy(), hyperparameters_state,
                  Llower_reward.copy() if self.learn_reward else None,
                  XXtr_reward.copy() if self.learn_reward else None,
                  Xytr_reward.copy() if self.learn_reward else None,
                  hyperparameters_reward if self.learn_reward else None,
                  sess),
            options=options)
        self.thetas = np.copy(res[0])

        if self.dump_model:
            print('Unique identifier:', self.uid)
            directory = './models/'
            if not os.path.exists(directory):
                os.makedirs(directory)
            with open(
                    directory + self.uid + '_epoch:' + str(self.epoch) + '.p',
                    'wb') as fp:
                pickle.dump(self.thetas, fp)
        self.epoch += 1

    def _predict(self, Llower, Xytr, basis, noise_sd):
        #TODO: fix this.
        LinvXT = solve_triangular(Llower, basis.transpose([0, 2, 1]))
        sigma = np.sum(np.square(LinvXT), axis=1) * noise_sd**2 + noise_sd**2
        tmp0 = solve_triangular(Llower,
                                basis.transpose([0, 2, 1])).transpose(
                                    [0, 2, 1])
        tmp1 = solve_triangular(Llower, Xytr)
        mu = np.matmul(tmp0, tmp1).squeeze(axis=1)
        return mu, sigma

    def _loss(self, thetas, X, Llower_state, XXtr_state, Xytr_state,
              hyperparameters_state, Llower_reward, XXtr_reward, Xytr_reward,
              hyperparameters_reward, sess=None):
        X = X.copy()
        Llower_state = Llower_state.copy()
        XXtr_state = XXtr_state.copy()
        Xytr_state = Xytr_state.copy()
        hyperparameters_state = hyperparameters_state.copy()
        if self.learn_reward:
            Llower_reward = Llower_reward.copy()
            XXtr_reward = XXtr_reward.copy()
            Xytr_reward = Xytr_reward.copy()
            hyperparameters_reward = hyperparameters_reward.copy()

        rng_state = np.random.get_state()
        #try:
        np.random.seed(2)
        rewards = []
        state = X
        for unroll_step in xrange(self.unroll_steps):
            action = self._forward(thetas, state,
                                   hyperstate_params=[
                                       Llower_state, Xytr_state,
                                       Llower_reward, Xytr_reward
                                   ])
            state_action = np.concatenate([state, action], axis=-1)
            reward, basis_reward = self._reward(state, action, state_action,
                                                sess, Llower_reward,
                                                Xytr_reward,
                                                hyperparameters_reward)
            rewards.append((self.discount_factor**unroll_step) * reward)

            length_scale, signal_sd, noise_sd, prior_sd = hyperparameters_state
            basis_state = _basis(state_action, self.random_matrix_state,
                                 self.bias_state, self.basis_dim_state,
                                 length_scale, signal_sd)
            basis_state = basis_state[:, None, ...]
            mu, sigma = self._predict(Llower_state, Xytr_state, basis_state,
                                      noise_sd)
            state_ = mu + np.sqrt(sigma) * np.random.standard_normal(
                size=mu.shape)

            if self.learn_diff:
                state_tmp = state.copy()
                state = np.clip(state + state_, self.observation_space_low,
                                self.observation_space_high)
                state_ = state - state_tmp
            else:
                state_ = np.clip(state_, self.observation_space_low,
                                 self.observation_space_high)
                state = state_.copy()

            if self.update_hyperstate == 1 or self.policy_use_hyperstate == 1:
                #Update state hyperstate
                Llower_state = Llower_state.transpose([0, 2, 1])
                for i in range(len(Llower_state)):
                    cholupdate(Llower_state[i], basis_state[i, 0].copy())
                Llower_state = Llower_state.transpose([0, 2, 1])
                Xytr_state += np.matmul(basis_state.transpose([0, 2, 1]),
                                        state_[..., None, :])

                #Update reward hyperstate
                if self.learn_reward:
                    Llower_reward = Llower_reward.transpose([0, 2, 1])
                    for i in range(len(Llower_reward)):
                        cholupdate(Llower_reward[i], basis_reward[i, 0].copy())
                    Llower_reward = Llower_reward.transpose([0, 2, 1])
                    Xytr_reward += np.matmul(
                        basis_reward.transpose([0, 2, 1]),
                        reward[..., None, :])

        rewards = np.concatenate(rewards, axis=-1)
        rewards = np.sum(rewards, axis=-1)
        loss = -np.mean(rewards)
        np.random.set_state(rng_state)
        return loss
        #except Exception as e:
        #np.random.set_state(rng_state)
        #print e, 'Returning 10e100'
        #return 10e100

    def _reward(self, state, action, state_action, sess, Llower, Xy,
                hyperparameters):
        basis = None
        if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
            reward = self.reward_function.build_np(sess, state, action)
        elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
            reward = self.reward_function.build_np(state, action)
        else:
            #state_action = np.concatenate([state, action], axis=-1)
            length_scale, signal_sd, noise_sd, prior_sd = hyperparameters
            basis = _basis(state_action, self.random_matrix_reward,
                           self.bias_reward, self.basis_dim_reward,
                           length_scale, signal_sd)
            basis = basis[:, None, ...]
            mu, sigma = self._predict(Llower, Xy, basis, noise_sd)
            if self.use_mean_reward == 1:
                sigma = np.zeros_like(sigma)
            reward = mu + np.sqrt(sigma) * np.random.standard_normal(
                size=mu.shape)
        return reward, basis
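# An un-batched NumPy reference for what the two Agent._predict methods above
# are assumed to compute: the Bayesian linear regression posterior predictive
# over random features, where Llower is the lower Cholesky factor of
# (prior precision term + Phi^T Phi) and Xy accumulates Phi^T y. This is a
# minimal sketch for reading the batched solve_triangular code, not the repo's
# routine; the lower=True assumption matches how Llower is built in _fit.
import numpy as np
import scipy.linalg


def blr_posterior_predictive_sketch(Llower, Xy, phi, noise_sd):
    # Llower: [basis_dim, basis_dim], Xy: [basis_dim, y_dim], phi: [basis_dim]
    v = scipy.linalg.solve_triangular(Llower, phi, lower=True)
    w = scipy.linalg.solve_triangular(Llower, Xy, lower=True)
    mu = np.matmul(v, w)                              # predictive mean, [y_dim]
    sigma = np.dot(v, v) * noise_sd**2 + noise_sd**2  # predictive variance
    return mu, sigma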
class blr_model:
    def __init__(self, x_dim, y_dim, state_dim, action_dim,
                 observation_space_low, observation_space_high,
                 action_bound_low, action_bound_high, unroll_steps,
                 no_samples, no_basis, discount_factor,
                 train_policy_batch_size, train_policy_iterations,
                 hyperparameters, debugging_plot):
        assert x_dim == state_dim + action_dim
        assert len(hyperparameters) == y_dim

        self.x_dim = x_dim
        self.y_dim = y_dim
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.observation_space_low = observation_space_low
        self.observation_space_high = observation_space_high
        self.action_bound_low = action_bound_low
        self.action_bound_high = action_bound_high
        self.unroll_steps = unroll_steps
        self.no_samples = no_samples
        self.no_basis = no_basis
        self.discount_factor = discount_factor
        self.train_policy_batch_size = train_policy_batch_size
        self.train_policy_iterations = train_policy_iterations
        self.hyperparameters = hyperparameters
        self.debugging_plot = debugging_plot

        self.policy_scope = 'policy_scope'
        self.policy_reuse_vars = None

        self.models = [
            bayesian_model(self.x_dim, self.observation_space_low,
                           self.observation_space_high, self.action_bound_low,
                           self.action_bound_high, self.no_basis,
                           *self.hyperparameters[i])
            for i in range(self.y_dim)
        ]

        self.states = tf.placeholder(shape=[None, self.state_dim],
                                     dtype=tf.float64)
        self.batch_size = tf.shape(self.states)[0]
        #self.batch_size = 3
        self.actions = self.build_policy(self.states)

        self.cum_xx = [
            tf.tile(tf.expand_dims(model.cum_xx_pl, axis=0),
                    [self.batch_size * self.no_samples, 1, 1])
            for model in self.models
        ]
        self.cum_xy = [
            tf.tile(tf.expand_dims(model.cum_xy_pl, axis=0),
                    [self.batch_size * self.no_samples, 1, 1])
            for model in self.models
        ]

        self.unroll(self.states)
        #self.unroll2(self.states)

    #TODO: for debugging purposes
    def unroll2(self, seed_states):
        assert seed_states.shape.as_list() == [None, self.state_dim]
        no_samples = self.no_samples
        unroll_steps = self.unroll_steps
        #self.reward_model = real_env_pendulum_reward()#Use true model.
        self.reward_model = ANN(self.state_dim + self.action_dim, 1)
        self.placeholders_reward = [
            tf.placeholder(shape=v.shape, dtype=tf.float64)
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.reward_model.scope)
        ]
        self.assign_ops = [
            v.assign(pl) for v, pl in zip(
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  self.reward_model.scope),
                self.placeholders_reward)
        ]

        states = tf.expand_dims(seed_states, axis=1)
        states = tf.tile(states, [1, no_samples, 1])
        states = tf.reshape(states, shape=[-1, self.state_dim])

        costs = []
        self.next_states = []
        for unroll_step in range(unroll_steps):
            actions = self.build_policy(states)

            rewards = (self.discount_factor**
                       unroll_step) * self.reward_model.build(states, actions)
            rewards = tf.reshape(tf.squeeze(rewards, axis=-1),
                                 shape=[-1, no_samples])
            costs.append(-rewards)

            states_actions = tf.concat([states, actions], axis=-1)
            next_states = self.get_next_states2(states_actions)
            self.next_states.append(next_states)
            states = next_states

        costs = tf.stack(costs, axis=-1)
        self.loss = tf.reduce_mean(
            tf.reduce_sum(tf.reduce_mean(costs, axis=1), axis=-1))
        self.opt = tf.train.AdamOptimizer().minimize(
            self.loss,
            var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       'policy_scope'))

    #TODO: for debugging purposes
    def get_next_states(self, states_actions):
        self.string = 'unroll2_gns'
        mu, sigma = [
            tf.concat(e, axis=-1) for e in zip(*[
                model.posterior_predictive_distribution(states_actions, None)
                for model in self.models
            ])
        ]
        self.mus1.append(mu)
        self.sigmas1.append(sigma)
        #print mu.shape
        #print sigma.shape
        next_state = tfd.MultivariateNormalDiag(
            loc=mu, scale_diag=tf.sqrt(sigma)).sample()
        return next_state

    #TODO: for debugging purposes
    def get_next_states2(self, states_actions):
        self.string = 'unroll2_gns2'
        mus = []
        sigmas = []
        for model in self.models:
            mu, sigma = model.mu_sigma(model.cum_xx_pl, model.cum_xy_pl)
            post_pred_mu, post_pred_sigma = model.post_pred2(
                states_actions, mu, sigma)
            mus.append(post_pred_mu)
            sigmas.append(post_pred_sigma)
        mus = tf.concat(mus, axis=-1)
        sigmas = tf.concat(sigmas, axis=-1)
        self.mus2.append(mus)
        self.sigmas2.append(sigmas)
        #print mus.shape
        #print sigmas.shape
        next_state = tfd.MultivariateNormalDiag(
            loc=mus, scale_diag=tf.sqrt(sigmas)).sample()
        return next_state

    def unroll(self, seed_states):
        assert seed_states.shape.as_list() == [None, self.state_dim]
        no_samples = self.no_samples
        unroll_steps = self.unroll_steps
        #self.reward_model = real_env_pendulum_reward()#Use true model.
        self.reward_model = ANN(self.state_dim + self.action_dim, 1)
        self.placeholders_reward = [
            tf.placeholder(shape=v.shape, dtype=tf.float64)
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.reward_model.scope)
        ]
        self.assign_ops = [
            v.assign(pl) for v, pl in zip(
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  self.reward_model.scope),
                self.placeholders_reward)
        ]

        states = tf.expand_dims(seed_states, axis=1)
        states = tf.tile(states, [1, no_samples, 1])
        states = tf.reshape(states, shape=[-1, self.state_dim])

        self.mus0 = []
        self.sigmas0 = []
        self.mus1 = []
        self.sigmas1 = []
        self.mus2 = []
        self.sigmas2 = []

        costs = []
        self.next_states = []
        #ns = []
        #bs = []
        for unroll_step in range(unroll_steps):
            print 'unrolling:', unroll_step
            if self.debugging_plot == True:
                actions = self.build_policy2(states)
            else:
                actions = self.build_policy(states)

            # Reward
            rewards = (self.discount_factor**
                       unroll_step) * self.reward_model.build(states, actions)
            rewards = tf.reshape(tf.squeeze(rewards, axis=-1),
                                 shape=[-1, no_samples])
            costs.append(-rewards)

            states_actions = tf.concat([states, actions], axis=-1)

            mus, sigmas = zip(*[
                self.mu_sigma(self.cum_xx[y], self.cum_xy[y],
                              self.models[y].s, self.models[y].noise_sd)
                for y in range(self.y_dim)
            ])
            bases = [
                model.approx_rbf_kern_basis(states_actions)
                for model in self.models
            ]
            #bs.append(bases)

            mu_pred, sigma_pred = [
                tf.concat(e, axis=-1) for e in zip(*[
                    self.prediction(mu, sigma, basis, model.noise_sd)
                    for mu, sigma, basis, model in zip(mus, sigmas, bases,
                                                       self.models)
                ])
            ]
            self.mus0.append(mu_pred)
            self.sigmas0.append(sigma_pred)
            self.get_next_states(states_actions)
            self.get_next_states2(states_actions)

            next_states = tfd.MultivariateNormalDiag(
                loc=mu_pred, scale_diag=tf.sqrt(sigma_pred)).sample()
            #ns.append(tf.split(next_states, self.y_dim, axis=-1))
            self.next_states.append(
                tf.reshape(next_states,
                           shape=[-1, no_samples, self.state_dim]))

            for y in range(self.y_dim):
                self.update_posterior(bases[y], next_states[..., y:y + 1], y)

            states = next_states

        if self.debugging_plot == False:
            print 'here1'
            costs = tf.stack(costs, axis=-1)
            print 'here2'
            self.loss = tf.reduce_mean(
                tf.reduce_sum(tf.reduce_mean(costs, axis=1), axis=-1))
            print 'here3'
            self.opt = tf.train.AdamOptimizer().minimize(
                self.loss,
                var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           'policy_scope'))
            print 'here4'

        self.string = 'unroll'

    def update_posterior(self, X, y, i):
        X_expanded_dims = tf.expand_dims(X, axis=-1)
        y_expanded_dims = tf.expand_dims(y, axis=-1)
        self.cum_xx[i] += tf.matmul(
            X_expanded_dims, tf.transpose(X_expanded_dims, perm=[0, 2, 1]))
        self.cum_xy[i] += tf.matmul(X_expanded_dims, y_expanded_dims)

    def prediction(self, mu, sigma, basis, noise_sd):
        basis_expanded_dims = tf.expand_dims(basis, axis=-1)
        mu_pred = tf.matmul(tf.transpose(mu, perm=[0, 2, 1]),
                            basis_expanded_dims)
        sigma_pred = tf.square(noise_sd) + tf.matmul(
            tf.matmul(tf.transpose(basis_expanded_dims, perm=[0, 2, 1]),
                      sigma), basis_expanded_dims)
        return tf.squeeze(mu_pred, axis=-1), tf.squeeze(sigma_pred, axis=-1)

    def mu_sigma(self, xx, xy, s, noise_sd):
        noise_sd_sq = tf.square(noise_sd)
        prior_sigma_inv = tf.matrix_inverse(
            tf.tile(
                tf.expand_dims(s * tf.eye(self.no_basis, dtype=tf.float64),
                               axis=0),
                [self.batch_size * self.no_samples, 1, 1]))
        A = tf.matrix_inverse(tf.multiply(noise_sd_sq, prior_sigma_inv) + xx)
        sigma = tf.multiply(noise_sd_sq, A)
        # Assuming that prior mean is zero vector
        mu = tf.matmul(A, xy)
        return mu, sigma

    def mu_sigma2(self, xx, xy, s, noise_sd, bs, ns, idx):
        if bs and ns:
            assert len(zip(*bs)) == self.y_dim
            assert len(zip(*ns)) == self.y_dim
            X = zip(*bs)[idx]
            y = zip(*ns)[idx]

            X = tf.expand_dims(tf.stack(X, axis=0), axis=-1)
            XX = tf.matmul(X, tf.transpose(X, perm=[0, 1, 3, 2]))
            y = tf.expand_dims(tf.stack(y, axis=0), axis=-1)
            Xy = tf.matmul(X, y)

            XX_ = tf.reduce_sum(XX, axis=0)
            Xy_ = tf.reduce_sum(Xy, axis=0)
        else:
            XX_ = 0.
            Xy_ = 0.

        noise_sd_sq = tf.square(noise_sd)
        prior_sigma_inv = tf.matrix_inverse(
            tf.tile(
                tf.expand_dims(s * tf.eye(self.no_basis, dtype=tf.float64),
                               axis=0),
                [self.batch_size * self.no_samples, 1, 1]))
        A = tf.matrix_inverse(
            tf.multiply(noise_sd_sq, prior_sigma_inv) + xx + XX_)
        sigma = tf.multiply(noise_sd_sq, A)
        # Assuming that prior mean is zero vector
        mu = tf.matmul(A, xy + Xy_)
        return mu, sigma

    def update(self, sess, X=None, y=None, memory=None):
        if memory is not None:
            states = np.stack([e[0] for e in memory], axis=0)
            actions = np.stack([e[1] for e in memory], axis=0)
            y = np.stack([e[3] for e in memory], axis=0)
            X = np.concatenate([states, actions], axis=-1)
        for i in range(self.y_dim):
            self.models[i].update(sess, X, y[..., i])

    def act(self, sess, state):
        state = np.atleast_2d(state)
        action = sess.run(self.actions, feed_dict={self.states: state})
        return action[0]

    def train(self, sess, memory):
        feed_dict = {}
        #TODO: for debugging purposes
        if self.string == 'unroll':
            for model in self.models:
                feed_dict[model.cum_xx_pl] = model.cum_xx
                feed_dict[model.cum_xy_pl] = model.cum_xy
                feed_dict[model.mu_placeholder] = model.mu  #for testing
                feed_dict[model.sigma_placeholder] = model.sigma  #for testing
                feed_dict[model.sigma_prior_pl] = model.sigma_prior  #for testing
                feed_dict[model.mu_prior_pl] = model.mu_prior  #for testing
        elif self.string == 'unroll2_gns':
            for model in self.models:
                feed_dict[model.mu_placeholder] = model.mu
                feed_dict[model.sigma_placeholder] = model.sigma
        elif self.string == 'unroll2_gns2':
            for model in self.models:
                feed_dict[model.cum_xx_pl] = model.cum_xx
                feed_dict[model.cum_xy_pl] = model.cum_xy
                feed_dict[model.sigma_prior_pl] = model.sigma_prior
                feed_dict[model.mu_prior_pl] = model.mu_prior

        for it in range(self.train_policy_iterations):
            batch = memory.sample(self.train_policy_batch_size)
            states = np.stack([b[0] for b in batch], axis=0)
            feed_dict[self.states] = states

            mus0, sigmas0, mus1, sigmas1, mus2, sigmas2, next_states, loss, _ = sess.run(
                [
                    self.mus0, self.sigmas0, self.mus1, self.sigmas1,
                    self.mus2, self.sigmas2, self.next_states, self.loss,
                    self.opt
                ],
                feed_dict=feed_dict)
            if loss > 1000.:
                print next_states
            '''
            assert len(mus0) == len(sigmas0)
            assert len(mus0) == len(mus1)
            assert len(mus0) == len(sigmas1)
            assert len(mus0) == len(mus2)
            assert len(mus0) == len(sigmas2)
            '''
            '''
            for mu0, sigma0, mu1, sigma1, mu2, sigma2, ii in zip(mus0, sigmas0, mus1, sigmas1, mus2, sigmas2, range(len(mus0))):
                try:
                    np.testing.assert_almost_equal(sigma1, sigma2, decimal=4)
                except:
                    print ii, 'here0'
                    for i in range(len(sigma1)):
                        for j in range(len(sigma1[i])):
                            print sigma1[i, j], sigma2[i, j]
                    exit()
                try:
                    np.testing.assert_almost_equal(mu1, mu2, decimal=4)
                except:
                    print ii, 'here3',
                    for i in range(len(mu1)):
                        print mu1[i], mu2[i]
                    exit()
                try:
                    np.testing.assert_almost_equal(mu0, mu1, decimal=4)
                except:
                    print ii, 'here1',
                    for i in range(len(mu0)):
                        print mu0[i], mu1[i]
                    exit()
                try:
                    np.testing.assert_almost_equal(mu0, mu2, decimal=4)
                except:
                    print ii, 'here2',
                    for i in range(len(mu0)):
                        print mu0[i], mu2[i]
                    exit()
                try:
                    np.testing.assert_almost_equal(sigma0, sigma1, decimal=4)
                except:
                    print ii, 'here4',
                    for i in range(len(sigma0)):
                        for j in range(len(sigma0[i])):
                            print sigma0[i, j], sigma1[i, j]
                    exit()
                try:
                    np.testing.assert_almost_equal(sigma0, sigma2, decimal=4)
                except:
                    print ii, 'here5',
                    for i in range(len(sigma0)):
                        for j in range(len(sigma0[i])):
                            print sigma0[i, j], sigma2[i, j]
                    exit()
            '''
            print 'iteration:', it, 'loss:', loss, self.string, len(mus0)
            '''
            try:
                mus0, sigmas0, mus1, sigmas1, mus2, sigmas2, next_states, loss, _ = sess.run([self.mus0, self.sigmas0, self.mus1, self.sigmas1, self.mus2, self.sigmas2, self.next_states, self.loss, self.opt], feed_dict=feed_dict)

                assert len(mus0) == len(sigmas0)
                assert len(mus0) == len(mus1)
                assert len(mus0) == len(sigmas1)
                assert len(mus0) == len(mus2)
                assert len(mus0) == len(sigmas2)

                for mu0, sigma0, mu1, sigma1, mu2, sigma2 in zip(mus0, sigmas0, mus1, sigmas1, mus2, sigmas2):
                    np.testing.assert_almost_equal(mu0, mu1)
                    np.testing.assert_almost_equal(mu0, mu2)
                    np.testing.assert_almost_equal(mu1, mu2)
                    np.testing.assert_almost_equal(sigma0, sigma1)
                    np.testing.assert_almost_equal(sigma0, sigma2)
                    np.testing.assert_almost_equal(sigma1, sigma2)

                if loss > 1000.:
                    print next_states

                print 'iteration:', it, 'loss:', loss, self.string
            except:
                print 'training step failed.'
            '''

    def build_policy(self, states):
        assert states.shape.as_list() == [None, self.state_dim]

        #Fully connected layer 1
        fc1 = slim.fully_connected(states,
                                   256,
                                   activation_fn=tf.nn.relu,
                                   scope=self.policy_scope + '/fc1',
                                   reuse=self.policy_reuse_vars)

        #Fully connected layer 2
        fc2 = slim.fully_connected(fc1,
                                   256,
                                   activation_fn=tf.nn.relu,
                                   scope=self.policy_scope + '/fc2',
                                   reuse=self.policy_reuse_vars)

        #Output layer
        output = slim.fully_connected(fc2,
                                      self.action_dim,
                                      activation_fn=tf.nn.tanh,
                                      scope=self.policy_scope + '/output',
                                      reuse=self.policy_reuse_vars)

        #Apply action bounds
        np.testing.assert_array_equal(-self.action_bound_low,
                                      self.action_bound_high)
        action_bound = tf.constant(self.action_bound_high, dtype=tf.float64)
        policy = tf.multiply(output, action_bound)

        #Change flag
        self.policy_reuse_vars = True

        return policy

    def build_policy2(self, states):
        try:
            self.policy
        except:
            self.idx = 0
            self.policy = tf.placeholder(shape=[self.unroll_steps, 1],
                                         dtype=tf.float64)
        action = self.policy[self.idx:self.idx + 1, ...]
        tile_size = tf.shape(states)[0]
        action_tiled = tf.tile(action, [tile_size, 1])
        self.idx += 1
        return action_tiled
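# An un-batched NumPy reference for what blr_model.mu_sigma computes per output
# dimension: the standard Bayesian linear regression weight posterior under a
# zero-mean Gaussian prior with covariance s * I over the basis-function
# weights. This is only an illustrative transcription of the TF code above.
import numpy as np


def blr_weight_posterior_sketch(cum_xx, cum_xy, s, noise_sd):
    # cum_xx: [no_basis, no_basis] accumulated Phi^T Phi
    # cum_xy: [no_basis, 1]        accumulated Phi^T y
    prior_sigma_inv = np.linalg.inv(s * np.eye(len(cum_xx)))
    A = np.linalg.inv(noise_sd**2 * prior_sigma_inv + cum_xx)
    sigma = noise_sd**2 * A        # posterior covariance of the weights
    mu = np.matmul(A, cum_xy)      # posterior mean (zero prior mean)
    return mu, sigma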
class direct_policy_search:
    def __init__(self, state_dim, action_dim, action_bound_high, \
                 action_bound_low, unroll_length, discount_factor, \
                 gradient_descent_steps, scope):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound_high = action_bound_high
        self.action_bound_low = action_bound_low
        self.unroll_length = unroll_length
        self.discount_factor = discount_factor
        self.gradient_descent_steps = gradient_descent_steps
        self.scope = scope

        #Make sure bounds are same (assumption can be relaxed later)
        np.testing.assert_array_equal(-self.action_bound_low,
                                      self.action_bound_high)

        #Flags
        self.policy_reuse_vars = None

        '''
        self.reward_model = ANN(self.state_dim+self.action_dim, 1)
        self.placeholders_reward = [tf.placeholder(shape=v.shape, dtype=tf.float64)
                                    for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.reward_model.scope)]
        self.assign_ops0 = [v.assign(pl) for v, pl in zip(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.reward_model.scope), self.placeholders_reward)]
        '''

        #self.reward_model = real_env_pendulum_reward()
        self.reward_model = mountain_car_continuous_reward_function()

        #self.state_model = real_env_pendulum_state()
        #self.state_model = mountain_car_continuous_state_function()
        self.state_model = ANN(self.state_dim + self.action_dim,
                               self.state_dim)
        self.placeholders_state = [
            tf.placeholder(shape=v.shape, dtype=tf.float64)
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.state_model.scope)
        ]
        self.assign_ops1 = [
            v.assign(pl) for v, pl in zip(
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  self.state_model.scope),
                self.placeholders_state)
        ]

        #Build computational graph (i.e., unroll policy)
        #self.states = tf.placeholder(shape=[None, self.state_dim], dtype=tf.float32)
        self.states = tf.placeholder(shape=[None, self.state_dim],
                                     dtype=tf.float64)

        self.action = self.build_policy(self.states)

        state = self.states
        action = self.build_policy(state)

        rewards = []
        for i in range(self.unroll_length):
            print i
            #reward = pow(self.discount_factor, i) * self.reward_model.build(state, action)
            #reward = pow(self.discount_factor, i) * self.reward_model.step_tf(state, action)
            reward = pow(self.discount_factor, i) * self.reward_model.sigmoid_approx(state, action)
            rewards.append(reward)

            state = self.state_model.build(state, action)
            #state = self.state_model.step_tf(state, action)
            action = self.build_policy(state)

        rewards = tf.reduce_sum(tf.stack(rewards, axis=-1), axis=-1)
        print 'here0'
        self.loss = -tf.reduce_mean(tf.reduce_sum(rewards, axis=-1))
        print 'here1'
        self.opt = tf.train.AdamOptimizer().minimize(
            self.loss,
            var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.scope))
        print 'here2'

    def act(self, sess, states):
        states = np.atleast_2d(states)
        #print sess.run(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
        action = sess.run(self.action, feed_dict={self.states: states})
        return action[0]

    def train(self, sess, states):
        for _ in range(self.gradient_descent_steps):
            loss, _ = sess.run([self.loss, self.opt],
                               feed_dict={self.states: states})
            #asin1, asin2, loss, _ = sess.run([self.asin1, self.asin2, self.loss, self.opt], feed_dict={self.states:states})

    def build_policy(self, states):
        assert states.shape.as_list() == [None, self.state_dim]

        #Fully connected layer 1
        fc1 = slim.fully_connected(states,
                                   256,
                                   activation_fn=tf.nn.relu,
                                   scope=self.scope + '/fc1',
                                   reuse=self.policy_reuse_vars)

        fc2 = slim.fully_connected(fc1,
                                   256,
                                   activation_fn=tf.nn.relu,
                                   scope=self.scope + '/fc2',
                                   reuse=self.policy_reuse_vars)

        #Output layer
        output = slim.fully_connected(fc2,
                                      self.action_dim,
                                      activation_fn=tf.nn.tanh,
                                      scope=self.scope + '/output',
                                      reuse=self.policy_reuse_vars)

        #Apply action bounds
        #action_bound = tf.constant(self.action_bound_high, dtype=tf.float32)
        action_bound = tf.constant(self.action_bound_high, dtype=tf.float64)
        policy = tf.multiply(output, action_bound)

        #Change flag
        self.policy_reuse_vars = True

        return policy
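# Hypothetical driver loop for direct_policy_search (not part of the repo):
# it assumes a gym-style environment whose observation/action dimensions match
# the constructor arguments and uses only the act/train methods defined above.
# The hyperparameter values are placeholders chosen for illustration.
if __name__ == '__main__':
    import gym
    import numpy as np
    import tensorflow as tf

    env = gym.make('MountainCarContinuous-v0')
    agent = direct_policy_search(env.observation_space.shape[0],
                                 env.action_space.shape[0],
                                 env.action_space.high,
                                 env.action_space.low,
                                 unroll_length=20,
                                 discount_factor=.9,
                                 gradient_descent_steps=1,
                                 scope='policy_scope')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        state = env.reset()
        for _ in range(200):
            # Query the policy, step the real environment, then take a
            # gradient step on the unrolled model-based objective.
            action = agent.act(sess, state)
            next_state, reward, done, _ = env.step(action)
            agent.train(sess, np.atleast_2d(state))
            state = next_state
            if done:
                break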