def unroll2(self, seed_states):
    assert seed_states.shape.as_list() == [None, self.state_dim]
    no_samples = self.no_samples
    unroll_steps = self.unroll_steps

    # self.reward_model = real_env_pendulum_reward()  # Use the true reward model instead.
    self.reward_model = ANN(self.state_dim + self.action_dim, 1)
    self.placeholders_reward = [
        tf.placeholder(shape=v.shape, dtype=tf.float64)
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   self.reward_model.scope)
    ]
    self.assign_ops = [
        v.assign(pl) for v, pl in zip(
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                              self.reward_model.scope),
            self.placeholders_reward)
    ]

    # Replicate each seed state no_samples times and flatten into one batch of particles.
    states = tf.expand_dims(seed_states, axis=1)
    states = tf.tile(states, [1, no_samples, 1])
    states = tf.reshape(states, shape=[-1, self.state_dim])

    costs = []
    self.next_states = []
    for unroll_step in range(unroll_steps):
        actions = self.build_policy(states)

        # Discounted reward for this step, reshaped to [batch, no_samples].
        rewards = (self.discount_factor ** unroll_step) * \
            self.reward_model.build(states, actions)
        rewards = tf.reshape(tf.squeeze(rewards, axis=-1),
                             shape=[-1, no_samples])
        costs.append(-rewards)

        states_actions = tf.concat([states, actions], axis=-1)
        next_states = self.get_next_states2(states_actions)
        self.next_states.append(next_states)
        states = next_states

    # Mean over samples, sum over time steps, mean over the batch.
    costs = tf.stack(costs, axis=-1)
    self.loss = tf.reduce_mean(
        tf.reduce_sum(tf.reduce_mean(costs, axis=1), axis=-1))
    self.opt = tf.train.AdamOptimizer().minimize(
        self.loss,
        var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   'policy_scope'))
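# Hedged sketch (added for illustration, not part of the original class): the
# expand_dims/tile/reshape sequence in unroll2 replicates each of B seed states
# no_samples times so every particle is rolled out independently. A NumPy
# stand-in for the shape arithmetic:
def _particle_broadcast_demo():
    import numpy as np

    seed_states = np.arange(6, dtype=np.float64).reshape(2, 3)  # B=2, state_dim=3
    no_samples = 4
    states = np.expand_dims(seed_states, axis=1)                # [2, 1, 3]
    states = np.tile(states, [1, no_samples, 1])                # [2, 4, 3]
    states = states.reshape(-1, 3)                              # [8, 3] = [B*no_samples, state_dim]
    assert states.shape == (2 * no_samples, 3)
    return states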
def __init__(self, environment, x_dim, y_dim, state_dim, action_dim,
             observation_space_low, observation_space_high,
             action_space_low, action_space_high, unroll_steps, no_samples,
             discount_factor, random_matrices, biases, basis_dims,
             hidden_dim=32, learn_reward=0, use_mean_reward=0,
             update_hyperstate=1, policy_use_hyperstate=1, learn_diff=0):
    # assert environment in ['Pendulum-v0', 'MountainCarContinuous-v0']
    assert x_dim == state_dim + action_dim
    assert len(action_space_low.shape) == 1
    np.testing.assert_equal(-action_space_low, action_space_high)

    self.environment = environment
    self.x_dim = x_dim
    self.y_dim = y_dim
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.observation_space_low = observation_space_low
    self.observation_space_high = observation_space_high
    self.action_space_low = action_space_low
    self.action_space_high = action_space_high
    self.unroll_steps = unroll_steps
    self.no_samples = no_samples
    self.discount_factor = discount_factor
    self.random_matrices = random_matrices
    self.biases = biases
    self.basis_dims = basis_dims
    self.hidden_dim = hidden_dim
    self.learn_reward = learn_reward
    self.use_mean_reward = use_mean_reward
    self.update_hyperstate = update_hyperstate
    self.policy_use_hyperstate = policy_use_hyperstate
    self.learn_diff = learn_diff

    # Reward: learned ANN for Pendulum, analytic reward function for MountainCar.
    if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
        # self.reward_function = real_env_pendulum_reward()
        self.reward_function = ANN(self.state_dim + self.action_dim, 1)
        self.placeholders_reward = [
            tf.placeholder(shape=v.shape, dtype=tf.float64)
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.reward_function.scope)
        ]
        self.assign_ops0 = [
            v.assign(pl) for v, pl in zip(
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  self.reward_function.scope),
                self.placeholders_reward)
        ]
    elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
        self.reward_function = mountain_car_continuous_reward_function()

    # self.hyperstate_dim = sum([(basis_dim*(basis_dim+1))/2 + basis_dim for basis_dim in self.basis_dims])
    self.hyperstate_dim = sum(
        [basis_dim * (basis_dim + 1) for basis_dim in self.basis_dims])
    # Random projection used to compress the hyperstate down to state_dim.
    self.random_projection_matrix = np.random.normal(
        loc=0., scale=1. / np.sqrt(self.state_dim),
        size=[self.hyperstate_dim, self.state_dim])

    # Policy input is the state, optionally doubled to make room for the projected hyperstate.
    input_dim = self.state_dim
    if self.policy_use_hyperstate == 1:
        input_dim *= 2

    # Policy network weights; each matrix carries an extra row of small uniform values for the bias.
    self.w1 = np.concatenate([
        np.random.normal(size=[input_dim, self.hidden_dim]),
        np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
    ], axis=0)
    self.w2 = np.concatenate([
        np.random.normal(size=[self.hidden_dim, self.hidden_dim]),
        np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
    ], axis=0)
    self.w3 = np.concatenate([
        np.random.normal(size=[self.hidden_dim, self.action_dim]),
        np.random.uniform(-3e-3, 3e-3, size=[1, self.action_dim])
    ], axis=0)

    self.thetas = self._pack([self.w1, self.w2, self.w3])
    self.sizes = [[input_dim + 1, self.hidden_dim],
                  [self.hidden_dim + 1, self.hidden_dim],
                  [self.hidden_dim + 1, self.action_dim]]

    # Sanity check: packing then unpacking must reproduce the weights exactly.
    w1, w2, w3 = self._unpack(self.thetas, self.sizes)
    np.testing.assert_equal(w1, self.w1)
    np.testing.assert_equal(w2, self.w2)
    np.testing.assert_equal(w3, self.w3)
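# Hedged sketch (stand-alone stand-ins; the actual _pack/_unpack methods are not
# shown in this excerpt): the round-trip check at the end of __init__ relies on
# flattening the policy weight matrices into one parameter vector and recovering
# them from the recorded shapes.
def _pack_unpack_demo():
    import numpy as np

    def pack(weights):
        return np.concatenate([w.ravel() for w in weights])

    def unpack(thetas, sizes):
        out, start = [], 0
        for rows, cols in sizes:
            out.append(thetas[start:start + rows * cols].reshape(rows, cols))
            start += rows * cols
        return out

    w1 = np.random.normal(size=[4, 3])
    w2 = np.random.normal(size=[3, 2])
    u1, u2 = unpack(pack([w1, w2]), [[4, 3], [3, 2]])
    np.testing.assert_equal(u1, w1)
    np.testing.assert_equal(u2, w2)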
def unroll(self, seed_states):
    assert seed_states.shape.as_list() == [None, self.state_dim]
    no_samples = self.no_samples
    unroll_steps = self.unroll_steps

    # self.reward_model = real_env_pendulum_reward()  # Use the true reward model instead.
    self.reward_model = ANN(self.state_dim + self.action_dim, 1)
    self.placeholders_reward = [
        tf.placeholder(shape=v.shape, dtype=tf.float64)
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   self.reward_model.scope)
    ]
    self.assign_ops = [
        v.assign(pl) for v, pl in zip(
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                              self.reward_model.scope),
            self.placeholders_reward)
    ]

    # Replicate each seed state no_samples times and flatten into one batch of particles.
    states = tf.expand_dims(seed_states, axis=1)
    states = tf.tile(states, [1, no_samples, 1])
    states = tf.reshape(states, shape=[-1, self.state_dim])

    self.mus0 = []
    self.sigmas0 = []
    self.mus1 = []
    self.sigmas1 = []
    self.mus2 = []
    self.sigmas2 = []

    costs = []
    self.next_states = []
    for unroll_step in range(unroll_steps):
        print('unrolling:', unroll_step)
        if self.debugging_plot:
            actions = self.build_policy2(states)
        else:
            actions = self.build_policy(states)

        # Discounted reward for this step, reshaped to [batch, no_samples].
        rewards = (self.discount_factor ** unroll_step) * \
            self.reward_model.build(states, actions)
        rewards = tf.reshape(tf.squeeze(rewards, axis=-1),
                             shape=[-1, no_samples])
        costs.append(-rewards)

        states_actions = tf.concat([states, actions], axis=-1)

        # Posterior over each output dimension and the RBF bases for the current inputs.
        mus, sigmas = zip(*[
            self.mu_sigma(self.cum_xx[y], self.cum_xy[y], self.models[y].s,
                          self.models[y].noise_sd) for y in range(self.y_dim)
        ])
        bases = [
            model.approx_rbf_kern_basis(states_actions)
            for model in self.models
        ]

        # Predictive mean and variance, concatenated over output dimensions.
        mu_pred, sigma_pred = [
            tf.concat(e, axis=-1) for e in zip(*[
                self.prediction(mu, sigma, basis, model.noise_sd)
                for mu, sigma, basis, model in zip(mus, sigmas, bases,
                                                   self.models)
            ])
        ]
        self.mus0.append(mu_pred)
        self.sigmas0.append(sigma_pred)
        self.get_next_states(states_actions)
        self.get_next_states2(states_actions)

        # Sample next states from the diagonal-Gaussian predictive distribution.
        next_states = tfd.MultivariateNormalDiag(
            loc=mu_pred, scale_diag=tf.sqrt(sigma_pred)).sample()
        self.next_states.append(
            tf.reshape(next_states, shape=[-1, no_samples, self.state_dim]))

        # Update the posterior of each output model with the sampled transition.
        for y in range(self.y_dim):
            self.update_posterior(bases[y], next_states[..., y:y + 1], y)
        states = next_states

    if not self.debugging_plot:
        # Mean over samples, sum over time steps, mean over the batch.
        costs = tf.stack(costs, axis=-1)
        self.loss = tf.reduce_mean(
            tf.reduce_sum(tf.reduce_mean(costs, axis=1), axis=-1))
        self.opt = tf.train.AdamOptimizer().minimize(
            self.loss,
            var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       'policy_scope'))
    self.string = 'unroll'
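# Hedged sketch (illustrative only, not part of the original class): the
# next-state sampling in unroll draws each particle's successor from a diagonal
# Gaussian N(mu_pred, diag(sigma_pred)). A NumPy stand-in for that single step:
def _diag_gaussian_sample_demo():
    import numpy as np

    mu_pred = np.zeros((8, 3))          # [batch * no_samples, state_dim]
    sigma_pred = np.full((8, 3), 0.01)  # per-dimension predictive variance
    noise = np.random.normal(size=mu_pred.shape)
    next_states = mu_pred + np.sqrt(sigma_pred) * noise
    return next_states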
def __init__(self, state_dim, action_dim, action_bound_high,
             action_bound_low, unroll_length, discount_factor,
             gradient_descent_steps, scope):
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.action_bound_high = action_bound_high
    self.action_bound_low = action_bound_low
    self.unroll_length = unroll_length
    self.discount_factor = discount_factor
    self.gradient_descent_steps = gradient_descent_steps
    self.scope = scope

    # Make sure the action bounds are symmetric (assumption can be relaxed later).
    np.testing.assert_array_equal(-self.action_bound_low,
                                  self.action_bound_high)

    # Flags
    self.policy_reuse_vars = None

    '''
    self.reward_model = ANN(self.state_dim+self.action_dim, 1)
    self.placeholders_reward = [tf.placeholder(shape=v.shape, dtype=tf.float64)
                                for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.reward_model.scope)]
    self.assign_ops0 = [v.assign(pl) for v, pl in zip(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.reward_model.scope),
                                                      self.placeholders_reward)]
    '''
    # self.reward_model = real_env_pendulum_reward()
    self.reward_model = mountain_car_continuous_reward_function()

    # Learned state-transition model (analytic alternatives commented out).
    # self.state_model = real_env_pendulum_state()
    # self.state_model = mountain_car_continuous_state_function()
    self.state_model = ANN(self.state_dim + self.action_dim, self.state_dim)
    self.placeholders_state = [
        tf.placeholder(shape=v.shape, dtype=tf.float64)
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   self.state_model.scope)
    ]
    self.assign_ops1 = [
        v.assign(pl) for v, pl in zip(
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                              self.state_model.scope),
            self.placeholders_state)
    ]

    # Build the computational graph (i.e., unroll the policy through the model).
    self.states = tf.placeholder(shape=[None, self.state_dim],
                                 dtype=tf.float64)
    self.action = self.build_policy(self.states)

    state = self.states
    action = self.build_policy(state)
    rewards = []
    for i in range(self.unroll_length):
        print(i)
        # reward = pow(self.discount_factor, i) * self.reward_model.build(state, action)
        # reward = pow(self.discount_factor, i) * self.reward_model.step_tf(state, action)
        reward = pow(self.discount_factor, i) * \
            self.reward_model.sigmoid_approx(state, action)
        rewards.append(reward)

        state = self.state_model.build(state, action)
        # state = self.state_model.step_tf(state, action)
        action = self.build_policy(state)

    rewards = tf.reduce_sum(tf.stack(rewards, axis=-1), axis=-1)
    self.loss = -tf.reduce_mean(tf.reduce_sum(rewards, axis=-1))
    self.opt = tf.train.AdamOptimizer().minimize(
        self.loss,
        var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   self.scope))
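# Hedged sketch (added for illustration, not part of the original class): the
# placeholder/assign pairs built above follow the TF1 idiom for copying
# externally trained weights into this planner's graph before unrolling. A
# minimal self-contained version with a single dummy variable:
def _placeholder_assign_demo():
    import numpy as np
    import tensorflow as tf  # TF1.x graph mode, matching the rest of the file

    v = tf.get_variable('dummy_weight', shape=[2, 2], dtype=tf.float64)
    pl = tf.placeholder(shape=v.shape, dtype=tf.float64)
    assign_op = v.assign(pl)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        trained_value = np.ones((2, 2))  # stands in for weights trained elsewhere
        sess.run(assign_op, feed_dict={pl: trained_value})
        return sess.run(v)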