def test_product_space_unflatten_n():
    """Batched flatten/unflatten of a Product space must match the single-sample calls."""
    space = Product([Discrete(3), Discrete(3)])
    sample = (2, 2)

    # flatten vs. flatten_n on a batch of one
    flat_single = space.flatten(sample)
    flat_batch = space.flatten_n([sample])
    np.testing.assert_array_equal(flat_single, flat_batch[0])

    # unflatten vs. unflatten_n on the corresponding flattened values
    restored_single = space.unflatten(space.flatten(sample))
    restored_batch = space.unflatten_n(space.flatten_n([sample]))
    np.testing.assert_array_equal(restored_single, restored_batch[0])
def action_space(self):
    """Return a Product of five Discrete spaces with sizes 11, 11, 11, 2 and 2."""
    component_sizes = (11, 11, 11, 2, 2)
    return Product([Discrete(size) for size in component_sizes])
def action_space(self):
    """
    Build the action space for this environment.

    :rtype: rllab.spaces.base.Space
    :return: a Discrete space with 5 actions.

    NOTE(review): the earlier docstring described the action as the agent's
    desired X,Y location in cartesian-product form; how that maps onto the
    5 discrete actions is not visible from here -- confirm.
    """
    n_actions = 5
    return Discrete(n_actions)
def observation_space(self):
    """
    Build the observation space for this environment.

    The state [Agent X loc, Agent Y loc, Fire 1 alive?, ..., Fire N alive?]
    is folded into one Discrete space whose size is the number of grid cells
    (``n_row * n_col``) times the number of binary fire configurations
    (``2 ** n_fires``).

    :rtype: rllab.spaces.base.Space
    """
    n_cells = self.n_row * self.n_col
    n_fire_configs = 2 ** self.n_fires
    return Discrete(n_cells * n_fire_configs)
def __init__(self, transition_matrix, reward, init_state, terminate_on_reward=False):
    """
    Set up the environment from a 3-D transition array.

    :param transition_matrix: object whose ``shape`` has exactly three entries;
        the first is used as the number of states and the second as the
        number of actions (presumably indexed [state, action, next_state] --
        TODO confirm).
    :param reward: stored unchanged as ``self.reward``.
    :param init_state: stored unchanged as ``self.init_state``.
    :param terminate_on_reward: stored unchanged as ``self.terminate_on_reward``.
    """
    super(DiscreteEnv, self).__init__()

    # The third axis length is not used, but unpacking three values still
    # rejects arrays that are not 3-D.
    n_states, n_actions, _ = transition_matrix.shape

    self.nstates = n_states
    self.nactions = n_actions
    self.transitions = transition_matrix
    self.init_state = init_state
    self.reward = reward
    self.terminate_on_reward = terminate_on_reward

    # Observations live in a unit box with one coordinate per state.
    self.__observation_space = Box(0, 1, shape=(self.nstates,))
    self.__action_space = Discrete(n_actions)
def observation_space(self):
    """
    Returns a Space object: a Product of six Discrete(52) components.

    Component order: player cards 1-3 followed by dealer cards 1-3.
    """
    card_values = 52
    n_player_cards = 3
    n_dealer_cards = 3
    components = [Discrete(card_values) for _ in range(n_player_cards + n_dealer_cards)]
    return Product(*components)
def action_space(self):
    """Return a single Discrete space of size 11 * 11 * 11 * 2 * 2."""
    factor_sizes = (11, 11, 11, 2, 2)
    n_actions = 1
    for size in factor_sizes:
        n_actions *= size
    return Discrete(n_actions)
def __init__(self, ns):
    """
    Create one Discrete space per agent.

    :param ns: sized iterable; each element is passed to ``Discrete`` to
        build that agent's space.
    """
    self.agent_num = len(ns)
    per_agent_spaces = [Discrete(size) for size in ns]
    self.agent_spaces = np.array(per_agent_spaces)
def action_space(self):
    """Return a Discrete space whose size is the number of entries in ``self.policies``."""
    n_policies = len(self.policies)
    return Discrete(n_policies)
def observation_space(self):
    """Return a Discrete observation space of size 4."""
    n_observations = 4
    return Discrete(n_observations)
def action_space(self):
    """Return a Discrete space of size ``self.num_action``."""
    n_actions = self.num_action
    return Discrete(n_actions)
def action_space(self):
    """Return a Discrete space of size ``self.domain.actions_num``."""
    n_actions = self.domain.actions_num
    return Discrete(n_actions)
def action_space(self):
    """Return a Discrete space with one action per entry of ``self._action_set``."""
    n_actions = len(self._action_set)
    return Discrete(n_actions)
def action_space(self):
    """Return a Discrete space with one action per entry of ``self._controllers``."""
    n_controllers = len(self._controllers)
    return Discrete(n_controllers)
def action_space(self):
    """Return a Discrete space built with ``n`` set to ``self._k``."""
    arm_count = self._k
    return Discrete(n=arm_count)
class CategoricalMLPRegressor(LayersPowered, Serializable):
    """
    A class for performing regression (or classification, really) by fitting a categorical distribution to the outputs.
    Assumes that the outputs will be always a one hot vector.

    Besides the regressor interface (``fit`` / ``predict`` / ``predict_log_likelihood``) the class also exposes
    ``get_action`` / ``reset`` / ``terminate``, using ``Discrete`` observation and action spaces derived from
    ``input_shape[0]`` and ``output_dim``.
    """

    def __init__(
            self,
            name,
            input_shape,
            output_dim,
            prob_network=None,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            optimizer=None,
            tr_optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            normalize_inputs=True,
            no_initial_trust_region=True,
    ):
        """
        :param name: Name of the TensorFlow variable scope the graph is built in; also used by ``fit`` as the
         prefix of the logged keys.
        :param input_shape: Shape of the input data. Must be a tuple (it is concatenated with ``(1,)``); its first
         entry is stored as ``input_dim``.
        :param output_dim: Dimension of output.
        :param prob_network: Optional pre-built network; when None an MLP with softmax output is created.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood. Defaults to ``LbfgsOptimizer``.
        :param tr_optimizer: Optimizer used by ``fit`` when the trust region is active. Defaults to
         ``ConjugateGradientOptimizer``.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param normalize_inputs: Whether ``fit`` re-estimates the input mean/std from the data. The graph always
         applies ``(x - x_mean) / x_std``; with this flag off the statistics keep their initial values (0 and 1)
         unless changed elsewhere.
        :param no_initial_trust_region: When True, ``first_optimized`` starts as False, so the first call to ``fit``
         uses the plain optimizer even if ``use_trust_region`` is set.
        """
        # NOTE(review): locals() is captured here while it holds only self and the constructor arguments --
        # presumably what quick_init records; keep this as the first statement.
        Serializable.quick_init(self, locals())

        # NOTE(review): the extent of this `with` block was reconstructed from whitespace-collapsed source;
        # confirm against the original file that the whole constructor body belongs to the scope.
        with tf.variable_scope(name):
            if optimizer is None:
                optimizer = LbfgsOptimizer(name="optimizer")
            if tr_optimizer is None:
                tr_optimizer = ConjugateGradientOptimizer()

            # Spaces consumed by get_action().
            self.input_dim = input_shape[0]
            self.observation_space = Discrete(self.input_dim)
            self.action_space = Discrete(output_dim)

            self.output_dim = output_dim
            self.optimizer = optimizer
            self.tr_optimizer = tr_optimizer

            if prob_network is None:
                prob_network = MLP(
                    input_shape=input_shape,
                    output_dim=output_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=tf.nn.softmax,
                    name="prob_network"
                )

            l_prob = prob_network.output_layer

            LayersPowered.__init__(self, [l_prob])

            # Symbolic inputs: raw xs, target ys and the pre-update probabilities for the KL constraint.
            xs_var = prob_network.input_layer.input_var
            ys_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="ys")
            old_prob_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="old_prob")

            # Input normalization statistics, initialised to mean 0 / std 1.
            x_mean_var = tf.get_variable(
                name="x_mean",
                shape=(1,) + input_shape,
                initializer=tf.constant_initializer(0., dtype=tf.float32)
            )
            x_std_var = tf.get_variable(
                name="x_std",
                shape=(1,) + input_shape,
                initializer=tf.constant_initializer(1., dtype=tf.float32)
            )

            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var

            # Network output evaluated on the normalized inputs.
            prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var})

            old_info_vars = dict(prob=old_prob_var)
            info_vars = dict(prob=prob_var)

            dist = self._dist = Categorical(output_dim)

            mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

            # Negative mean log-likelihood of the targets.
            loss = - tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

            # Arg-max class passed through to_onehot_sym.
            predicted = tensor_utils.to_onehot_sym(tf.argmax(prob_var, axis=1), output_dim)

            self.prob_network = prob_network
            self.f_predict = tensor_utils.compile_function([xs_var], predicted)
            self.f_prob = tensor_utils.compile_function([xs_var], prob_var)
            self.l_prob = l_prob

            # The plain optimizer sees (xs, ys); the trust-region optimizer additionally receives the old
            # probabilities and the (mean_kl, step_size) constraint.
            self.optimizer.update_opt(loss=loss, target=self, network_outputs=[prob_var], inputs=[xs_var, ys_var])
            self.tr_optimizer.update_opt(loss=loss, target=self, network_outputs=[prob_var],
                                         inputs=[xs_var, ys_var, old_prob_var],
                                         leq_constraint=(mean_kl, step_size)
                                         )

            self.use_trust_region = use_trust_region
            self.name = name

            self.normalize_inputs = normalize_inputs
            # NOTE(review): repeats the two assignments made right after the variables were created.
            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var
            self.first_optimized = not no_initial_trust_region

    def fit(self, xs, ys):
        """
        Fit the network to inputs ``xs`` and targets ``ys`` and log the loss before/after.

        Uses the trust-region optimizer when ``use_trust_region`` is set and ``first_optimized`` is True,
        otherwise the plain optimizer. ``first_optimized`` is set to True at the end of every call.
        """
        if self.normalize_inputs:
            # recompute normalizing constants for inputs
            new_mean = np.mean(xs, axis=0, keepdims=True)
            new_std = np.std(xs, axis=0, keepdims=True) + 1e-8  # epsilon avoids division by zero
            # NOTE(review): tf.assign is called on every fit(), i.e. new assign ops are created each time.
            tf.get_default_session().run(tf.group(
                tf.assign(self.x_mean_var, new_mean),
                tf.assign(self.x_std_var, new_std),
            ))
        if self.use_trust_region and self.first_optimized:
            # The KL constraint is measured against the probabilities before this update.
            old_prob = self.f_prob(xs)
            inputs = [xs, ys, old_prob]
            optimizer = self.tr_optimizer
        else:
            inputs = [xs, ys]
            optimizer = self.optimizer
        loss_before = optimizer.loss(inputs)
        if self.name:
            prefix = self.name + "_"
        else:
            prefix = ""
        logger.record_tabular(prefix + 'LossBefore', loss_before)
        optimizer.optimize(inputs)
        loss_after = optimizer.loss(inputs)
        logger.record_tabular(prefix + 'LossAfter', loss_after)
        logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
        self.first_optimized = True

    def predict(self, xs):
        """Return ``f_predict`` (the ``to_onehot_sym`` encoding of the arg-max class) evaluated on ``xs``."""
        return self.f_predict(np.asarray(xs))

    def predict_log_likelihood(self, xs, ys):
        """Return the log-likelihood of ``ys`` under the probabilities predicted for ``xs``."""
        prob = self.f_prob(np.asarray(xs))
        return self._dist.log_likelihood(np.asarray(ys), dict(prob=prob))

    def dist_info_sym(self, x_var):
        """Return ``dict(prob=...)`` with the symbolic network output for ``x_var`` after input normalization."""
        normalized_xs_var = (x_var - self.x_mean_var) / self.x_std_var
        prob = L.get_output(self.l_prob, {self.prob_network.input_layer: normalized_xs_var})
        return dict(prob=prob)

    def log_likelihood_sym(self, x_var, y_var):
        """Return the symbolic log-likelihood of ``y_var`` given ``x_var`` (inputs are normalized first)."""
        normalized_xs_var = (x_var - self.x_mean_var) / self.x_std_var
        prob = L.get_output(self.l_prob, {self.prob_network.input_layer: normalized_xs_var})
        return self._dist.log_likelihood_sym(y_var, dict(prob=prob))

    def get_param_values(self, **tags):
        """Delegate to ``LayersPowered.get_param_values``."""
        return LayersPowered.get_param_values(self, **tags)

    def set_param_values(self, flattened_params, **tags):
        """Delegate to ``LayersPowered.set_param_values``."""
        return LayersPowered.set_param_values(self, flattened_params, **tags)

    def get_action(self, observation):
        """
        Sample an action for a single observation.

        The observation is flattened with ``observation_space``, run through ``f_prob`` as a batch of one,
        and the action is drawn with ``action_space.weighted_sample``.

        :return: tuple ``(action, dict(prob=prob))``
        """
        # observation = np.reshape(observation,(1,self.input_dim))
        flat_obs = self.observation_space.flatten(observation)
        prob = self.f_prob([flat_obs])[0]
        # action = self.f_predict(observation)
        action = self.action_space.weighted_sample(prob)
        # print(self.name,' :',action,' ',prob)
        return action, dict(prob=prob)

    # def get_actions(self, observations):
    #     probs = self.f_prob(observations)
    #     actions = self.f_predict(observations)
    #     return actions, dict(prob=probs)

    def reset(self):
        """No-op."""
        #do nothing
        return

    def terminate(self):
        """No-op."""
        #do nothing
        return

    def reload_initialize(self,sess):
        """Run the initializers of the input mean/std variables in ``sess``."""
        sess.run(self.x_mean_var.initializer)
        sess.run(self.x_std_var.initializer)
def __init__(
        self,
        name,
        input_shape,
        output_dim,
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        optimizer=None,
        tr_optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        no_initial_trust_region=True,
):
    """
    Build the TensorFlow graph for a categorical regressor and register it with both optimizers.

    :param name: Name of the TensorFlow variable scope the graph is built in; also stored as ``self.name``.
    :param input_shape: Shape of the input data. Must be a tuple (it is concatenated with ``(1,)``); its first
     entry is stored as ``input_dim``.
    :param output_dim: Dimension of output.
    :param prob_network: Optional pre-built network; when None an MLP with softmax output is created.
    :param hidden_sizes: Number of hidden units of each layer of the mean network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood. Defaults to ``LbfgsOptimizer``.
    :param tr_optimizer: Optimizer registered with the KL constraint. Defaults to ``ConjugateGradientOptimizer``.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    :param normalize_inputs: Stored as ``self.normalize_inputs``; the graph built here always applies
     ``(x - x_mean) / x_std`` regardless of this flag.
    :param no_initial_trust_region: Stored negated as ``self.first_optimized``.
    """
    # NOTE(review): locals() is captured here while it holds only self and the constructor arguments --
    # presumably what quick_init records; keep this as the first statement.
    Serializable.quick_init(self, locals())

    # NOTE(review): the extent of this `with` block was reconstructed from whitespace-collapsed source;
    # confirm against the original file that the whole constructor body belongs to the scope.
    with tf.variable_scope(name):
        if optimizer is None:
            optimizer = LbfgsOptimizer(name="optimizer")
        if tr_optimizer is None:
            tr_optimizer = ConjugateGradientOptimizer()

        # Discrete spaces sized from the first input dimension and the output dimension.
        self.input_dim = input_shape[0]
        self.observation_space = Discrete(self.input_dim)
        self.action_space = Discrete(output_dim)

        self.output_dim = output_dim
        self.optimizer = optimizer
        self.tr_optimizer = tr_optimizer

        if prob_network is None:
            prob_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=tf.nn.softmax,
                name="prob_network"
            )

        l_prob = prob_network.output_layer

        LayersPowered.__init__(self, [l_prob])

        # Symbolic inputs: raw xs, target ys and the pre-update probabilities for the KL constraint.
        xs_var = prob_network.input_layer.input_var
        ys_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="ys")
        old_prob_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="old_prob")

        # Input normalization statistics, initialised to mean 0 / std 1.
        x_mean_var = tf.get_variable(
            name="x_mean",
            shape=(1,) + input_shape,
            initializer=tf.constant_initializer(0., dtype=tf.float32)
        )
        x_std_var = tf.get_variable(
            name="x_std",
            shape=(1,) + input_shape,
            initializer=tf.constant_initializer(1., dtype=tf.float32)
        )

        self.x_mean_var = x_mean_var
        self.x_std_var = x_std_var

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        # Network output evaluated on the normalized inputs.
        prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var})

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical(output_dim)

        mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

        # Negative mean log-likelihood of the targets.
        loss = - tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

        # Arg-max class passed through to_onehot_sym.
        predicted = tensor_utils.to_onehot_sym(tf.argmax(prob_var, axis=1), output_dim)

        self.prob_network = prob_network
        self.f_predict = tensor_utils.compile_function([xs_var], predicted)
        self.f_prob = tensor_utils.compile_function([xs_var], prob_var)
        self.l_prob = l_prob

        # The plain optimizer sees (xs, ys); the trust-region optimizer additionally receives the old
        # probabilities and the (mean_kl, step_size) constraint.
        self.optimizer.update_opt(loss=loss, target=self, network_outputs=[prob_var], inputs=[xs_var, ys_var])
        self.tr_optimizer.update_opt(loss=loss, target=self, network_outputs=[prob_var],
                                     inputs=[xs_var, ys_var, old_prob_var],
                                     leq_constraint=(mean_kl, step_size)
                                     )

        self.use_trust_region = use_trust_region
        self.name = name

        self.normalize_inputs = normalize_inputs
        # NOTE(review): repeats the two assignments made right after the variables were created.
        self.x_mean_var = x_mean_var
        self.x_std_var = x_std_var
        self.first_optimized = not no_initial_trust_region
def action_space(self):
    """Return a Discrete space of size ``self.problem.num_actions``."""
    n_actions = self.problem.num_actions
    return Discrete(n_actions)
def test_product_space():
    """Product can be built from a list or from positional components; its samples are contained in it."""
    # Construction from a single list argument must not raise.
    components_as_list = [Discrete(3), Discrete(2)]
    Product(components_as_list)

    # Construction from positional components, then a sample/contains round trip.
    space = Product(Discrete(3), Discrete(2))
    drawn = space.sample()
    assert space.contains(drawn)
def action_space(self):
    """Return a Discrete space with one action per fire to go to, plus one for STAY."""
    stay_actions = 1
    n_actions = NUM_FIRES + stay_actions
    return Discrete(n_actions)
def action_space(self):
    """Return a Discrete action space with two actions."""
    n_actions = 2
    return Discrete(n_actions)
def observation_space(self):
    """Return a Discrete space of size ``n_row * n_col``."""
    n_cells = self.n_row * self.n_col
    return Discrete(n_cells)
def action_space(self):
    """Return a Discrete space with one action per fire to go to (5), plus one for STAY."""
    fire_actions = 5
    stay_actions = 1
    return Discrete(fire_actions + stay_actions)
def action_space(self):
    """
    Returns a Space object: a Discrete space with two actions.
    """
    n_actions = 2
    return Discrete(n_actions)