def __init__(self, policy_name, env_spec, latent_sampler, hidden_sizes=(32, 32), hidden_nonlinearity=tf.nn.tanh, prob_network=None):
    """Categorical MLP policy conditioned on a sampled latent variable.

    :param policy_name: name used for the enclosing TF variable scope
    :param env_spec: environment spec (observation/action spaces)
    :param latent_sampler: object exposing ``dim``; the latent sample is
        concatenated to the observation to form the network input
    :param hidden_sizes: sizes of the fully-connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: optional pre-built network; if given, the other
        network-construction arguments are ignored
    """
    Serializable.quick_init(self, locals())
    name = policy_name
    self.latent_sampler = latent_sampler
    with tf.variable_scope(name):
        if prob_network is None:
            # Input is the flat observation concatenated with the latent sample.
            input_dim = env_spec.observation_space.flat_dim + self.latent_sampler.dim
            l_input = L.InputLayer(shape=(None, input_dim), name="input")
            prob_network = MLP(
                input_layer=l_input,
                output_dim=env_spec.action_space.n,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=tf.nn.softmax,
                name="prob_network")
        self._output = prob_network.output
        self._inputs = prob_network.input_var
    super(CategoricalLatentVarMLPPolicy, self).__init__(name=name, env_spec=env_spec, prob_network=prob_network)
def __init__(self, name, env_spec, latent_sampler, hidden_sizes=(32, 32), std_hidden_sizes=(32, 32), min_std=1e-6, std_hidden_nonlinearity=tf.nn.tanh, hidden_nonlinearity=tf.nn.tanh, output_nonlinearity=None):
    """Gaussian MLP policy conditioned on a sampled latent variable.

    :param name: TF variable-scope name
    :param env_spec: environment spec; the action space must be a Box
    :param latent_sampler: object exposing ``dim``; the latent sample is
        concatenated to the observation to form the network input
    :param hidden_sizes: hidden layer sizes of the mean network
    :param std_hidden_sizes: hidden layer sizes of the std network
    :param min_std: NOTE(review): accepted but not used in this body —
        presumably consumed by the superclass via quick_init; confirm
    :param std_hidden_nonlinearity: nonlinearity of the std network layers
    :param hidden_nonlinearity: nonlinearity of the mean network layers
    :param output_nonlinearity: nonlinearity of the mean network output
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)
    self.latent_sampler = latent_sampler
    with tf.variable_scope(name):
        # Network input is the flat observation plus the latent sample.
        obs_dim = env_spec.observation_space.flat_dim + self.latent_sampler.dim
        action_dim = env_spec.action_space.flat_dim
        # create networks
        mean_network = MLP(
            name="mean_network",
            input_shape=(obs_dim, ),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )
        # The std network shares the mean network's input layer so both
        # heads consume the same placeholder.
        std_network = MLP(
            name="std_network",
            input_shape=(obs_dim, ),
            input_layer=mean_network.input_layer,
            output_dim=action_dim,
            hidden_sizes=std_hidden_sizes,
            hidden_nonlinearity=std_hidden_nonlinearity,
            output_nonlinearity=None,
        )
    super(GaussianLatentVarMLPPolicy, self).__init__(name=name, env_spec=env_spec, mean_network=mean_network, std_network=std_network)
def get_policy_network(env):
    """Build the softmax policy MLP for the given environment.

    The network maps a flat observation to a probability vector over
    actions (softmax output), using two ReLU hidden layers.
    """
    return MLP(
        name='mean_network',
        output_dim=env.action_space.flat_dim,
        input_shape=env.observation_space.shape,
        output_nonlinearity=tf.nn.softmax,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(64, 32),
        batch_normalization=False,
    )
def get_value_network(env):
    """Build the scalar state-value MLP for the given environment.

    A single ReLU hidden layer maps the flat observation to one linear
    output (no output nonlinearity).
    """
    return MLP(
        name='value_network',
        output_dim=1,
        input_shape=env.observation_space.shape,
        output_nonlinearity=None,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(32, ),
        batch_normalization=False,
    )
def get_policy_network(env):
    """Build the (linear-output) policy MLP used with TRPO.

    The hidden nonlinearity is chosen by ``get_nonlinearity_for_trpo``;
    the output layer is linear.
    """
    return MLP(
        name='mean_network',
        output_dim=env.action_space.flat_dim,
        input_shape=env.observation_space.shape,
        output_nonlinearity=None,
        hidden_nonlinearity=get_nonlinearity_for_trpo(),
        hidden_sizes=(64, 32),
        batch_normalization=False,
    )
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=tf.nn.tanh,
        mean_network=None,
):
    """
    :param env_spec: environment spec; the action space must be a Box
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean; if given,
        the other network-construction arguments are ignored
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)
    with tf.variable_scope(name):
        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim
        # create network
        if mean_network is None:
            mean_network = MLP(
                name="mean_network",
                input_shape=(obs_dim, ),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self._mean_network = mean_network
        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var
        self._l_mean = l_mean
        # deterministic=True: actions come from the mean, with no
        # stochastic layers (e.g. dropout/batch-norm noise) active.
        action_var = L.get_output(self._l_mean, deterministic=True)
        LayersPowered.__init__(self, [l_mean])
        super(DeterministicMLPPolicy, self).__init__(env_spec)
        # Compiled obs -> action function for fast rollouts.
        self._f_actions = tensor_utils.compile_function(
            inputs=[obs_var],
            outputs=action_var,
        )
def _make_subnetwork(self, input_layer, dim_output, hidden_sizes, output_nonlinearity=tf.sigmoid, name="pred_network"):
    """Attach an MLP prediction head to ``input_layer``.

    Hidden layers use leaky ReLU; the output nonlinearity is
    configurable (sigmoid by default).

    :return: tuple ``(output_tensor, output_layer)`` of the new head
    """
    net = MLP(
        name=name,
        input_layer=input_layer,
        output_dim=dim_output,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=lrelu,
        output_nonlinearity=output_nonlinearity,
    )
    head = net.output_layer
    return L.get_output(head), head
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        prob_network=None,
):
    """
    :param env_spec: A spec for the mdp. The action space must be Discrete.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy, other
        network params are ignored
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Discrete)
    with tf.variable_scope(name):
        if prob_network is None:
            # Softmax head over the discrete actions.
            prob_network = MLP(
                input_shape=(env_spec.observation_space.flat_dim,),
                output_dim=env_spec.action_space.n,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=tf.nn.softmax,
                name="prob_network",
            )
        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        # Compiled obs -> action-probability function.
        self._f_prob = tensor_utils.compile_function(
            [prob_network.input_layer.input_var],
            L.get_output(prob_network.output_layer)
        )
    self._dist = Categorical(env_spec.action_space.n)
    super(CategoricalMLPPolicy, self).__init__(env_spec)
    LayersPowered.__init__(self, [prob_network.output_layer])
def __init__(self, name, env_spec, hidden_sizes=(32, 32), hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh, prob_network=None, bn=False): Serializable.quick_init(self, locals()) ## Apply MC Dropout on the MLP networks here with tf.variable_scope(name): if prob_network is None: prob_network = MLP( input_shape=(env_spec.observation_space.flat_dim, ), output_dim=env_spec.action_space.flat_dim, hidden_sizes=hidden_sizes, hidden_nonlinearity=hidden_nonlinearity, output_nonlinearity=output_nonlinearity, # batch_normalization=True, name="prob_network", ) self._l_prob = prob_network.output_layer self._l_obs = prob_network.input_layer self._f_prob = tensor_utils.compile_function( [prob_network.input_layer.input_var], L.get_output(prob_network.output_layer, deterministic=True)) self.prob_network = prob_network # Note the deterministic=True argument. It makes sure that when getting # actions from single observations, we do not update params in the # batch normalization layers. # TODO: this doesn't currently work properly in the tf version so we leave out batch_norm super(DeterministicMLPPolicy, self).__init__(env_spec) LayersPowered.__init__(self, [prob_network.output_layer])
def __init__(self, name, abstract_dim, hidden_sizes=(32,), min_std=1e-6, hidden_nonlinearity=tf.nn.tanh, output_nonlinearity=None, optim=tf.train.AdamOptimizer(learning_rate=0.001)):
    """Diagonal-Gaussian density model over an abstract state space.

    An MLP maps a state to the concatenated mean and raw log-std of a
    Gaussian over the next state; training minimizes the negative
    log-likelihood of observed next states.

    :param name: TF variable-scope name
    :param abstract_dim: dimensionality of the (abstract) state space
    :param hidden_sizes: hidden layer sizes. (Fixed: the old default
        ``(32)`` is the int 32, not a tuple, which breaks MLP's
        iteration over layer sizes.)
    :param min_std: lower bound on the predicted standard deviation
    :param optim: optimizer instance. NOTE(review): the default is
        evaluated once at def time, so every instance constructed with
        the default shares one optimizer object — confirm intended.
    """
    self.obs_dim = abstract_dim
    self.min_std = min_std
    self.distribution = DiagonalGaussian(self.obs_dim)
    with tf.variable_scope(name):
        self.net = MLP(
            name="mu_log_sigma",
            # BUG FIX: MLP expects a shape tuple, not a bare int.
            input_shape=(self.obs_dim, ),
            output_dim=2 * self.obs_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )
        self.obs_var = self.net.input_layer.input_var
        self.output = L.get_output(self.net.output_layer, self.obs_var)
        # First half of the output is the mean, second half the raw log-std.
        self.mu, unstable_log_sigma = tf.split(self.output, [self.obs_dim, self.obs_dim], 1)
        # BUG FIX: min_std bounds the *standard deviation*, so the log-std
        # must be clamped at log(min_std). The old code clamped the log-std
        # at min_std itself (1e-6), which forced sigma >= exp(1e-6) ~ 1.
        self.log_sigma = tf.maximum(unstable_log_sigma, tf.log(self.min_std))
        self.nexts = tf.placeholder(tf.float32, shape=(None, self.obs_dim))
        # Negative log-likelihood of the observed next states.
        self.loss = -self.distribution.log_likelihood(
            self.nexts, dist_info=dict(mean=self.mu, log_stds=self.log_sigma))
        self.optimizer = optim
        self.train_op = optim.minimize(self.loss)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # don't grab all GPU memory up front
    self.sess = tf.Session(config=config)
    self.sess.run(tf.global_variables_initializer())
def _make_subnetwork(self, input_layer, dim_output, hidden_sizes, output_nonlinearity=tf.sigmoid, hidden_nonlinearity=tf.nn.tanh, name="pred_network", conv_filters=None, conv_filter_sizes=None, conv_strides=None, conv_pads=None, input_shape=None):
    """Attach a prediction head to ``input_layer``.

    When ``conv_filters`` is given, the input is reshaped to
    ``input_shape`` and fed through a ConvNetwork; otherwise a plain MLP
    is used.

    :return: the output layer of the new sub-network
    """
    if conv_filters is None:
        net = MLP(
            name=name,
            input_layer=input_layer,
            output_dim=dim_output,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )
        return net.output_layer

    # Restore the spatial structure before the convolutional stack
    # ([0] keeps the batch dimension symbolic).
    reshaped = L.reshape(input_layer, ([0], ) + input_shape, name="reshape_input")
    net = ConvNetwork(
        name=name,
        input_shape=input_shape,
        input_layer=reshaped,
        output_dim=dim_output,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=output_nonlinearity,
        conv_filters=conv_filters,
        conv_filter_sizes=conv_filter_sizes,
        conv_strides=conv_strides,
        conv_pads=conv_pads,
    )
    return net.output_layer
def __init__(self, name, env_spec, num_models=5, hidden_sizes=(512, 512), hidden_nonlinearity=tf.nn.relu, output_nonlinearity=None, batch_size=500, step_size=0.001, weight_normalization=False, normalize_input=True, optimizer=tf.train.AdamOptimizer, valid_split_ratio=0.2, rolling_average_persitency=0.99):
    """Ensemble of MLP dynamics models predicting observation deltas.

    Builds two computation graphs: one where all models see the same
    (obs, act) batch, and one where each model receives its own slice of
    a stacked batch.

    :param env_spec: environment spec (Box spaces assumed — shapes are
        read via ``.shape[0]``)
    :param num_models: number of models in the ensemble
    :param optimizer: optimizer *class* (instantiated per train op)
    :param rolling_average_persitency: validation-loss smoothing factor
        (NOTE: 'persitency' typo kept — it is part of the public
        signature)
    """
    Serializable.quick_init(self, locals())

    self.normalization = None
    self.normalize_input = normalize_input
    self.next_batch = None

    self.valid_split_ratio = valid_split_ratio
    self.rolling_average_persitency = rolling_average_persitency

    self.batch_size = batch_size
    self.step_size = step_size
    self.num_models = num_models
    self.name = name

    # determine dimensionality of state and action space
    self.obs_space_dims = obs_space_dims = env_spec.observation_space.shape[
        0]
    self.action_space_dims = action_space_dims = env_spec.action_space.shape[
        0]

    """ computation graph for training and simple inference """
    with tf.variable_scope(name):
        # placeholders
        self.obs_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))
        self.act_ph = tf.placeholder(tf.float32, shape=(None, action_space_dims))
        self.delta_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))

        # concatenate action and observation --> NN input
        self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

        # create MLP
        mlps = []
        delta_preds = []
        self.obs_next_pred = []
        for i in range(num_models):
            with tf.variable_scope('model_{}'.format(i)):
                mlp = MLP(name,
                          obs_space_dims,
                          hidden_sizes,
                          hidden_nonlinearity,
                          output_nonlinearity,
                          input_var=self.nn_input,
                          input_shape=(obs_space_dims + action_space_dims, ),
                          weight_normalization=weight_normalization)
                mlps.append(mlp)
                delta_preds.append(mlp.output)

        self.delta_pred = tf.stack(
            delta_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)

        # define loss and train_op
        # Broadcast the target over the model axis so every ensemble
        # member is regressed against the same deltas.
        self.loss = tf.reduce_mean(
            (self.delta_ph[:, :, None] - self.delta_pred)**2)
        self.optimizer = optimizer(self.step_size)
        self.train_op = self.optimizer.minimize(self.loss)

        # tensor_utils
        self.f_delta_pred = tensor_utils.compile_function(
            [self.obs_ph, self.act_ph], self.delta_pred)

    """ computation graph for inference where each of the models receives a different batch"""
    with tf.variable_scope(name, reuse=True):
        # placeholders
        self.obs_model_batches_stack_ph = tf.placeholder(
            tf.float32, shape=(None, obs_space_dims))
        self.act_model_batches_stack_ph = tf.placeholder(
            tf.float32, shape=(None, action_space_dims))
        self.delta_model_batches_stack_ph = tf.placeholder(
            tf.float32, shape=(None, obs_space_dims))

        # split stack into the batches for each model --> assume each model receives a batch of the same size
        self.obs_model_batches = tf.split(self.obs_model_batches_stack_ph,
                                          self.num_models,
                                          axis=0)
        self.act_model_batches = tf.split(self.act_model_batches_stack_ph,
                                          self.num_models,
                                          axis=0)
        self.delta_model_batches = tf.split(
            self.delta_model_batches_stack_ph, self.num_models, axis=0)

        # reuse previously created MLP but each model receives its own batch
        delta_preds = []
        self.obs_next_pred = []
        self.loss_model_batches = []
        self.train_op_model_batches = []
        for i in range(num_models):
            with tf.variable_scope('model_{}'.format(i), reuse=True):
                # concatenate action and observation --> NN input
                nn_input = tf.concat(
                    [self.obs_model_batches[i], self.act_model_batches[i]],
                    axis=1)
                mlp = MLP(name,
                          obs_space_dims,
                          hidden_sizes,
                          hidden_nonlinearity,
                          output_nonlinearity,
                          input_var=nn_input,
                          input_shape=(obs_space_dims + action_space_dims, ),
                          weight_normalization=weight_normalization)
                delta_preds.append(mlp.output)
                # Per-model loss and train op so models can be updated
                # independently on their own batches.
                loss = tf.reduce_mean(
                    (self.delta_model_batches[i] - mlp.output)**2)
                self.loss_model_batches.append(loss)
                self.train_op_model_batches.append(
                    optimizer(self.step_size).minimize(loss))
        self.delta_pred_model_batches_stack = tf.concat(
            delta_preds,
            axis=0)  # shape: (batch_size_per_model*num_models, ndim_obs)

        # tensor_utils
        self.f_delta_pred_model_batches = tensor_utils.compile_function([
            self.obs_model_batches_stack_ph, self.act_model_batches_stack_ph
        ], self.delta_pred_model_batches_stack)

    LayersPowered.__init__(self, [mlp.output_layer for mlp in mlps])
params_log_file = osp.join(log_dir, 'params.json') # logger.log_parameters_lite(params_log_file, args) logger.add_text_output(text_log_file) logger.add_tabular_output(tabular_log_file) prev_snapshot_dir = logger.get_snapshot_dir() prev_mode = logger.get_snapshot_mode() logger.set_snapshot_dir(log_dir) logger.set_snapshot_mode('all') logger.set_log_tabular_only(False) logger.push_prefix("[%s] " % exp_name) feature_network = MLP(name='feature_net', input_shape=(env.spec.observation_space.flat_dim + env.spec.action_space.flat_dim, ), output_dim=7, hidden_nonlinearity=tf.nn.tanh, hidden_sizes=(32, 32), output_nonlinearity=None) policy = GSMDPCategoricalGRUPolicy(feature_network=feature_network, env_spec=env.spec, name="policy") # policy = CategoricalMLPPolicy(env_spec=env.spec, name = "policy") baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO(env=env, policy=policy, baseline=baseline, n_itr=750, max_path_length=100000, batch_size=20000,
def rllab_envpolicy_parser(env, args):
    """Wrap ``env`` for rllab and construct the policy selected by ``args``.

    :param env: raw environment to wrap
    :param args: namespace/dict of experiment options (``algo``,
        ``recurrent``, ``conv``, ``feature_net``, ``policy_hidden``, ...)
    :return: ``(env, policy)`` — or ``(env, policies)`` for concurrent
        control (but see the NOTE at the bottom)
    """
    if isinstance(args, dict):
        args = tonamedtuple(args)

    env = RLLabEnv(env, mode=args.control)
    if args.algo[:2] == 'tf':
        env = TfEnv(env)

        # Policy
        if args.recurrent:
            if args.feature_net:
                # MLP feature extractor over observation + action dims.
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            elif args.conv:
                strides = tuple(args.conv_strides)
                chans = tuple(args.conv_channels)
                filts = tuple(args.conv_filters)
                assert len(strides) == len(chans) == len(
                    filts), "strides, chans and filts not equal"
                # only discrete actions supported, should be straightforward to extend to continuous
                assert isinstance(
                    env.spec.action_space,
                    Discrete), "Only discrete action spaces support conv"
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=args.feature_output,
                    conv_filters=chans,
                    conv_filter_sizes=filts,
                    conv_strides=strides,
                    conv_pads=('VALID', ) * len(chans),
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=None)
            else:
                feature_network = None
            if args.recurrent == 'gru':
                if isinstance(env.spec.action_space, Box):
                    policy = GaussianGRUPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=int(
                                                   args.policy_hidden[0]),
                                               name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    policy = CategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                        name='policy',
                        state_include_action=False if args.conv else True)
                else:
                    raise NotImplementedError(env.spec.observation_space)
            elif args.recurrent == 'lstm':
                if isinstance(env.spec.action_space, Box):
                    policy = GaussianLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    policy = CategoricalLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                else:
                    raise NotImplementedError(env.spec.action_space)
            else:
                raise NotImplementedError(args.recurrent)
        elif args.conv:
            strides = tuple(args.conv_strides)
            chans = tuple(args.conv_channels)
            filts = tuple(args.conv_filters)
            assert len(strides) == len(chans) == len(
                filts), "strides, chans and filts not equal"
            # only discrete actions supported, should be straightforward to extend to continuous
            assert isinstance(
                env.spec.action_space,
                Discrete), "Only discrete action spaces support conv"
            # Conv net acts directly as the policy's probability network.
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=env.spec.action_space.n,
                conv_filters=chans,
                conv_filter_sizes=filts,
                conv_strides=strides,
                conv_pads=('VALID', ) * len(chans),
                hidden_sizes=tuple(args.policy_hidden),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax)
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
        else:
            if isinstance(env.spec.action_space, Box):
                policy = GaussianMLPPolicy(env_spec=env.spec,
                                           hidden_sizes=tuple(
                                               args.policy_hidden),
                                           min_std=args.min_std,
                                           name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                policy = CategoricalMLPPolicy(env_spec=env.spec,
                                              hidden_sizes=tuple(
                                                  args.policy_hidden),
                                              name='policy')
            else:
                raise NotImplementedError(env.spec.action_space)
    elif args.algo[:2] == 'th':
        # Policy
        if args.recurrent:
            if args.feature_net:
                feature_network = thMLP(
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    # NOTE(review): a tf nonlinearity passed to a Theano
                    # network — confirm thMLP accepts this.
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            else:
                feature_network = None
            if args.recurrent == 'gru':
                if isinstance(env.spec.observation_space, thBox):
                    policy = thGaussianGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                    )
                elif isinstance(env.spec.observation_space, thDiscrete):
                    policy = thCategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                    )
                else:
                    raise NotImplementedError(env.spec.observation_space)
            # elif args.recurrent == 'lstm':
            #     if isinstance(env.spec.action_space, thBox):
            #         policy = thGaussianLSTMPolicy(env_spec=env.spec,
            #                                       feature_network=feature_network,
            #                                       hidden_dim=int(args.policy_hidden),
            #                                       name='policy')
            #     elif isinstance(env.spec.action_space, thDiscrete):
            #         policy = thCategoricalLSTMPolicy(env_spec=env.spec,
            #                                          feature_network=feature_network,
            #                                          hidden_dim=int(args.policy_hidden),
            #                                          name='policy')
            #     else:
            #         raise NotImplementedError(env.spec.action_space)
            else:
                raise NotImplementedError(args.recurrent)
        else:
            if args.algo == 'thddpg':
                assert isinstance(env.spec.action_space, thBox)
                policy = thDeterministicMLPPolicy(
                    env_spec=env.spec,
                    hidden_sizes=tuple(args.policy_hidden),
                )
            else:
                if isinstance(env.spec.action_space, thBox):
                    policy = thGaussianMLPPolicy(env_spec=env.spec,
                                                 hidden_sizes=tuple(
                                                     args.policy_hidden),
                                                 min_std=args.min_std)
                elif isinstance(env.spec.action_space, thDiscrete):
                    # NOTE(review): min_std is passed to a *categorical*
                    # policy here — likely a copy-paste error; verify the
                    # constructor accepts it.
                    policy = thCategoricalMLPPolicy(env_spec=env.spec,
                                                    hidden_sizes=tuple(
                                                        args.policy_hidden),
                                                    min_std=args.min_std)
                else:
                    raise NotImplementedError(env.spec.action_space)

    if args.control == 'concurrent':
        # NOTE(review): `policies` is never assigned anywhere in this
        # function — this branch raises NameError as written.
        return env, policies
    else:
        return env, policy
def __init__(
        self,
        name,
        input_shape,
        output_dim,
        mean_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_nonlinearity=None,
        normalize_inputs=True,
        normalize_outputs=True,
        subsample_factor=1.0
):
    """
    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    :param learn_std: Whether to learn the standard deviations. Only effective if adaptive_std is False. If
        adaptive_std is True, this parameter is ignored, and the weights for the std network are always learned.
    :param adaptive_std: Whether to make the std a function of the states.
    :param std_share_network: Whether to use the same network as the mean. NOTE(review): currently unused in
        this body.
    :param std_hidden_sizes: Number of hidden units of each layer of the std network. Only used if
        `std_share_network` is False. It defaults to the same architecture as the mean.
    :param std_nonlinearity: Non-linearity used for each layer of the std network. Only used if
        `std_share_network` is False. It defaults to the same non-linearity as the mean.
    """
    Serializable.quick_init(self, locals())

    with tf.variable_scope(name):
        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer("optimizer")
            else:
                optimizer = LbfgsOptimizer("optimizer")

        self._optimizer = optimizer
        self._subsample_factor = subsample_factor

        if mean_network is None:
            mean_network = MLP(
                name="mean_network",
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=None,
            )

        l_mean = mean_network.output_layer

        if adaptive_std:
            # std predicted from the input by its own network
            l_log_std = MLP(
                name="log_std_network",
                input_shape=input_shape,
                input_var=mean_network.input_layer.input_var,
                output_dim=output_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_nonlinearity,
                output_nonlinearity=None,
            ).output_layer
        else:
            # single (optionally trainable) log-std parameter vector
            l_log_std = L.ParamLayer(
                mean_network.input_layer,
                num_units=output_dim,
                param=tf.constant_initializer(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        LayersPowered.__init__(self, [l_mean, l_log_std])

        xs_var = mean_network.input_layer.input_var
        ys_var = tf.placeholder(dtype=tf.float32, name="ys", shape=(None, output_dim))
        # BUG FIX: this placeholder was also created with name="ys"
        # (copy-paste duplicate), giving the graph node a misleading name.
        old_means_var = tf.placeholder(dtype=tf.float32, name="old_means", shape=(None, output_dim))
        old_log_stds_var = tf.placeholder(dtype=tf.float32, name="old_log_stds",
                                          shape=(None, output_dim))

        # Running input/output normalization statistics (updated externally).
        x_mean_var = tf.Variable(
            np.zeros((1,) + input_shape, dtype=np.float32),
            name="x_mean",
        )
        x_std_var = tf.Variable(
            np.ones((1,) + input_shape, dtype=np.float32),
            name="x_std",
        )
        y_mean_var = tf.Variable(
            np.zeros((1, output_dim), dtype=np.float32),
            name="y_mean",
        )
        y_std_var = tf.Variable(
            np.ones((1, output_dim), dtype=np.float32),
            name="y_std",
        )

        self.x_mean_var = x_mean_var
        self.x_std_var = x_std_var
        self.y_mean_var = y_mean_var
        self.y_std_var = y_std_var

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var
        normalized_ys_var = (ys_var - y_mean_var) / y_std_var

        # The network operates in normalized space; predictions are
        # de-normalized back to the original output scale.
        normalized_means_var = L.get_output(l_mean, {mean_network.input_layer: normalized_xs_var})
        normalized_log_stds_var = L.get_output(l_log_std, {mean_network.input_layer: normalized_xs_var})

        means_var = normalized_means_var * y_std_var + y_mean_var
        log_stds_var = normalized_log_stds_var + tf.log(y_std_var)

        normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
        normalized_old_log_stds_var = old_log_stds_var - tf.log(y_std_var)

        dist = self._dist = DiagonalGaussian(output_dim)

        normalized_dist_info_vars = dict(mean=normalized_means_var, log_std=normalized_log_stds_var)

        # KL between the previous and current predictive distributions
        # (used as the trust-region constraint).
        mean_kl = tf.reduce_mean(dist.kl_sym(
            dict(mean=normalized_old_means_var, log_std=normalized_old_log_stds_var),
            normalized_dist_info_vars,
        ))

        # Negative log-likelihood in normalized space.
        loss = - tf.reduce_mean(dist.log_likelihood_sym(normalized_ys_var, normalized_dist_info_vars))

        self._f_predict = tensor_utils.compile_function([xs_var], means_var)
        self._f_pdists = tensor_utils.compile_function([xs_var], [means_var, log_stds_var])
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[normalized_means_var, normalized_log_stds_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [xs_var, ys_var, old_means_var, old_log_stds_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._normalize_outputs = normalize_outputs
        self._mean_network = mean_network
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
        self._y_mean_var = y_mean_var
        self._y_std_var = y_std_var
        self.input_dim = input_shape[0]
        self.output_dim = output_dim
def main():
    """Run a TRPO experiment on the multi-agent waterworld environment.

    Parses CLI options, builds the environment/policy/baseline, sets up
    logging, and trains.
    """
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name',
                        type=str,
                        default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--discount', type=float, default=0.95)
    parser.add_argument('--gae_lambda', type=float, default=0.99)
    parser.add_argument('--reward_scale', type=float, default=1.0)
    parser.add_argument('--enable_obsnorm', action='store_true', default=False)
    parser.add_argument('--chunked', action='store_true', default=False)
    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum',
                        action='store_true',
                        default=False)
    parser.add_argument('--anneal_step_size', type=int, default=0)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')
    parser.add_argument('--buffer_size', type=int, default=1)
    parser.add_argument('--radius', type=float, default=0.015)
    parser.add_argument('--n_evaders', type=int, default=10)
    parser.add_argument('--n_pursuers', type=int, default=8)
    parser.add_argument('--n_poison', type=int, default=10)
    parser.add_argument('--n_coop', type=int, default=4)
    parser.add_argument('--n_sensors', type=int, default=30)
    parser.add_argument('--sensor_range', type=str, default='0.2')
    parser.add_argument('--food_reward', type=float, default=5)
    parser.add_argument('--poison_reward', type=float, default=-1)
    parser.add_argument('--encounter_reward', type=float, default=0.05)
    parser.add_argument('--reward_mech', type=str, default='local')
    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file',
                        type=str,
                        default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file',
                        type=str,
                        default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file',
                        type=str,
                        default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data',
                        type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode',
                        type=str,
                        default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help=
        'Whether to only print the tabular log information (in a horizontal format)'
    )

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    centralized = True if args.control == 'centralized' else False

    # BUG FIX: on Python 3, np.array(map(...)) wraps the map *iterator* in a
    # 0-d object array and the len() below raises; materialize a list first.
    sensor_range = np.array(list(map(float, args.sensor_range.split(','))))
    if len(sensor_range) == 1:
        sensor_range = sensor_range[0]
    else:
        assert sensor_range.shape == (args.n_pursuers, )

    env = MAWaterWorld(args.n_pursuers,
                       args.n_evaders,
                       args.n_coop,
                       args.n_poison,
                       radius=args.radius,
                       n_sensors=args.n_sensors,
                       food_reward=args.food_reward,
                       poison_reward=args.poison_reward,
                       encounter_reward=args.encounter_reward,
                       reward_mech=args.reward_mech,
                       sensor_range=sensor_range,
                       obstacle_loc=None)

    env = TfEnv(
        RLLabEnv(StandardizedEnv(env,
                                 scale_reward=args.reward_scale,
                                 enable_obsnorm=args.enable_obsnorm),
                 mode=args.control))

    if args.buffer_size > 1:
        env = ObservationBuffer(env, args.buffer_size)

    if args.recurrent:
        feature_network = MLP(
            name='feature_net',
            input_shape=(env.spec.observation_space.flat_dim +
                         env.spec.action_space.flat_dim, ),
            output_dim=16,
            hidden_sizes=(128, 64, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None)
        # BUG FIX: int(args.policy_hidden_sizes) raised ValueError for the
        # default '128,128'; use the first parsed hidden size instead.
        if args.recurrent == 'gru':
            policy = GaussianGRUPolicy(env_spec=env.spec,
                                       feature_network=feature_network,
                                       hidden_dim=args.hidden_sizes[0],
                                       name='policy')
        elif args.recurrent == 'lstm':
            policy = GaussianLSTMPolicy(env_spec=env.spec,
                                        feature_network=feature_network,
                                        hidden_dim=args.hidden_sizes[0],
                                        name='policy')
    else:
        policy = GaussianMLPPolicy(
            name='policy',
            env_spec=env.spec,
            hidden_sizes=args.hidden_sizes,
            min_std=10e-5)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    elif args.baseline_type == 'mlp':
        raise NotImplementedError()
        # baseline = GaussianMLPBaseline(
        #     env_spec=env.spec, hidden_sizes=tuple(map(int, args.baseline_hidden_sizes.split(','))))
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        #max_path_length_limit=args.max_path_length_limit,
        update_max_path_length=args.update_curriculum,
        anneal_step_size=args.anneal_step_size,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)) if args.recurrent else None,
        mode=args.control
        if not args.chunked else 'chunk_{}'.format(args.control),
    )

    algo.train()
def __init__(
        self,
        name,
        input_shape,
        output_dim,
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        optimizer=None,
        tr_optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        no_initial_trust_region=True,
):
    """
    Categorical MLP regressor: fits a softmax distribution over `output_dim`
    classes by maximum likelihood, optionally under a KL trust-region
    constraint against the previous predicted distribution.

    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param prob_network: Optional pre-built softmax network; built here if None.
    :param hidden_sizes: Number of hidden units of each layer of the mean network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param tr_optimizer: Constrained optimizer used when the trust region is active.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    :param normalize_inputs: Whether inputs are whitened with x_mean/x_std variables.
    :param no_initial_trust_region: If True, the first fit skips the trust region.
    """
    Serializable.quick_init(self, locals())

    with tf.variable_scope(name):
        if optimizer is None:
            optimizer = LbfgsOptimizer(name="optimizer")
        if tr_optimizer is None:
            tr_optimizer = ConjugateGradientOptimizer()

        self.input_dim = input_shape[0]
        # Expose gym-style spaces so this regressor can be treated like an env model.
        self.observation_space = Discrete(self.input_dim)
        self.action_space = Discrete(output_dim)
        self.output_dim = output_dim
        self.optimizer = optimizer
        self.tr_optimizer = tr_optimizer

        if prob_network is None:
            prob_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=tf.nn.softmax,
                name="prob_network"
            )

        l_prob = prob_network.output_layer

        LayersPowered.__init__(self, [l_prob])

        xs_var = prob_network.input_layer.input_var
        # One-hot targets and the previous predicted distribution (for the KL constraint).
        ys_var = tf.placeholder(dtype=tf.float32,
                                shape=[None, output_dim],
                                name="ys")
        old_prob_var = tf.placeholder(dtype=tf.float32,
                                      shape=[None, output_dim],
                                      name="old_prob")

        # Input whitening statistics; non-trainable here, updated externally
        # when normalize_inputs is enabled.
        x_mean_var = tf.get_variable(
            name="x_mean",
            shape=(1,) + input_shape,
            initializer=tf.constant_initializer(0., dtype=tf.float32)
        )
        x_std_var = tf.get_variable(
            name="x_std",
            shape=(1,) + input_shape,
            initializer=tf.constant_initializer(1., dtype=tf.float32)
        )

        # NOTE(review): these two attributes are assigned again at the end of
        # this method with identical values — one of the pairs is redundant.
        self.x_mean_var = x_mean_var
        self.x_std_var = x_std_var

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        prob_var = L.get_output(l_prob,
                                {prob_network.input_layer: normalized_xs_var})

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical(output_dim)

        # Trust-region constraint: mean KL between old and new predictions.
        mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

        # Negative log-likelihood of the one-hot targets.
        loss = - tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

        # Hard class prediction as a one-hot vector (argmax of the softmax).
        predicted = tensor_utils.to_onehot_sym(tf.argmax(prob_var, axis=1),
                                               output_dim)

        self.prob_network = prob_network
        self.f_predict = tensor_utils.compile_function([xs_var], predicted)
        self.f_prob = tensor_utils.compile_function([xs_var], prob_var)
        self.l_prob = l_prob

        # Unconstrained optimizer (plain NLL) ...
        self.optimizer.update_opt(loss=loss,
                                  target=self,
                                  network_outputs=[prob_var],
                                  inputs=[xs_var, ys_var])
        # ... and constrained optimizer (NLL subject to mean KL <= step_size).
        self.tr_optimizer.update_opt(loss=loss,
                                     target=self,
                                     network_outputs=[prob_var],
                                     inputs=[xs_var, ys_var, old_prob_var],
                                     leq_constraint=(mean_kl, step_size)
                                     )

        self.use_trust_region = use_trust_region
        self.name = name

        self.normalize_inputs = normalize_inputs
        self.x_mean_var = x_mean_var
        self.x_std_var = x_std_var
        self.first_optimized = not no_initial_trust_region
def __init__(
        self,
        name,
        env_spec,
        num_ensembles=5,
        num_models_per_ensemble=3,
        hidden_sizes=(512, 512),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=None,
        batch_size=500,
        step_size=0.001,
        weight_normalization=False,
        normalize_input=True,
):
    """
    Ensemble-of-ensembles dynamics model: builds
    num_ensembles * num_models_per_ensemble independent MLPs that each
    predict the next-state delta from a concatenated (obs, action) input,
    all trained jointly with a single mean-squared-error objective.

    :param name: tf variable scope for the whole model.
    :param env_spec: provides observation/action space dimensionalities.
    :param num_ensembles: number of ensembles.
    :param num_models_per_ensemble: models assigned to each ensemble.
    :param hidden_sizes: hidden layer widths of each member MLP.
    :param batch_size: minibatch size used when fitting.
    :param step_size: Adam learning rate.
    :param weight_normalization: forwarded to each member MLP.
    :param normalize_input: whether inputs are normalized when fitting.
    """
    Serializable.quick_init(self, locals())

    # Normalization statistics are computed later (e.g. during fit).
    self.normalization = None
    self.normalize_input = normalize_input

    self.batch_size = batch_size
    self.step_size = step_size
    self.num_ensembles = num_ensembles
    self.num_models_per_ensemble = num_models_per_ensemble
    # Total number of member networks across all ensembles.
    self.num_models = num_ensembles * num_models_per_ensemble

    # determine dimensionality of state and action space
    self.obs_space_dims = obs_space_dims = env_spec.observation_space.shape[
        0]
    self.action_space_dims = action_space_dims = env_spec.action_space.shape[
        0]

    # set model - ensemble assignment
    # Ensemble i owns contiguous model indices [i*k, (i+1)*k).
    self.model_ensemble_assignment = [
        list(
            range(i * self.num_models_per_ensemble,
                  (i + 1) * self.num_models_per_ensemble))
        for i in range(self.num_ensembles)
    ]

    with tf.variable_scope(name):
        # placeholders
        self.obs_ph = tf.placeholder(tf.float32,
                                     shape=(None, obs_space_dims))
        self.act_ph = tf.placeholder(tf.float32,
                                     shape=(None, action_space_dims))
        self.delta_ph = tf.placeholder(tf.float32,
                                       shape=(None, obs_space_dims))

        # concatenate action and observation --> NN input
        self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

        # create MLP
        mlps = []
        delta_preds = []
        self.obs_next_pred = []
        for i in range(self.num_models):
            # Each member gets its own 'model_i' scope so weights are independent.
            with tf.variable_scope('model_{}'.format(i)):
                mlp = MLP(name,
                          obs_space_dims,
                          hidden_sizes,
                          hidden_nonlinearity,
                          output_nonlinearity,
                          input_var=self.nn_input,
                          input_shape=(obs_space_dims + action_space_dims, ),
                          weight_normalization=weight_normalization)
                mlps.append(mlp)
                delta_preds.append(mlp.output)

        self.delta_pred = tf.stack(
            delta_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)

        # define loss and train_op
        # The target delta is broadcast against every member's prediction, so
        # one Adam step trains all members at once.
        self.loss = tf.reduce_mean(
            (self.delta_ph[:, :, None] - self.delta_pred)**2)
        self.optimizer = tf.train.AdamOptimizer(self.step_size)
        self.train_op = self.optimizer.minimize(self.loss)

        # tensor_utils
        self.f_delta_pred = tensor_utils.compile_function(
            [self.obs_ph, self.act_ph], self.delta_pred)

    LayersPowered.__init__(self, [mlp.output_layer for mlp in mlps])
def __init__(self,
             name,
             env_spec,
             hidden_sizes=(500, 500),
             hidden_nonlinearity=tf.nn.relu,
             output_nonlinearity=None,
             batch_size=500,
             step_size=0.001,
             weight_normalization=True,
             normalize_input=True,
             optimizer=tf.train.AdamOptimizer,
             valid_split_ratio=0.2,
             rolling_average_persitency=0.99):
    """
    Single feed-forward dynamics model: predicts the next-state delta from a
    concatenated (obs, action) input, trained with mean squared error.

    :param name: tf variable scope for the model.
    :param env_spec: provides observation/action space dimensionalities.
    :param hidden_sizes: hidden layer widths of the MLP.
    :param batch_size: minibatch size used when fitting.
    :param step_size: learning rate handed to `optimizer`.
    :param optimizer: optimizer *class* (instantiated with the step size).
    :param valid_split_ratio: fraction of data held out for validation.
    :param rolling_average_persitency: smoothing factor for validation loss.
    """
    Serializable.quick_init(self, locals())

    # Normalization statistics are filled in later (during fitting).
    self.normalization = None
    self.normalize_input = normalize_input
    self.valid_split_ratio = valid_split_ratio
    self.rolling_average_persitency = rolling_average_persitency

    with tf.variable_scope(name):
        self.batch_size = batch_size
        self.step_size = step_size

        # Dimensionality of the state and action spaces.
        self.obs_space_dims = env_spec.observation_space.shape[0]
        self.action_space_dims = env_spec.action_space.shape[0]
        obs_dim = self.obs_space_dims
        act_dim = self.action_space_dims

        # Graph inputs: observation, action, and the observed state delta.
        self.obs_ph = tf.placeholder(tf.float32, shape=(None, obs_dim))
        self.act_ph = tf.placeholder(tf.float32, shape=(None, act_dim))
        self.delta_ph = tf.placeholder(tf.float32, shape=(None, obs_dim))

        # The network consumes the concatenated (obs, action) pair.
        self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

        # Build the prediction network.
        dynamics_net = MLP(name,
                           obs_dim,
                           hidden_sizes,
                           hidden_nonlinearity,
                           output_nonlinearity,
                           input_var=self.nn_input,
                           input_shape=(obs_dim + act_dim, ),
                           weight_normalization=weight_normalization)
        self.delta_pred = dynamics_net.output

        # Mean-squared-error objective on the predicted delta.
        self.loss = tf.reduce_mean((self.delta_ph - self.delta_pred)**2)
        self.optimizer = optimizer(self.step_size)
        self.train_op = self.optimizer.minimize(self.loss)

        # Compiled forward pass: (obs, act) -> predicted delta.
        self.f_delta_pred = tensor_utils.compile_function(
            [self.obs_ph, self.act_ph], self.delta_pred)

    LayersPowered.__init__(self, [dynamics_net.output_layer])
def init_opt(self):
    """
    Build the optimization graph for an off-policy actor-critic update that
    mixes on-policy and off-policy losses through a gating coefficient
    (`sigma`), which may be a learned gate, a Bernoulli sample, or a decaying
    schedule depending on `self.sigma_type`.

    Populates `self.opt_info` with compiled train functions and the cloned
    target policy / Q-function.
    """
    # First, create "target" policy and Q functions
    with tf.variable_scope("target_policy"):
        target_policy = Serializable.clone(self.policy)
    with tf.variable_scope("target_qf"):
        target_qf = Serializable.clone(self.qf)

    # y need to be computed first
    obs = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1,
    )

    # The yi values are computed separately as above and then passed to
    # the training functions below
    action = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1,
    )

    # BUG FIX: `yvar` was previously created twice with identical arguments;
    # the first placeholder was orphaned (never fed, never used). Create it once.
    yvar = tensor_utils.new_tensor(
        'ys',
        ndim=1,
        dtype=tf.float32,
    )

    obs_offpolicy = self.env.observation_space.new_tensor_variable(
        'obs_offpolicy',
        extra_dims=1,
    )
    action_offpolicy = self.env.action_space.new_tensor_variable(
        'action_offpolicy',
        extra_dims=1,
    )
    yvar_offpolicy = tensor_utils.new_tensor(
        'ys_offpolicy',
        ndim=1,
        dtype=tf.float32,
    )

    # L2 weight decay on the regularizable Q-function parameters.
    qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
        sum([tf.reduce_sum(tf.square(param))
             for param in self.qf.get_params(regularizable=True)])

    qval = self.qf.get_qval_sym(obs, action)
    qval_off = self.qf.get_qval_sym(obs_offpolicy, action_offpolicy)

    # Bellman (TD) losses for the on-policy and off-policy batches.
    qf_loss = tf.reduce_mean(tf.square(yvar - qval))
    qf_loss_off = tf.reduce_mean(tf.square(yvar_offpolicy - qval_off))

    # TODO: penalize dramatic changes in gating_func
    # if PENALIZE_GATING_DISTRIBUTION_DIVERGENCE:

    policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
        sum([tf.reduce_sum(tf.square(param))
             for param in self.policy.get_params(regularizable=True)])

    # Deterministic policy gradient surrogate: maximize Q(s, pi(s)).
    policy_qval = self.qf.get_qval_sym(obs,
                                       self.policy.get_action_sym(obs),
                                       deterministic=True)
    policy_qval_off = self.qf.get_qval_sym(
        obs_offpolicy,
        self.policy.get_action_sym(obs_offpolicy),
        deterministic=True)

    policy_surr = -tf.reduce_mean(policy_qval)
    policy_surr_off = -tf.reduce_mean(policy_qval_off)

    # sigma blends the on-policy and off-policy objectives: 0 -> on-policy only.
    if self.sigma_type == 'unified-gated' or self.sigma_type == 'unified-gated-decaying':
        print("Using Gated Sigma!")
        input_to_gates = tf.concat([obs, obs_offpolicy], axis=1)
        assert input_to_gates.get_shape().as_list()[-1] == obs.get_shape(
        ).as_list()[-1] + obs_offpolicy.get_shape().as_list()[-1]
        # TODO: right now this is a soft-gate, should make a hard-gate (options vs mixtures)
        gating_func = MLP(
            name="sigma_gate",
            output_dim=1,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.sigmoid,
            input_var=input_to_gates,
            input_shape=tuple(
                input_to_gates.get_shape().as_list()[1:])).output
    elif self.sigma_type == 'unified':
        # sample a bernoulli random variable
        print("Using Bernoulli sigma!")
        gating_func = tf.cast(self.random_dist.sample(qf_loss.get_shape()),
                              tf.float32)
    elif self.sigma_type == 'unified-decaying':
        print("Using decaying sigma!")
        gating_func = tf.train.exponential_decay(1.0,
                                                 self.train_step,
                                                 20,
                                                 0.96,
                                                 staircase=True)
    else:
        raise Exception("sigma type not supported")

    qf_inputs_list = [
        yvar, obs, action, yvar_offpolicy, obs_offpolicy, action_offpolicy,
        self.train_step
    ]
    qf_reg_loss = qf_loss * (1.0 - gating_func) + qf_loss_off * (
        gating_func) + qf_weight_decay_term

    policy_input_list = [obs, obs_offpolicy, self.train_step]
    policy_reg_surr = policy_surr * (
        1.0 - gating_func) + policy_surr_off * (
            gating_func) + policy_weight_decay_term

    if self.sigma_type == 'unified-gated-decaying':
        print("Adding a decaying factor to gated sigma!")
        # Penalize large gate activations, with the penalty decaying over time.
        decaying_factor = tf.train.exponential_decay(.5,
                                                     self.train_step,
                                                     20,
                                                     0.96,
                                                     staircase=True)
        penalty = decaying_factor * tf.nn.l2_loss(gating_func)
        qf_reg_loss += penalty
        policy_reg_surr += penalty

    self.qf_update_method.update_opt(qf_reg_loss,
                                     target=self.qf,
                                     inputs=qf_inputs_list)
    self.policy_update_method.update_opt(policy_reg_surr,
                                         target=self.policy,
                                         inputs=policy_input_list)

    f_train_qf = tensor_utils.compile_function(
        inputs=qf_inputs_list,
        outputs=[qf_loss, qval, self.qf_update_method._train_op],
    )
    f_train_policy = tensor_utils.compile_function(
        inputs=policy_input_list,
        outputs=[policy_surr, self.policy_update_method._train_op],
    )

    self.opt_info = dict(
        f_train_qf=f_train_qf,
        f_train_policy=f_train_policy,
        target_qf=target_qf,
        target_policy=target_policy,
    )
def __init__(
        self,
        name,
        input_shape,
        output_dim,
        network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
        optimizer=None,
        normalize_inputs=True,
):
    """
    Deterministic MLP regressor fit by minimizing mean squared error.

    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param network: Optional pre-built network; an MLP is built here if None.
    :param hidden_sizes: Number of hidden units of each layer of the mean network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
    :param optimizer: Optimizer for minimizing the loss (defaults to L-BFGS).
    :param normalize_inputs: Whether inputs are whitened with the stored
        x_mean/x_std variables before the forward pass.
    """
    Serializable.quick_init(self, locals())

    with tf.variable_scope(name):
        if optimizer is None:
            optimizer = LbfgsOptimizer(name="optimizer")

        self.output_dim = output_dim
        self.optimizer = optimizer

        if network is None:
            network = MLP(input_shape=input_shape,
                          output_dim=output_dim,
                          hidden_sizes=hidden_sizes,
                          hidden_nonlinearity=hidden_nonlinearity,
                          output_nonlinearity=output_nonlinearity,
                          name="network")

        l_out = network.output_layer

        LayersPowered.__init__(self, [l_out])

        xs_var = network.input_layer.input_var
        ys_var = tf.placeholder(dtype=tf.float32,
                                shape=[None, output_dim],
                                name="ys")

        # Input whitening statistics; updated externally when normalize_inputs
        # is enabled.
        x_mean_var = tf.get_variable(name="x_mean",
                                     shape=(1, ) + input_shape,
                                     initializer=tf.constant_initializer(
                                         0., dtype=tf.float32))
        x_std_var = tf.get_variable(name="x_std",
                                    shape=(1, ) + input_shape,
                                    initializer=tf.constant_initializer(
                                        1., dtype=tf.float32))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        fit_ys_var = L.get_output(l_out,
                                  {network.input_layer: normalized_xs_var})

        # BUG FIX: the loss was previously negated
        # (`-tf.reduce_mean(tf.square(...))`), so the optimizer *maximized*
        # the squared error instead of fitting the targets. Minimize plain MSE.
        loss = tf.reduce_mean(tf.square(fit_ys_var - ys_var))

        self.f_predict = tensor_utils.compile_function([xs_var], fit_ys_var)

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[fit_ys_var],
        )

        optimizer_args["inputs"] = [xs_var, ys_var]

        self.optimizer.update_opt(**optimizer_args)

        self.name = name
        self.l_out = l_out

        self.normalize_inputs = normalize_inputs
        self.x_mean_var = x_mean_var
        self.x_std_var = x_std_var
def __init__(
        self,
        input_shape,
        output_dim,
        name,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.relu,
        optimizer=None,
        tr_optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        no_initial_trust_region=True,
):
    """
    Bernoulli MLP regressor: fits independent Bernoulli probabilities for each
    of `output_dim` binary targets by maximum likelihood, optionally under a
    KL trust-region constraint against the previous predicted probabilities.

    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    """
    Serializable.quick_init(self, locals())

    with tf.variable_scope(name):
        if optimizer is None:
            optimizer = LbfgsOptimizer(name="optimizer")
        if tr_optimizer is None:
            tr_optimizer = ConjugateGradientOptimizer()

        self.output_dim = output_dim
        self.optimizer = optimizer
        self.tr_optimizer = tr_optimizer

        # Sigmoid head: per-dimension Bernoulli success probabilities.
        p_network = MLP(input_shape=input_shape,
                        output_dim=output_dim,
                        hidden_sizes=hidden_sizes,
                        hidden_nonlinearity=hidden_nonlinearity,
                        output_nonlinearity=tf.nn.sigmoid,
                        name="p_network")

        l_p = p_network.output_layer

        LayersPowered.__init__(self, [l_p])

        xs_var = p_network.input_layer.input_var
        # Binary targets and the previous predicted probabilities (for the KL constraint).
        ys_var = tf.placeholder(dtype=tf.float32,
                                shape=(None, output_dim),
                                name="ys")
        old_p_var = tf.placeholder(dtype=tf.float32,
                                   shape=(None, output_dim),
                                   name="old_p")

        # Input whitening statistics; updated externally when normalize_inputs
        # is enabled. (TF1 accepts an initializer class here and instantiates it.)
        x_mean_var = tf.get_variable(name="x_mean",
                                     initializer=tf.zeros_initializer,
                                     shape=(1, ) + input_shape)
        x_std_var = tf.get_variable(name="x_std",
                                    initializer=tf.ones_initializer,
                                    shape=(1, ) + input_shape)

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        p_var = L.get_output(l_p,
                             {p_network.input_layer: normalized_xs_var})

        old_info_vars = dict(p=old_p_var)
        info_vars = dict(p=p_var)

        dist = self._dist = Bernoulli(output_dim)

        # Trust-region constraint: mean KL between old and new probabilities.
        mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

        # Negative log-likelihood of the binary targets.
        loss = -tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

        # Hard prediction: threshold probabilities at 0.5 (boolean tensor).
        predicted = p_var >= 0.5

        self.f_predict = tensor_utils.compile_function([xs_var], predicted)
        self.f_p = tensor_utils.compile_function([xs_var], p_var)
        self.l_p = l_p

        # Unconstrained optimizer (plain NLL) ...
        self.optimizer.update_opt(loss=loss,
                                  target=self,
                                  network_outputs=[p_var],
                                  inputs=[xs_var, ys_var])
        # ... and constrained optimizer (NLL subject to mean KL <= step_size).
        self.tr_optimizer.update_opt(loss=loss,
                                     target=self,
                                     network_outputs=[p_var],
                                     inputs=[xs_var, ys_var, old_p_var],
                                     leq_constraint=(mean_kl, step_size))

        self.use_trust_region = use_trust_region
        self.name = name

        self.normalize_inputs = normalize_inputs
        self.x_mean_var = x_mean_var
        self.x_std_var = x_std_var
        self.first_optimized = not no_initial_trust_region
def parse_env_args(self, env, args):
    """
    Wrap a raw multi-agent env and construct the matching policy (or
    q-network pair) from CLI-style args.

    Branches on args.recurrent ('gru'/'lstm'), args.conv, and the action
    space type (Box -> Gaussian policies, Discrete -> categorical policies /
    DQN-style q-network dicts).

    :param env: the raw environment to wrap.
    :param args: dict or namespace of experiment options.
    :return: (wrapped_env, policy) where policy may be a single policy, a
        dict with 'q_network'/'target_q_network', or a list of per-agent
        networks.
    """
    if isinstance(args, dict):
        args = to_named_tuple(args)

    # Multi-agent wrapper
    env = RLLabEnv(env, ma_mode=args.control)
    env = MATfEnv(env)

    # Policy
    if args.recurrent:
        # Optional feature extractor feeding the recurrent core.
        if args.feature_net:
            feature_network = MLP(
                name='feature_net',
                input_shape=(env.spec.observation_space.flat_dim +
                             env.spec.action_space.flat_dim, ),
                output_dim=args.feature_output,
                hidden_sizes=tuple(args.feature_hidden),
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None)
        elif args.conv:
            strides = tuple(args.conv_strides)
            chans = tuple(args.conv_channels)
            filts = tuple(args.conv_filters)

            assert len(strides) == len(chans) == len(
                filts), "strides, chans and filts not equal"
            # only discrete actions supported, should be straightforward to extend to continuous
            assert isinstance(
                env.spec.action_space,
                Discrete), "Only discrete action spaces support conv"
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=args.feature_output,
                conv_filters=chans,
                conv_filter_sizes=filts,
                conv_strides=strides,
                conv_pads=('VALID', ) * len(chans),
                hidden_sizes=tuple(args.feature_hidden),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=None)
        else:
            feature_network = None

        if args.recurrent == 'gru':
            if isinstance(env.spec.action_space, Box):
                if args.control == 'concurrent':
                    # One policy per agent (the shared 'policy' below is still built).
                    policies = [
                        GaussianGRUPolicy(env_spec=env.spec,
                                          feature_network=feature_network,
                                          hidden_dim=int(
                                              args.policy_hidden[0]),
                                          name='policy_{}'.format(agid))
                        for agid in range(len(env.agents))
                    ]
                policy = GaussianGRUPolicy(env_spec=env.spec,
                                           feature_network=feature_network,
                                           hidden_dim=int(
                                               args.policy_hidden[0]),
                                           name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                if args.control == 'concurrent':
                    policies = [
                        CategoricalGRUPolicy(
                            env_spec=env.spec,
                            feature_network=feature_network,
                            hidden_dim=int(args.policy_hidden[0]),
                            name='policy_{}'.format(agid),
                            state_include_action=False
                            if args.conv else True)
                        for agid in range(len(env.agents))
                    ]
                # Discrete recurrent control uses a DQN-style pair of networks.
                q_network = CategoricalGRUPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden[0]),
                    name='q_network',
                    state_include_action=False if args.conv else True)
                target_q_network = CategoricalGRUPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden[0]),
                    name='target_q_network',
                    state_include_action=False if args.conv else True)
                policy = {
                    'q_network': q_network,
                    'target_q_network': target_q_network
                }
            else:
                # NOTE(review): this reports the observation space, but the
                # branch dispatches on the *action* space (the lstm branch
                # below raises with env.spec.action_space) — looks like a
                # copy/paste slip; confirm before changing.
                raise NotImplementedError(env.spec.observation_space)

        elif args.recurrent == 'lstm':
            if isinstance(env.spec.action_space, Box):
                if args.control == 'concurrent':
                    policies = [
                        GaussianLSTMPolicy(env_spec=env.spec,
                                           feature_network=feature_network,
                                           hidden_dim=int(
                                               args.policy_hidden),
                                           name='policy_{}'.format(agid))
                        for agid in range(len(env.agents))
                    ]
                policy = GaussianLSTMPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden),
                    name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                if args.control == 'concurrent':
                    policies = [
                        CategoricalLSTMPolicy(
                            env_spec=env.spec,
                            feature_network=feature_network,
                            hidden_dim=int(args.policy_hidden),
                            name='policy_{}'.format(agid))
                        for agid in range(len(env.agents))
                    ]
                q_network = CategoricalLSTMPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden),
                    name='q_network')
                target_q_network = CategoricalLSTMPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden),
                    name='target_q_network')
                policy = {
                    'q_network': q_network,
                    'target_q_network': target_q_network
                }
            else:
                raise NotImplementedError(env.spec.action_space)
        else:
            raise NotImplementedError(args.recurrent)

    elif args.conv:
        # Non-recurrent convolutional policy.
        strides = tuple(args.conv_strides)
        chans = tuple(args.conv_channels)
        filts = tuple(args.conv_filters)

        assert len(strides) == len(chans) == len(
            filts), "strides, chans and filts not equal"
        # only discrete actions supported, should be straightforward to extend to continuous
        assert isinstance(
            env.spec.action_space,
            Discrete), "Only discrete action spaces support conv"
        feature_network = ConvNetwork(
            name='feature_net',
            input_shape=env.spec.observation_space.shape,
            output_dim=env.spec.action_space.n,
            conv_filters=chans,
            conv_filter_sizes=filts,
            conv_strides=strides,
            conv_pads=(args.conv_pads, ) * len(chans),
            hidden_sizes=tuple(args.policy_hidden),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.softmax,
            batch_normalization=args.batch_normalization)
        if args.algo == 'dqn':
            # Note: both networks share the same feature_network instance.
            q_network = CategoricalMLPPolicy(name='q_network',
                                             env_spec=env.spec,
                                             prob_network=feature_network)
            target_q_network = CategoricalMLPPolicy(
                name='target_q_network',
                env_spec=env.spec,
                prob_network=feature_network)
            policy = {
                'q_network': q_network,
                'target_q_network': target_q_network
            }
        else:
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)

    else:
        # Plain feed-forward policies.
        if env.spec is None:
            # Envs without a spec get one independent DQN network per agent.
            networks = [
                DQNNetwork(i,
                           env,
                           target_network_update_freq=self.args.
                           target_network_update,
                           discount_factor=self.args.discount,
                           batch_size=self.args.batch_size,
                           learning_rate=self.args.qfunc_lr)
                for i in range(env.n)
            ]
            policy = networks
        elif isinstance(env.spec.action_space, Box):
            policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=tuple(
                                           args.policy_hidden),
                                       min_std=args.min_std,
                                       name='policy')
        elif isinstance(env.spec.action_space, Discrete):
            policy = CategoricalMLPPolicy(env_spec=env.spec,
                                          hidden_sizes=tuple(
                                              args.policy_hidden),
                                          name='policy')
        else:
            raise NotImplementedError(env.spec.action_space)

    return env, policy
def make_network(self,
                 dim_input,
                 dim_output,
                 nn_input=None,
                 target=None,
                 hidden_sizes=(50, )):
    """
    Build a discriminator MLP over flattened 2-D inputs and attach
    loss/accuracy/precision/recall tensors to `self`.

    Args:
        dim_input: 2d tuple describing the input dimensions
            (num_frames x num_batches).
        dim_output: Dimensionality of the output / targets.
        nn_input: Optional existing input placeholder; created if None.
        target: Optional existing target placeholder; created if None.
        hidden_sizes: Widths of the MLP hidden layers.
    Side effects:
        Sets self.class_target, self.nn_input, self.discrimination_logits,
        self.optimizer, self.loss, self.label_accuracy, self.mse,
        self.label_precision, self.label_recall.
    """
    if nn_input is None:
        nn_input = tf.placeholder('float',
                                  [None, dim_input[0], dim_input[1]],
                                  name='nn_input')

    if target is None:
        target = tf.placeholder('float', [None, dim_output],
                                name='targets')

    l_in = L.InputLayer(shape=(None, ) + tuple(dim_input),
                        input_var=nn_input,
                        name="input")
    prob_network = MLP(output_dim=dim_output,
                       hidden_sizes=hidden_sizes,
                       hidden_nonlinearity=tf.nn.relu,
                       output_nonlinearity=None,
                       name="pred_network",
                       input_layer=l_in)

    fc_output = L.get_output(prob_network.output_layer)

    loss, optimizer = self.get_loss_layer(pred=fc_output,
                                          target_output=target)

    self.class_target = target
    self.nn_input = nn_input
    self.discrimination_logits = fc_output
    self.optimizer = optimizer
    self.loss = loss

    # Fraction of thresholded predictions matching the (rounded) targets.
    label_accuracy = tf.equal(
        tf.round(tf.nn.sigmoid(self.discrimination_logits)),
        tf.round(self.class_target))
    self.label_accuracy = tf.reduce_mean(
        tf.cast(label_accuracy, tf.float32))

    # NOTE: tf.nn.l2_loss returns sum(x**2)/2 (a scalar), so this is the
    # halved *sum* of squared errors, not a mean — kept as-is since callers
    # may rely on the current scale; the reduce_mean over a scalar is a no-op.
    self.mse = tf.reduce_mean(
        tf.nn.l2_loss(
            tf.nn.sigmoid(self.discrimination_logits) -
            self.class_target))

    ones = tf.ones_like(self.class_target)
    # Element is 1 where both prediction and target round to 1.
    true_positives = tf.round(tf.nn.sigmoid(
        self.discrimination_logits)) * tf.round(self.class_target)
    predicted_positives = tf.round(
        tf.nn.sigmoid(self.discrimination_logits))

    # BUG FIX: false negatives were previously computed as
    # not(xor(pred==1, target==1)), which is *agreement* (true positives AND
    # true negatives), inflating the recall denominator. A false negative is
    # a positive target with a negative prediction.
    false_negatives = tf.logical_and(
        tf.equal(tf.round(self.class_target), ones),
        tf.logical_not(
            tf.equal(tf.round(tf.nn.sigmoid(self.discrimination_logits)),
                     ones)))

    # precision = TP / predicted positives
    self.label_precision = tf.reduce_sum(
        tf.cast(true_positives, tf.float32)) / tf.reduce_sum(
            tf.cast(predicted_positives, tf.float32))
    # recall = TP / (TP + FN)
    self.label_recall = tf.reduce_sum(tf.cast(
        true_positives, tf.float32)) / (
            tf.reduce_sum(tf.cast(true_positives, tf.float32)) +
            tf.reduce_sum(tf.cast(false_negatives, tf.float32)))
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=tf.nn.tanh,
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        std_parametrization='exp'
):
    """
    Gaussian MLP policy: a mean network plus a (learned or adaptive) log-std
    head, forming a diagonal Gaussian over a Box action space.

    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std: If True, the std is produced by its own MLP on the
        observation; otherwise it is a single (optionally trainable) parameter.
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers for std
    :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :param std_parametrization: how the std should be parametrized. There are a few options:
        - exp: the logarithm of the std will be stored, and applied a exponential transformation
        - softplus: the std will be computed as log(1+exp(x))
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    with tf.variable_scope(name):
        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            mean_network = MLP(
                name="mean_network",
                input_shape=(obs_dim,),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_std_param = std_network.output_layer
        else:
            if adaptive_std:
                # Observation-dependent std head; shares the mean network's input.
                std_network = MLP(
                    name="std_network",
                    input_shape=(obs_dim,),
                    input_layer=mean_network.input_layer,
                    output_dim=action_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=None,
                )
                l_std_param = std_network.output_layer
            else:
                # Invert the chosen parametrization so the initial std equals init_std.
                if std_parametrization == 'exp':
                    init_std_param = np.log(init_std)
                elif std_parametrization == 'softplus':
                    init_std_param = np.log(np.exp(init_std) - 1)
                else:
                    raise NotImplementedError
                # Single observation-independent std parameter per action dim.
                l_std_param = L.ParamLayer(
                    mean_network.input_layer,
                    num_units=action_dim,
                    param=tf.constant_initializer(init_std_param),
                    name="output_std_param",
                    trainable=learn_std,
                )

        self.std_parametrization = std_parametrization

        # Clamp threshold expressed in the same parametrization as the std head.
        if std_parametrization == 'exp':
            min_std_param = np.log(min_std)
        elif std_parametrization == 'softplus':
            min_std_param = np.log(np.exp(min_std) - 1)
        else:
            raise NotImplementedError

        self.min_std_param = min_std_param

        # mean_var, log_std_var = L.get_output([l_mean, l_std_param])
        #
        # if self.min_std_param is not None:
        #     log_std_var = tf.maximum(log_std_var, np.log(min_std))
        #
        # self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_std_param = l_std_param

        self._dist = DiagonalGaussian(action_dim)

        LayersPowered.__init__(self, [l_mean, l_std_param])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        # Build the symbolic dist info once and compile a mean/log_std forward pass.
        dist_info_sym = self.dist_info_sym(mean_network.input_layer.input_var,
                                           dict())
        mean_var = dist_info_sym["mean"]
        log_std_var = dist_info_sym["log_std"]

        self._f_dist = tensor_utils.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
def main():
    """Parse CLI arguments, build the PursuitEvade environment and policy,
    then train with TRPO.

    Side effects: spawns parallel sampler workers, creates log files under
    the chosen log directory, and runs the full training loop.
    """
    # Unique default experiment name: timestamp + short random suffix.
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=default_exp_name,
                        help='Name of the experiment.')
    # TRPO / sampling hyperparameters.
    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=1.0)
    parser.add_argument('--reward_scale', type=float, default=1.0)
    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum', action='store_true', default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')
    # Environment (PursuitEvade) configuration.
    parser.add_argument('--rectangle', type=str, default='10,10')
    parser.add_argument('--map_type', type=str, default='rectangle')
    parser.add_argument('--n_evaders', type=int, default=5)
    parser.add_argument('--n_pursuers', type=int, default=2)
    parser.add_argument('--obs_range', type=int, default=3)
    parser.add_argument('--n_catch', type=int, default=2)
    parser.add_argument('--urgency', type=float, default=0.0)
    # --pursuit / --evade toggle the same flag; pursuit is the default.
    parser.add_argument('--pursuit', dest='train_pursuit', action='store_true')
    parser.add_argument('--evade', dest='train_pursuit', action='store_false')
    parser.set_defaults(train_pursuit=True)
    parser.add_argument('--surround', action='store_true', default=False)
    parser.add_argument('--constraint_window', type=float, default=1.0)
    parser.add_argument('--sample_maps', action='store_true', default=False)
    parser.add_argument('--map_file', type=str, default='../maps/map_pool.npy')
    parser.add_argument('--flatten', action='store_true', default=False)
    parser.add_argument('--reward_mech', type=str, default='global')
    parser.add_argument('--catchr', type=float, default=0.1)
    parser.add_argument('--term_pursuit', type=float, default=5.0)
    # Policy / baseline configuration.
    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    # NOTE: flag name is misspelled ("baselin") but kept for CLI
    # backward compatibility with existing launch scripts.
    parser.add_argument('--baselin_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--conv', action='store_true', default=False)
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--checkpoint', type=str, default=None)
    # Logging configuration.
    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data', type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), or "none" '
                             '(do not save snapshots)')
    parser.add_argument(
        '--log_tabular_only', type=ast.literal_eval, default=False,
        help=
        'Whether to only print the tabular log information (in a horizontal format)'
    )

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    if args.checkpoint:
        # Resume: restore policy and env from a saved snapshot.
        with tf.Session() as sess:
            data = joblib.load(args.checkpoint)
            policy = data['policy']
            env = data['env']
    else:
        # Build the map pool: either load a pool of maps from disk or
        # construct a single map of the requested type.
        if args.sample_maps:
            map_pool = np.load(args.map_file)
        else:
            if args.map_type == 'rectangle':
                env_map = TwoDMaps.rectangle_map(
                    *map(int, args.rectangle.split(',')))
            elif args.map_type == 'complex':
                env_map = TwoDMaps.complex_map(
                    *map(int, args.rectangle.split(',')))
            else:
                raise NotImplementedError()
            map_pool = [env_map]

        env = PursuitEvade(map_pool,
                           n_evaders=args.n_evaders,
                           n_pursuers=args.n_pursuers,
                           obs_range=args.obs_range,
                           n_catch=args.n_catch,
                           train_pursuit=args.train_pursuit,
                           urgency_reward=args.urgency,
                           surround=args.surround,
                           sample_maps=args.sample_maps,
                           constraint_window=args.constraint_window,
                           flatten=args.flatten,
                           reward_mech=args.reward_mech,
                           catchr=args.catchr,
                           term_pursuit=args.term_pursuit)

        env = TfEnv(
            RLLabEnv(StandardizedEnv(env,
                                     scale_reward=args.reward_scale,
                                     enable_obsnorm=False),
                     mode=args.control))

        if args.recurrent:
            if args.conv:
                # BUGFIX: was `emv.spec...` (undefined name) — must be `env`.
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=5,
                    conv_filters=(16, 32, 32),
                    conv_filter_sizes=(3, 3, 3),
                    conv_strides=(1, 1, 1),
                    conv_pads=('VALID', 'VALID', 'VALID'),
                    hidden_sizes=(64, ),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=tf.nn.softmax)
            else:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=5,
                    hidden_sizes=(256, 128, 64),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            # Recurrent policies take a single hidden_dim, so
            # --policy_hidden_sizes must be a single integer here.
            if args.recurrent == 'gru':
                policy = CategoricalGRUPolicy(env_spec=env.spec,
                                              feature_network=feature_network,
                                              hidden_dim=int(
                                                  args.policy_hidden_sizes),
                                              name='policy')
            elif args.recurrent == 'lstm':
                policy = CategoricalLSTMPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=int(
                                                   args.policy_hidden_sizes),
                                               name='policy')
        elif args.conv:
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=5,
                conv_filters=(8, 16),
                conv_filter_sizes=(3, 3),
                conv_strides=(2, 1),
                conv_pads=('VALID', 'VALID'),
                hidden_sizes=(32, ),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax)
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
        else:
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    # Recurrent policies need the finite-difference HVP conjugate-gradient
    # optimizer; feed-forward policies fall back to TRPO's default (None).
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)) if args.recurrent else None,
        mode=args.control,
    )

    algo.train()