def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, scale=False,
             observation_input_fc=observation_input):
    super(ActorCriticPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                            reuse=reuse, scale=scale,
                                            observation_input_fc=observation_input_fc)
    self._pdtype = make_proba_dist_type(ac_space)
    self._policy = None
    self._proba_distribution = None
    self._value_fn = None
    self._action = None
    self._deterministic_action = None
def __init__(self, sess: tf.Session, tasks: list, ob_spaces: dict, ac_space_dict: dict,
             n_envs_per_task: int, n_steps: int, reuse=False):
    super(MultiTaskActorCriticPolicy, self).__init__(sess, tasks, ob_spaces, ac_space_dict,
                                                     n_envs_per_task, n_steps, reuse=reuse)
    self.pdtype_dict = {}
    self.is_discrete_dict = {}
    for task in self.tasks:
        self.pdtype_dict[task] = make_proba_dist_type(self.ac_space_dict[task])
        self.is_discrete_dict[task] = isinstance(self.ac_space_dict[task], Discrete)
    self.policy_dict = {}
    self.proba_distribution_dict = {}
    self.value_fn_dict = {}
    self.q_value_dict = {}
    self.deterministic_action = None
    self.n_lstm = None
def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False,
             scale=False, layers=None, cnn_extractor=nature_cnn, feature_extraction="cnn",
             reg_weight=0.0, layer_norm=False, act_fun=tf.nn.relu, **kwargs):
    super(ActorCriticPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                            reuse=reuse, scale=scale)
    self._kwargs_check(feature_extraction, kwargs)
    self.layer_norm = layer_norm
    self.feature_extraction = feature_extraction
    self.cnn_kwargs = kwargs
    self.cnn_extractor = cnn_extractor
    if layers is None:
        layers = [256, 256]
    self.layers = layers
    self.activ_fn = act_fun
    self.qf1 = None
    self.qf2 = None
    self.deterministic_policy = None
    self.act_mu = None
    self.std = None
    self.pdtype = make_proba_dist_type(ac_space)
    self.is_discrete = isinstance(ac_space, Discrete)
    self.policy = None
    self.proba_distribution = None
    self.value_fn = None
    self.deterministic_action = None
    self.initial_state = None
    self.policy_proba = None
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False,
             scale=False, obs_phs=None):
    # DQN policies need an override for the obs placeholder, due to the architecture of the code
    super(DQNPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                    n_lstm=n_lstm, reuse=reuse, scale=scale, obs_phs=obs_phs)
    assert not isinstance(ac_space, Box), "Error: the action space cannot be of type gym.spaces.Box"
    self.pdtype = make_proba_dist_type(ac_space)
    self.value_fn = None
    self.proba_distribution = None
    self.policy = None
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False,
             scale=False):
    super(ActorCriticPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                            n_lstm=n_lstm, reuse=reuse, scale=scale)
    self.pdtype = make_proba_dist_type(ac_space)
    self.is_discrete = isinstance(ac_space, Discrete)
    self.policy = None
    self.proba_distribution = None
    self.value_fn = None
    self.deterministic_action = None
    self.initial_state = None
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False,
             scale=False):
    self.n_env = n_env
    self.n_steps = n_steps
    self.obs_ph, self.processed_x = observation_input(ob_space, n_batch, scale=scale)
    self.masks_ph = tf.placeholder(tf.float32, [n_batch])  # mask (done t-1)
    self.states_ph = tf.placeholder(tf.float32, [self.n_env, n_lstm * 2])  # states
    self.pdtype = make_proba_dist_type(ac_space)
    self.sess = sess
    self.reuse = reuse
    self.is_discrete = isinstance(ac_space, Discrete)
    self.policy = None
    self.proba_distribution = None
    self.value_fn = None
    self.ob_space = ob_space
def __init__(self, wrappedEnv, num_seq, neuro_structure):
    super().__init__(wrappedEnv, num_seq)
    assert isinstance(neuro_structure, tuple)
    self.sess = tf.Session()
    self.neuro_structure = self.parse_neuro_structure(neuro_structure)
    self.partition_table = self.build_action_partion_table()
    n_action_dims = self.partition_table[-1]  # last entry gives the flattened action size
    self.action_space = gym.spaces.Box(low=-3, high=3, shape=(n_action_dims,))
    self.last_state = None
    self.step_cnt = 0
    self.replay_buffer = replaybuffer(maxlen=512)
    self._pdtype = make_proba_dist_type(self.action_space)
    self._proba_distribution = None
    self.action_ph = None
    self._policy_proba = None
    self.pg_loss = None
    self.params = None
    self.obs = tf.placeholder(tf.float32, shape=(None, 2))
    self.policy = self.init_network_continuous(self.obs, 'net')
    self.sess.run(tf.global_variables_initializer())
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, scale=False,
             action_filter=None):  # NKAM
    super(ActorCriticPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                            reuse=reuse, scale=scale)
    self._pdtype = make_proba_dist_type(ac_space, action_filter)  # NKAM
    self._policy = None
    self._proba_distribution = None
    self._value_fn = None
    self._action = None
    self._deterministic_action = None
def __init__(self, ob_space, ac_space, hidsize, ob_mean, ob_std, feat_dim, layernormalize, nl,
             n_env, n_steps, reuse, n_lstm=256, scope="policy"):
    if layernormalize:
        print("Warning: policy is operating on top of layer-normed features. "
              "It might slow down the training.")
    self.layernormalize = layernormalize
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std
    self.n_env = n_env
    self.n_steps = n_steps
    self.n_batch = n_env * n_steps
    self.n_lstm = n_lstm
    self.reuse = reuse
    with tf.variable_scope(scope):
        self.ob_space = ob_space
        self.ac_space = ac_space
        # self.ac_pdtype = make_pdtype(ac_space)
        self.ac_pdtype = make_proba_dist_type(ac_space)
        self.ph_ob = tf.placeholder(dtype=tf.int32,
                                    shape=(self.n_env, self.n_steps) + ob_space.shape, name='ob')
        self.ph_ac = self.ac_pdtype.sample_placeholder([self.n_env, self.n_steps], name='ac')
        self.masks_ph = tf.placeholder(tf.float32, [self.n_env, self.n_steps],
                                       name="masks_ph")  # mask (done t-1)
        self.flat_masks_ph = tf.reshape(self.masks_ph, [self.n_env * self.n_steps])
        self.states_ph = tf.placeholder(tf.float32, [self.n_env, n_lstm * 2],
                                        name="states_ph")  # states
        self.pd = self.vpred = None
        self.hidsize = hidsize
        self.feat_dim = feat_dim
        self.scope = scope
        self.pdparamsize = self.ac_pdtype.param_shape()[0]
        self.sh = tf.shape(self.ph_ob)
        x = flatten_two_dims(self.ph_ob)
        self.flat_features = self.get_features(x, reuse=self.reuse)
        self.features = unflatten_first_dim(self.flat_features, self.sh)
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, scale=False,
             box_dist='gaussian', squash=False):
    super(ActorCriticPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                            reuse=reuse, scale=scale)
    self.box_dist = box_dist
    self.squash = squash
    self._pdtype = make_proba_dist_type(ac_space, box_dist, squash)
    self._policy = None
    self._proba_distribution = None
    self._value_fn = None
    self._action = None
    self._deterministic_action = None
def make_proba_dist_type(ac_space):
    """
    Return an instance of ProbabilityDistributionType for the correct type of action space.

    :param ac_space: (Gym Space) the input action space
    :return: (ProbabilityDistributionType) the appropriate instance of a ProbabilityDistributionType
    """
    if isinstance(ac_space, spaces.Box):
        assert len(ac_space.shape) == 1, "Error: the action space must be a vector"
        return DiagGaussianFixedVarProbabilityDistributionType(ac_space.shape[0])
    else:
        # Fall back to the library implementation for non-Box spaces. Calling this
        # wrapper itself here would recurse forever, so the original function must be
        # imported under an alias, e.g.:
        #   from stable_baselines.common.distributions import \
        #       make_proba_dist_type as sb_make_proba_dist_type
        return sb_make_proba_dist_type(ac_space)
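# --- Hedged usage sketch (not from the original source): exercising the wrapper
# above on a Box space. Assumes the custom DiagGaussianFixedVar... type mirrors
# the standard stable-baselines ProbabilityDistributionType interface; the
# latent size (64) and tensor names are illustrative.
import gym
import tensorflow as tf

ac_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(3,))
pdtype = make_proba_dist_type(ac_space)  # fixed-variance diagonal Gaussian head

latent = tf.placeholder(tf.float32, [None, 64])  # features from any extractor
pd, policy_mean, _ = pdtype.proba_distribution_from_latent(latent, latent)
action_ph = pdtype.sample_placeholder([None], name="action_ph")
neglogp = pd.neglogp(action_ph)  # building block for policy-gradient losses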
def get_obs_and_pdtype(self, ob_space, ac_space):
    """
    Initialize the probability distribution type and get the observation placeholder.

    :param ob_space: (Gym Space) the observation space
    :param ac_space: (Gym Space) the action space
    :return: (TensorFlow Tensor, ProbabilityDistributionType) the observation placeholder and
        the probability distribution type matching the action space
    """
    self.pdtype = pdtype = make_proba_dist_type(ac_space)
    if self.obs_ph is None:
        self.obs_ph, self.processed_x = observation_input(ob_space)
    else:
        assert self.processed_x is not None
    return self.obs_ph, pdtype
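# --- Hedged usage sketch (not from the original source): calling the method as
# a plain function on a stand-in holder object. The SimpleNamespace, spaces, and
# layer size below are illustrative.
import tensorflow as tf
from gym import spaces
from types import SimpleNamespace

holder = SimpleNamespace(obs_ph=None, processed_x=None)  # obs_ph starts unset
obs_ph, pdtype = get_obs_and_pdtype(holder,
                                    spaces.Box(low=-1.0, high=1.0, shape=(4,)),
                                    spaces.Discrete(2))
latent = tf.layers.dense(tf.layers.flatten(holder.processed_x), 64,
                         activation=tf.nn.relu)
pd, logits, _ = pdtype.proba_distribution_from_latent(latent, latent)
sampled_action = pd.sample()  # stochastic action node fed through obs_ph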
def __init__(self, states, actions, advantages, rewards, Entropy_coefficient, max_grad_norm,
             vf_coef=0.5, lr=0.5 * 1e-3):
    self.states = states
    self.actions = actions
    self.advantages = advantages
    self.rewards = rewards
    self.Entropy_coefficient = Entropy_coefficient
    self.vf_coef = vf_coef
    self.lr = lr
    self.pdtype = make_proba_dist_type(spaces.Discrete(4))
    # self.pdtype = make_pdtype(spaces.Discrete(4))
    self.build_model(max_grad_norm)
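# --- build_model is not shown above; this is a hedged sketch of the A2C-style
# loss it presumably assembles from self.pdtype. Every name below is
# illustrative, not the original implementation.
import tensorflow as tf

def build_loss_sketch(self, latent):
    pd, logits, _ = self.pdtype.proba_distribution_from_latent(latent, latent)
    action_ph = self.pdtype.sample_placeholder([None], name="action_ph")
    pg_loss = tf.reduce_mean(self.advantages * pd.neglogp(action_ph))  # policy gradient
    entropy = tf.reduce_mean(pd.entropy())                             # exploration bonus
    value = tf.layers.dense(latent, 1)[:, 0]
    vf_loss = tf.reduce_mean(tf.square(value - self.rewards))          # value regression
    return pg_loss - self.Entropy_coefficient * entropy + self.vf_coef * vf_loss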
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None,
             net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn, feature_extraction="mlp",
             **kwargs):
    super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                            reuse=reuse, scale=(feature_extraction == "cnn"))
    self._pdtype = make_proba_dist_type(ac_space)
    self._kwargs_check(feature_extraction, kwargs)
    if layers is not None:
        warnings.warn("Usage of the `layers` parameter is deprecated! Use net_arch instead "
                      "(it has a different semantics though).", DeprecationWarning)
        if net_arch is not None:
            warnings.warn("The new `net_arch` parameter overrides the deprecated `layers` "
                          "parameter!", DeprecationWarning)
    if net_arch is None:
        if layers is None:
            layers = [64, 64]
        net_arch = [dict(vf=layers, pi=layers)]
    with tf.variable_scope("model", reuse=reuse):
        if feature_extraction == "cnn":
            pi_latent = vf_latent = cnn_extractor(self.processed_obs, **kwargs)
        else:
            pi_latent, vf_latent = mlp_extractor(tf.layers.flatten(self.processed_obs),
                                                 net_arch, act_fun)
        self._value_fn = linear(vf_latent, 'vf', 1)
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent,
                                                       pi_init_scale=1.0, pi_init_bias=0.0,
                                                       pi_init_std=0.125,
                                                       vf_init_scale=1.0, vf_init_bias=0.0)
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None,
             net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn, feature_extraction="cnn",
             **kwargs):
    super(POMEPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                     reuse=reuse, scale=(feature_extraction == "cnn"))
    self._policy = None
    self.n_actions = ac_space.n
    self._kwargs_check(feature_extraction, kwargs)
    if layers is not None:
        warnings.warn("Usage of the `layers` parameter is deprecated! Use net_arch instead "
                      "(it has a different semantics though).", DeprecationWarning)
        if net_arch is not None:
            warnings.warn("The new `net_arch` parameter overrides the deprecated `layers` "
                          "parameter!", DeprecationWarning)

    def a3c_cnn(scaled_images, **kwargs):
        """
        CNN from the Nature paper.

        :param scaled_images: (TensorFlow Tensor) Image input placeholder
        :param kwargs: (dict) Extra keyword parameters for the convolutional layers of the CNN
        :return: (TensorFlow Tensor) The CNN output layer
        """
        activ = tf.nn.relu
        layer_1 = activ(conv(scaled_images, 'c1', n_filters=16, filter_size=8, stride=4,
                             init_scale=np.sqrt(2), **kwargs))
        layer_2 = activ(conv(layer_1, 'c2', n_filters=32, filter_size=4, stride=2,
                             init_scale=np.sqrt(2), **kwargs))
        layer_3 = conv_to_fc(layer_2)
        return activ(linear(layer_3, 'fc1', n_hidden=256, init_scale=np.sqrt(2)))

    def dynamics(scaled_images, action, **kwargs):
        """
        Dynamics function: predicts next-state features from an observation and an action.

        :param scaled_images: (TensorFlow Tensor) Image input placeholder
        :param action: (TensorFlow Tensor) Action input, concatenated with the CNN features
        :param kwargs: (dict) Extra keyword parameters for the convolutional layers of the CNN
        :return: (TensorFlow Tensor) The CNN output layer
        """
        activ = tf.nn.relu
        layer_1 = activ(conv(scaled_images, 'c3', n_filters=16, filter_size=8, stride=4,
                             init_scale=np.sqrt(2), **kwargs))
        layer_2 = activ(conv(layer_1, 'c4', n_filters=32, filter_size=4, stride=2,
                             init_scale=np.sqrt(2), **kwargs))
        layer_3 = conv_to_fc(layer_2)
        layer_4 = tf.concat(values=[action, layer_3], axis=-1)
        return tf.nn.sigmoid(linear(layer_4, 'fc2', n_hidden=256, init_scale=np.sqrt(2)))

    # Assign the distribution type before it is used: the `pdtype` property reads
    # `self._pdtype`, so setting it only after `proba_distribution_from_latent` (as the
    # original code did, at the very end) would dereference None.
    self._pdtype = make_proba_dist_type(ac_space)

    with tf.variable_scope("model", reuse=reuse):
        pi_latent = vf_latent = a3c_cnn(self.processed_obs, **kwargs)
        self._value_fn = linear(vf_latent, 'vf', 1)
        self._reward_fn = linear(vf_latent, 'rf', self.n_actions)
        self._next_state_fn = linear(vf_latent, 'tf', 1)
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)
    self._setup_init()
def __init__(self, cfg, env, arch_type, graph, sess):
    # `is` compares identity, so the original `arch_type is 'train' or 'act'` was always
    # truthy; use membership/equality instead.
    assert arch_type in ('train', 'act'), 'type should be either "train" or "act"'
    cfg_env = cfg['environment']
    cfg_arch = cfg['architecture']
    if arch_type == 'train':
        self.num_steps = math.floor(cfg_env['max_time'] / cfg_env['control_dt'])
    else:
        self.num_steps = 1
    self.observation_space = env.observation_space
    self.action_space = env.action_space
    self.pdtype = make_proba_dist_type(self.action_space)
    self.n_env = cfg["environment"]["num_envs"]
    self.graph = graph
    with self.graph.as_default():
        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            batch_size = self.num_steps * self.n_env
            if arch_type == 'train':
                # Integer division keeps the placeholder shape an int (assumes the
                # minibatch count evenly divides the batch).
                batch_size //= cfg["algorithm"]["minibatch"]
            self.obs_ph, self.processed_obs = observation_input(self.observation_space,
                                                                batch_size, scale=False)
            act_fun = tf.nn.relu
            pi_latent = self.obs_ph
            vi_latent = self.obs_ph
            for idx, dec_layer_size in enumerate(cfg_arch["pi_net"]):
                pi_latent = act_fun(linear(pi_latent, "pi_net_fc{}".format(idx),
                                           dec_layer_size, init_scale=np.sqrt(2)))
            for idx, dec_layer_size in enumerate(cfg_arch["vi_net"]):
                vi_latent = act_fun(linear(vi_latent, "vi_net_fc{}".format(idx),
                                           dec_layer_size, init_scale=np.sqrt(2)))
            self.value_fn = linear(vi_latent, 'vf', 1)
            self.value = self.value_fn[:, 0]
            self.proba_distribution, self.policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vi_latent, init_scale=0.01)
            self.action_ph = self.pdtype.sample_placeholder([None], name="action_ph")
            self.masks_ph = tf.placeholder(tf.float32, [None], "masks_ph")
            self.action = self.proba_distribution.sample()
            self.neglogp = self.proba_distribution.neglogp(self.action)
            self.initial_state = None
            self.sess = sess
            # continuous action, diagonal covariance
            self.policy_proba = [self.proba_distribution.mean, self.proba_distribution.std]
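# --- Hedged rollout sketch (not from the original source): how the graph built
# above would typically be stepped; `policy` and `obs_batch` are illustrative
# stand-ins for an instance of the class and an observation batch matching the
# placeholder's shape.
action, value, neglogp = policy.sess.run(
    [policy.action, policy.value, policy.neglogp],
    feed_dict={policy.obs_ph: obs_batch})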