def __init__(self,
              sess,
              ob_space,
              ac_space,
              n_env,
              n_steps,
              n_batch,
              reuse=False,
              scale=False,
              observation_input_fc=observation_input):
     super(ActorCriticPolicy,
           self).__init__(sess,
                          ob_space,
                          ac_space,
                          n_env,
                          n_steps,
                          n_batch,
                          reuse=reuse,
                          scale=scale,
                          observation_input_fc=observation_input_fc)
     self._pdtype = make_proba_dist_type(ac_space)
     self._policy = None
     self._proba_distribution = None
     self._value_fn = None
     self._action = None
     self._deterministic_action = None
Example #2
 def __init__(self,
              sess: tf.Session,
              tasks: list,
              ob_spaces: dict,
              ac_space_dict: dict,
              n_envs_per_task: int,
              n_steps: int,
              reuse=False):
     super(MultiTaskActorCriticPolicy, self).__init__(sess,
                                                      tasks,
                                                      ob_spaces,
                                                      ac_space_dict,
                                                      n_envs_per_task,
                                                      n_steps,
                                                      reuse=reuse)
     self.pdtype_dict = {}
     self.is_discrete_dict = {}
     for task in self.tasks:
         self.pdtype_dict[task] = make_proba_dist_type(
             self.ac_space_dict[task])
         self.is_discrete_dict[task] = isinstance(self.ac_space_dict[task],
                                                  Discrete)
     self.policy_dict = {}
     self.proba_distribution_dict = {}
     self.value_fn_dict = {}
     self.q_value_dict = {}
     self.deterministic_action = None
     self.n_lstm = None
Example #3
    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, scale=False, layers=None,
                cnn_extractor=nature_cnn, feature_extraction="cnn", reg_weight=0.0,
                layer_norm=False, act_fun=tf.nn.relu, **kwargs):
        super(ActorCriticPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse,
                                                scale=scale)
        
        self._kwargs_check(feature_extraction, kwargs)
        self.layer_norm = layer_norm
        self.feature_extraction = feature_extraction
        self.cnn_kwargs = kwargs
        self.cnn_extractor = cnn_extractor
        if layers is None:
            layers = [256, 256]
        self.layers = layers
        self.activ_fn = act_fun

        self.qf1 = None
        self.qf2 = None
        self.deterministic_policy = None
        self.act_mu = None
        self.std = None

        self.pdtype = make_proba_dist_type(ac_space)
        self.is_discrete = isinstance(ac_space, Discrete)
        self.policy = None
        self.proba_distribution = None
        self.value_fn = None
        self.deterministic_action = None
        self.initial_state = None

        self.policy_proba = None
Example #4
 def __init__(self,
              sess,
              ob_space,
              ac_space,
              n_env,
              n_steps,
              n_batch,
              n_lstm=256,
              reuse=False,
              scale=False,
              obs_phs=None):
     # DQN policies need an override for the obs placeholder, due to the architecture of the code
     super(DQNPolicy, self).__init__(sess,
                                     ob_space,
                                     ac_space,
                                     n_env,
                                     n_steps,
                                     n_batch,
                                     n_lstm=n_lstm,
                                     reuse=reuse,
                                     scale=scale,
                                     obs_phs=obs_phs)
     assert not isinstance(
         ac_space,
         Box), "Error: the action space cannot be of type gym.spaces.Box"
     self.pdtype = make_proba_dist_type(ac_space)
     self.value_fn = None
     self.proba_distribution = None
     self.policy = None
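
The assert above only rules out gym.spaces.Box; a minimal sketch (assuming only that gym is installed) of how that check behaves for typical action spaces:

from gym import spaces

discrete_space = spaces.Discrete(6)                      # not a Box, so the check above passes
box_space = spaces.Box(low=-1.0, high=1.0, shape=(2,))   # a Box, so the constructor would raise AssertionError
assert not isinstance(discrete_space, spaces.Box)
assert isinstance(box_space, spaces.Box)
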
Example #5
 def __init__(self,
              sess,
              ob_space,
              ac_space,
              n_env,
              n_steps,
              n_batch,
              n_lstm=256,
              reuse=False,
              scale=False):
     super(ActorCriticPolicy, self).__init__(sess,
                                             ob_space,
                                             ac_space,
                                             n_env,
                                             n_steps,
                                             n_batch,
                                             n_lstm=n_lstm,
                                             reuse=reuse,
                                             scale=scale)
     self.pdtype = make_proba_dist_type(ac_space)
     self.is_discrete = isinstance(ac_space, Discrete)
     self.policy = None
     self.proba_distribution = None
     self.value_fn = None
     self.deterministic_action = None
     self.initial_state = None
Example #6
 def __init__(self,
              sess,
              ob_space,
              ac_space,
              n_env,
              n_steps,
              n_batch,
              n_lstm=256,
              reuse=False,
              scale=False):
     self.n_env = n_env
     self.n_steps = n_steps
     self.obs_ph, self.processed_x = observation_input(ob_space,
                                                       n_batch,
                                                       scale=scale)
     self.masks_ph = tf.placeholder(tf.float32,
                                    [n_batch])  # mask (done t-1)
     self.states_ph = tf.placeholder(tf.float32,
                                     [self.n_env, n_lstm * 2])  # states
     self.pdtype = make_proba_dist_type(ac_space)
     self.sess = sess
     self.reuse = reuse
     self.is_discrete = isinstance(ac_space, Discrete)
     self.policy = None
     self.proba_distribution = None
     self.value_fn = None
     self.ob_space = ob_space
Example #7
    def __init__(
        self,
        wrappedEnv,
        num_seq,
        neuro_structure,
    ):
        super().__init__(wrappedEnv, num_seq)
        assert type(neuro_structure) is tuple
        self.sess = tf.Session()
        self.neuro_structure = self.parse_neuro_structure(neuro_structure)
        self.partition_table = self.build_action_partion_table()
        a = self.partition_table[-1]
        self.action_space = gym.spaces.Box(low=-3, high=3, shape=(a, ))
        self.last_state = None
        self.step_cnt = 0
        self.replay_buffer = replaybuffer(maxlen=512)

        self._pdtype = make_proba_dist_type(self.action_space)
        self._proba_distribution = None
        self.action_ph = None
        self._policy_proba = None
        self.pg_loss = None
        self.params = None
        self.obs = tf.placeholder(tf.float32, shape=(None, 2))

        self.policy = self.init_network_continuous(self.obs, 'net')
        self.sess.run(tf.global_variables_initializer())
Example #8
 def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, scale=False, action_filter=None): #NKAM
     super(ActorCriticPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse,
                                             scale=scale) 
     self._pdtype = make_proba_dist_type(ac_space, action_filter) #NKAM
     self._policy = None
     self._proba_distribution = None
     self._value_fn = None
     self._action = None
     self._deterministic_action = None
Example #9
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 n_env,
                 n_steps,
                 reuse,
                 n_lstm=256,
                 scope="policy"):
        if layernormalize:
            print(
                "Warning: policy is operating on top of layer-normed features. It might slow down the training."
            )
        self.layernormalize = layernormalize
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        self.n_env = n_env
        self.n_steps = n_steps
        self.n_batch = n_env * n_steps
        self.n_lstm = n_lstm
        self.reuse = reuse
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            # self.ac_pdtype = make_pdtype(ac_space)
            self.ac_pdtype = make_proba_dist_type(ac_space)
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(self.n_env, self.n_steps) +
                                        ob_space.shape,
                                        name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder(
                [self.n_env, self.n_steps], name='ac')
            self.masks_ph = tf.placeholder(tf.float32,
                                           [self.n_env, self.n_steps],
                                           name="masks_ph")  # mask (done t-1)
            self.flat_masks_ph = tf.reshape(self.masks_ph,
                                            [self.n_env * self.n_steps])
            self.states_ph = tf.placeholder(tf.float32,
                                            [self.n_env, n_lstm * 2],
                                            name="states_ph")  # states
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            self.pdparamsize = self.ac_pdtype.param_shape()[0]

            self.sh = tf.shape(self.ph_ob)
            x = flatten_two_dims(self.ph_ob)
            self.flat_features = self.get_features(x, reuse=self.reuse)
            self.features = unflatten_first_dim(self.flat_features, self.sh)
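
For readability, a small NumPy sketch of the shape round-trip performed by the last four lines above, assuming flatten_two_dims merges the (n_env, n_steps) leading dimensions and unflatten_first_dim restores them (those helpers are not shown on this page):

import numpy as np

n_env, n_steps, feat_dim = 4, 8, 512
ph_ob = np.zeros((n_env, n_steps, 84, 84, 4), dtype=np.int32)    # stands in for self.ph_ob
flat_x = ph_ob.reshape((n_env * n_steps,) + ph_ob.shape[2:])     # what flatten_two_dims presumably does
flat_features = np.zeros((n_env * n_steps, feat_dim))            # stands in for self.flat_features
features = flat_features.reshape((n_env, n_steps, feat_dim))     # what unflatten_first_dim presumably does
assert features.shape == (n_env, n_steps, feat_dim)
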
Example #10
 def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, scale=False, box_dist='gaussian', squash=False):
     super(ActorCriticPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse,
                                             scale=scale)
     self.box_dist = box_dist
     self.squash = squash
     self._pdtype = make_proba_dist_type(ac_space, box_dist, squash)
     self._policy = None
     self._proba_distribution = None
     self._value_fn = None
     self._action = None
     self._deterministic_action = None
Example #11
def make_proba_dist_type(ac_space):
    """
    return an instance of ProbabilityDistributionType for the correct type of action space

    :param ac_space: (Gym Space) the input action space
    :return: (ProbabilityDistributionType) the appropriate instance of a ProbabilityDistributionType
    """
    if isinstance(ac_space, spaces.Box):
        assert len(ac_space.shape) == 1, "Error: the action space must be a vector"
        return DiagGaussianFixedVarProbabilityDistributionType(ac_space.shape[0])
    else:
        # Fallback for non-Box action spaces. As written, this call targets the function
        # itself and would recurse indefinitely; the stock stable_baselines
        # make_proba_dist_type is presumably meant to be imported under an alias and
        # called here instead.
        return make_proba_dist_type(ac_space)
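
A brief usage sketch for this variant, assuming gym is installed and that the surrounding imports match the snippet; sample_placeholder and param_shape are used the same way elsewhere on this page:

from gym import spaces

pdtype = make_proba_dist_type(spaces.Box(low=-1.0, high=1.0, shape=(6,)))
action_ph = pdtype.sample_placeholder([None], name="action_ph")   # action placeholder, shape (None, 6) here
n_params = pdtype.param_shape()[0]                                 # flat size of the distribution parameters
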
Example #12
    def get_obs_and_pdtype(self, ob_space, ac_space):
        """
        Initialize probability distribution and get observation placeholder.

        :param ob_space: (Gym Spaces) the observation space
        :param ac_space: (Gym Spaces) the action space
        """
        self.pdtype = pdtype = make_proba_dist_type(ac_space)

        if self.obs_ph is None:
            self.obs_ph, self.processed_x = observation_input(ob_space)
        else:
            assert self.processed_x is not None

        return self.obs_ph, pdtype
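
An equivalent standalone sketch of the two steps this helper wraps, assuming the usual stable_baselines 2.x module layout for the imports (the exact import paths are not shown in this snippet):

from gym import spaces
from stable_baselines.common.input import observation_input
from stable_baselines.common.distributions import make_proba_dist_type

ob_space = spaces.Box(low=-1.0, high=1.0, shape=(8,))
ac_space = spaces.Discrete(4)
obs_ph, processed_x = observation_input(ob_space)   # observation placeholder and processed tensor
pdtype = make_proba_dist_type(ac_space)             # distribution type matching the action space
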
Example #13
 def __init__(self,
              states,
              actions,
              advantages,
              rewards,
              Entropy_coefficient,
              max_grad_norm,
              vf_coef=0.5,
              lr=0.5 * 1e-3):
     self.states = states
     self.actions = actions
     self.advantages = advantages
     self.rewards = rewards
     self.Entropy_coefficient = Entropy_coefficient
     self.vf_coef = vf_coef
     self.lr = lr
     self.pdtype = make_proba_dist_type(spaces.Discrete(4))
     # self.pdtype = make_pdtype(spaces.Discrete(4))
     self.build_model(max_grad_norm)
Example #14
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None, net_arch=None,
                 act_fun=tf.tanh, cnn_extractor=nature_cnn, feature_extraction="mlp", **kwargs):
        super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse,
                                                scale=(feature_extraction == "cnn"))

        self._pdtype = make_proba_dist_type(ac_space)

        self._kwargs_check(feature_extraction, kwargs)

        if layers is not None:
            warnings.warn("Usage of the `layers` parameter is deprecated! Use net_arch instead "
                          "(it has a different semantics though).", DeprecationWarning)
            if net_arch is not None:
                warnings.warn("The new `net_arch` parameter overrides the deprecated `layers` parameter!",
                              DeprecationWarning)

        if net_arch is None:
            if layers is None:
                layers = [64, 64]
            net_arch = [dict(vf=layers, pi=layers)]

        with tf.variable_scope("model", reuse=reuse):
            if feature_extraction == "cnn":
                pi_latent = vf_latent = cnn_extractor(self.processed_obs, **kwargs)
            else:
                pi_latent, vf_latent = mlp_extractor(tf.layers.flatten(self.processed_obs), net_arch, act_fun)

            self._value_fn = linear(vf_latent, 'vf', 1)

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent,
                                                           pi_init_scale=1.0, pi_init_bias=0.0, pi_init_std=0.125,
                                                           vf_init_scale=1.0, vf_init_bias=0.0)

        self._setup_init()
        return
Example #15
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 layers=None,
                 net_arch=None,
                 act_fun=tf.tanh,
                 cnn_extractor=nature_cnn,
                 feature_extraction="cnn",
                 **kwargs):
        super(POMEPolicy, self).__init__(sess,
                                         ob_space,
                                         ac_space,
                                         n_env,
                                         n_steps,
                                         n_batch,
                                         reuse=reuse,
                                         scale=(feature_extraction == "cnn"))

        self._policy = None
        self.n_actions = ac_space.n
        self._kwargs_check(feature_extraction, kwargs)

        if layers is not None:
            warnings.warn(
                "Usage of the `layers` parameter is deprecated! Use net_arch instead "
                "(it has a different semantics though).", DeprecationWarning)
            if net_arch is not None:
                warnings.warn(
                    "The new `net_arch` parameter overrides the deprecated `layers` parameter!",
                    DeprecationWarning)

        def a3c_cnn(scaled_images, **kwargs):
            """
            CNN from Nature paper.

            :param scaled_images: (TensorFlow Tensor) Image input placeholder
            :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
            :return: (TensorFlow Tensor) The CNN output layer
            """
            activ = tf.nn.relu
            layer_1 = activ(
                conv(scaled_images,
                     'c1',
                     n_filters=16,
                     filter_size=8,
                     stride=4,
                     init_scale=np.sqrt(2),
                     **kwargs))
            layer_2 = activ(
                conv(layer_1,
                     'c2',
                     n_filters=32,
                     filter_size=4,
                     stride=2,
                     init_scale=np.sqrt(2),
                     **kwargs))
            layer_3 = conv_to_fc(layer_2)
            return activ(
                linear(layer_3, 'fc1', n_hidden=256, init_scale=np.sqrt(2)))

        def dynamics(scaled_images, action, **kwargs):
            """
            Dynamic function
            :param scaled_images: (TensorFlow Tensor) Image input placeholder
            :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
            :return: (TensorFlow Tensor) The CNN output layer
            """
            activ = tf.nn.relu
            layer_1 = activ(
                conv(scaled_images,
                     'c3',
                     n_filters=16,
                     filter_size=8,
                     stride=4,
                     init_scale=np.sqrt(2),
                     **kwargs))
            layer_2 = activ(
                conv(layer_1,
                     'c4',
                     n_filters=32,
                     filter_size=4,
                     stride=2,
                     init_scale=np.sqrt(2),
                     **kwargs))
            layer_3 = conv_to_fc(layer_2)
            layer_4 = tf.concat(values=[action, layer_3], axis=-1)
            return tf.nn.sigmoid(
                linear(layer_4, 'fc2', n_hidden=256, init_scale=np.sqrt(2)))

        with tf.variable_scope("model", reuse=reuse):
            pi_latent = vf_latent = a3c_cnn(self.processed_obs, **kwargs)

            self._value_fn = linear(vf_latent, 'vf', 1)

            self._reward_fn = linear(vf_latent, 'rf', self.n_actions)
            self._next_state_fn = linear(vf_latent, 'tf', 1)

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

        self._setup_init()

        self._pdtype = make_proba_dist_type(ac_space)
Example #16
    def __init__(self, cfg, env, arch_type, graph, sess):

        assert arch_type in ('train', 'act'), 'arch_type should be either "train" or "act"'

        cfg_env = cfg['environment']
        cfg_arch = cfg['architecture']

        if arch_type == 'train':
            self.num_steps = math.floor(cfg_env['max_time'] /
                                        cfg_env['control_dt'])
        else:
            self.num_steps = 1

        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.pdtype = make_proba_dist_type(self.action_space)
        self.n_env = cfg["environment"]["num_envs"]
        self.graph = graph

        with self.graph.as_default():
            with tf.variable_scope("model", reuse=tf.AUTO_REUSE):

                batch_size = self.num_steps * self.n_env

                if arch_type == 'train':
                    batch_size //= cfg["algorithm"]["minibatch"]  # integer division keeps the placeholder shape an int

                self.obs_ph, self.processed_obs = observation_input(
                    self.observation_space, batch_size, scale=False)

                act_fun = tf.nn.relu

                pi_latent = self.obs_ph
                vi_latent = self.obs_ph

                for idx, dec_layer_size in enumerate(cfg_arch["pi_net"]):
                    pi_latent = act_fun(
                        linear(pi_latent,
                               "pi_net_fc{}".format(idx),
                               dec_layer_size,
                               init_scale=np.sqrt(2)))

                for idx, dec_layer_size in enumerate(cfg_arch["vi_net"]):
                    vi_latent = act_fun(
                        linear(vi_latent,
                               "vi_net_fc{}".format(idx),
                               dec_layer_size,
                               init_scale=np.sqrt(2)))

                self.value_fn = linear(vi_latent, 'vf', 1)
                self.value = self.value_fn[:, 0]
                self.proba_distribution, self.policy, self.q_value = \
                    self.pdtype.proba_distribution_from_latent(pi_latent, vi_latent, init_scale=0.01)
                self.action_ph = self.pdtype.sample_placeholder(
                    [None], name="action_ph")
                self.masks_ph = tf.placeholder(tf.float32, [None], "masks_ph")
                self.action = self.proba_distribution.sample()
                self.neglogp = self.proba_distribution.neglogp(self.action)

        self.initial_state = None
        self.sess = sess

        # continuous action diagonal covariance
        self.policy_proba = [
            self.proba_distribution.mean, self.proba_distribution.std
        ]
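
For reference, a hypothetical cfg layout containing only the keys this constructor reads; the values below are illustrative placeholders, not taken from the original configuration:

cfg = {
    "environment": {"max_time": 4.0, "control_dt": 0.01, "num_envs": 16},
    "architecture": {"pi_net": [128, 128], "vi_net": [128, 128]},
    "algorithm": {"minibatch": 4},
}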