Example #1
    def __init__(self, name: str, shape: List[int], eps=1e-8, verbose=False):  # batch_size x ...
        super().__init__()

        self.name = name
        self.shape = shape
        self.eps = eps
        self._verbose = verbose

        with self.scope:
            # Running statistics (per-element mean/std and a sample count), stored as non-trainable parameters.
            self.op_mean = nn.Parameter(tf.zeros(shape, dtype=tf.float32), name='mean', trainable=False)
            self.op_std = nn.Parameter(tf.ones(shape, dtype=tf.float32), name='std', trainable=False)
            self.op_n = nn.Parameter(tf.zeros([], dtype=tf.int64), name='n', trainable=False)
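
Only the parameter setup is shown above; the update rule is not part of this snippet. A minimal NumPy sketch of the kind of running-moment update such a normalizer typically performs (the batched, Welford-style formulas below are an assumption, not code from this library):

import numpy as np

def update_running_stats(mean, std, n, batch, eps=1e-8):
    """Fold a new batch into running mean/std/count (parallel-variance update)."""
    batch = np.asarray(batch, dtype=np.float64)
    m = batch.shape[0]
    batch_mean = batch.mean(axis=0)
    batch_var = batch.var(axis=0)

    delta = batch_mean - mean
    total = n + m
    new_mean = mean + delta * m / total
    # Combine the old and new variances (Chan et al. pairwise formula).
    new_var = (n * std ** 2 + m * batch_var + delta ** 2 * n * m / total) / total
    new_std = np.sqrt(np.maximum(new_var, eps))
    return new_mean, new_std, total
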
Example #2
    def __init__(self, nin, nf, rf, stride, padding='VALID', init_scale=1.0):
        super().__init__()
        self.strides = [1, stride, stride, 1]
        self.padding = padding

        w_shape = [rf, rf, nin, nf]
        b_shape = [1, 1, 1, nf]
        self.w = nn.Parameter(ortho_initializer(init_scale)(w_shape,
                                                            np.float32),
                              dtype=tf.float32,
                              name="w")
        self.b = nn.Parameter(tf.constant_initializer(0.0)(b_shape),
                              dtype=tf.float32,
                              name="b")
Example #3
    def __init__(self, dim_state: int, dim_action: int, hidden_sizes: List[int], normalizer: GaussianNormalizer,
                 init_std=1.):
        super().__init__()
        self.dim_state = dim_state
        self.dim_action = dim_action
        self.hidden_sizes = hidden_sizes
        self.init_std = init_std
        self.normalizer = normalizer
        with self.scope:
            self.op_states = tf.placeholder(tf.float32, shape=[None, dim_state], name='states')
            # Externally supplied actions, used below for the negative log-likelihood op_nlls_.
            self.op_actions_ = tf.placeholder(tf.float32, shape=[None, dim_action], name='actions')

            layers = []
            # Tanh MLP mapping dim_state through hidden_sizes to dim_action.
            all_sizes = [dim_state, *self.hidden_sizes]
            for i, (in_features, out_features) in enumerate(zip(all_sizes[:-1], all_sizes[1:])):
                layers.append(nn.Linear(in_features, out_features, weight_initializer=normc_initializer(1)))
                layers.append(nn.Tanh())
            layers.append(nn.Linear(all_sizes[-1], dim_action, weight_initializer=normc_initializer(0.01)))
            self.net = nn.Sequential(*layers)

            self.op_log_std = nn.Parameter(
                tf.constant(np.log(self.init_std), shape=[self.dim_action], dtype=tf.float32), name='log_std')

        self.distribution = self(self.op_states)
        self.op_actions = self.distribution.sample()
        self.op_actions_mean = self.distribution.mean()
        self.op_actions_std = self.distribution.stddev()
        self.op_nlls_ = -self.distribution.log_prob(self.op_actions_).reduce_sum(axis=1)
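
normc_initializer is likewise not shown here. A typical normalized-columns initializer, often used for policy layers (this implementation is an assumption):

import numpy as np

def normc_initializer(std=1.0):
    """Initialize a weight matrix with each column rescaled to L2 norm `std`."""
    def _init(shape, dtype=np.float32):
        out = np.random.randn(*shape)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return out.astype(dtype)
    return _init
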
Example #4
    def __init__(self, qfns: List[lz.rl.BaseNNQFunction], policy: lz.rl.BaseNNPolicy, dim_state: int, dim_action: int,
                 *, alpha):
        super().__init__()
        self.qfns = qfns
        self.qfns_target = [qfn.copy() for qfn in qfns]
        self.policy = policy
        self.dim_action = dim_action

        with self.scope:
            self.op_states = tf.placeholder(tf.float32, [None, dim_state])
            self.op_actions = tf.placeholder(tf.float32, [None, dim_action])
            self.op_rewards = tf.placeholder(tf.float32, [None])
            self.op_next_states = tf.placeholder(tf.float32, [None, dim_state])
            self.op_dones = tf.placeholder(tf.float32, [None])

        if alpha:
            # A fixed temperature was given: use it as a constant.
            self.auto_entropy = False
            self.op_alpha = tf.constant(alpha, dtype=tf.float32)
        else:
            # No fixed temperature: learn log_alpha (automatic entropy tuning).
            self.auto_entropy = True
            self.log_alpha = nn.Parameter(0.0, name='alpha', dtype=tf.float32)
            self.op_alpha = tf.exp(self.log_alpha)

        self.op_qfn_losses, self.op_train_qfn = self.train_qfn(
            self.op_states, self.op_actions, self.op_rewards, self.op_next_states, self.op_dones)
        with tf.control_dependencies([self.op_train_qfn]):
            # The policy (and temperature) update only runs after the Q-function update.
            self.op_train_policy, self.op_train_alpha = self.train_policy(self.op_states)
        self.op_update_targets = self.update_targets()

        self._n_updates = 0
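
When alpha is falsy the temperature is learned through log_alpha, but the corresponding loss is computed inside train_policy and not shown here. One common SAC temperature objective, sketched in NumPy under that assumption (target_entropy is a user-chosen constant, e.g. -dim_action):

import numpy as np

def alpha_loss(log_alpha, log_probs, target_entropy):
    """Temperature loss that pushes the policy entropy toward target_entropy.
    In the TF graph, gradients would flow only through log_alpha; log_probs is treated as constant."""
    return float(-log_alpha * np.mean(np.asarray(log_probs) + target_entropy))
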
Example #5
    def __init__(self, x, n_total_blocks):
        super().__init__()
        # Fixup-style setup: scale the first layer by sqrt(2 / fan_in / n_blocks),
        # zero-initialize the second layer, and use scalar biases/scale in place of normalization.
        std = np.sqrt(2. / x / n_total_blocks)
        self.bias1a = nn.Parameter(tf.zeros(1), name='bias1a')
        self.fc1 = nn.Linear(x, x, bias=False,
                             weight_initializer=tf.initializers.random_normal(0, stddev=std))
        self.bias1b = nn.Parameter(tf.zeros(1), name='bias1b')
        self.relu = nn.ReLU()
        self.bias2a = nn.Parameter(tf.zeros(1), name='bias2a')
        self.fc2 = nn.Linear(x, x, bias=False,
                             weight_initializer=tf.initializers.zeros())
        self.scale = nn.Parameter(tf.ones(1), name='scale')
        self.bias2b = nn.Parameter(tf.zeros(1), name='bias2b')
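
The forward pass of this residual block is not included in the snippet. A plausible NumPy sketch of how such a Fixup-style block is usually wired (an assumption, not this class's actual forward method):

import numpy as np

def fixup_block_forward(x, w1, w2, bias1a, bias1b, bias2a, bias2b, scale):
    """Residual fully connected block with Fixup-style scalar biases and scale."""
    out = (x + bias1a) @ w1
    out = np.maximum(out + bias1b, 0.0)   # ReLU
    out = (out + bias2a) @ w2
    out = out * scale + bias2b
    return np.maximum(out + x, 0.0)       # residual connection, then ReLU
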
Example #6
    def __init__(self, dim_state: int, dim_action: int, actor: Actor, critic: Critic, init_alpha: float,
                 gamma: float, target_entropy: float, actor_lr: float, critic_lr: float, alpha_lr: float,
                 tau: float, actor_update_freq: int, target_update_freq: int, learn_alpha: bool):
        super().__init__()

        self.actor = actor
        self.critic = critic
        self.critic_target = self.critic.clone()
        self.gamma = gamma
        self.target_entropy = target_entropy
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.alpha_lr = alpha_lr
        self.tau = tau
        self.actor_update_freq = actor_update_freq
        self.target_update_freq = target_update_freq
        self.learn_alpha = learn_alpha

        with self.scope:
            self.op_states = tf.placeholder(tf.float32, [None, dim_state], 'states')
            self.op_actions = tf.placeholder(tf.float32, [None, dim_action], 'actions')
            self.op_next_states = tf.placeholder(tf.float32, [None, dim_state], 'next_states')
            self.op_rewards = tf.placeholder(tf.float32, [None], 'rewards')
            self.op_terminals = tf.placeholder(tf.float32, [None], 'terminals')
            self.op_tau = tf.placeholder(tf.float32, [], 'tau')

            self.op_log_alpha = nn.Parameter(tf.log(init_alpha), name="log_alpha")

            target_params, source_params = self.critic_target.parameters(), self.critic.parameters()
            self.op_update_critic_target = tf.group(
                *[tf.assign(v_t, self.op_tau * v_t + (1 - self.op_tau) * v_s)
                  for v_t, v_s in zip(target_params, source_params)])

            self.op_actor_loss, self.op_critic_loss, self.op_alpha_loss, self.op_entropy, self.op_q_value, \
                self.op_dist_mean, self.op_dist_std, self.op_a1, self.op_a2, self.op_log_prob_a1 = self(
                        states=self.op_states, actions=self.op_actions, next_states=self.op_next_states,
                        rewards=self.op_rewards, terminals=self.op_terminals, log_alpha=self.op_log_alpha
                    )

            actor_optimizer = tf.train.AdamOptimizer(learning_rate=self.actor_lr)
            critic_optimizer = tf.train.AdamOptimizer(learning_rate=self.critic_lr)
            alpha_optimizer = tf.train.AdamOptimizer(learning_rate=self.alpha_lr)

            self.op_actor_train = actor_optimizer.minimize(self.op_actor_loss, var_list=self.actor.parameters())
            self.op_critic_train = critic_optimizer.minimize(self.op_critic_loss, var_list=self.critic.parameters())
            self.op_alpha_train = alpha_optimizer.minimize(self.op_alpha_loss, var_list=[self.op_log_alpha])

            self.op_actor_norm = tf.global_norm(self.actor.parameters())
            self.op_critic_norm = tf.global_norm(self.critic.parameters())

            self.op_alpha = tf.exp(self.op_log_alpha)
        self.iterations = 0
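
The critic target is refreshed with a Polyak average; note the convention above keeps a fraction tau of the old target, i.e. target = tau * target + (1 - tau) * source. A minimal NumPy sketch of the same update outside the TF graph (names are illustrative):

import numpy as np

def soft_update(target_params, source_params, tau):
    """In-place Polyak averaging, matching the tau convention used in op_update_critic_target."""
    for v_t, v_s in zip(target_params, source_params):
        v_t *= tau
        v_t += (1.0 - tau) * np.asarray(v_s)
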
Example #7
    def __init__(self, nin, nh, init_scale=1., init_bias=0.):
        super().__init__()
        self.w = nn.Parameter(
            ortho_initializer(init_scale)([nin, nh], np.float32), "w")
        self.b = nn.Parameter(tf.constant_initializer(init_bias)([nh]), "b")
Example #8
    def __init__(self,
                 dim_state: int,
                 dim_action: int,
                 hidden_sizes: List[int],
                 normalizers: Normalizers,
                 output_diff=False,
                 init_std=1.):
        super().__init__()
        self.dim_state = dim_state
        self.dim_action = dim_action
        self.hidden_sizes = hidden_sizes
        self.output_diff = output_diff
        self.init_std = init_std
        self.normalizers = normalizers
        with self.scope:
            self.op_states = tf.placeholder(tf.float32,
                                            shape=[None, dim_state],
                                            name='states')
            self.op_actions = tf.placeholder(tf.float32,
                                             shape=[None, dim_action],
                                             name='actions')
            self.op_next_states_ = tf.placeholder(tf.float32,
                                                  shape=[None, dim_state],
                                                  name='next_states')

            layers = []
            all_sizes = [dim_state + dim_action, *self.hidden_sizes]
            for i, (in_features, out_features) in enumerate(
                    zip(all_sizes[:-1], all_sizes[1:])):
                layers.append(FCLayer(in_features, out_features))
                layers.append(nn.Tanh())
            layers.append(FCLayer(all_sizes[-1], dim_state, init_scale=0.01))
            self.net = nn.Sequential(*layers)

            self.op_log_std = nn.Parameter(tf.constant(np.log(self.init_std),
                                                       shape=[self.dim_state],
                                                       dtype=tf.float32),
                                           name='log_std')

            self.distribution = self(self.op_states, self.op_actions)
            self.op_next_states_std = self.distribution.stddev()
            mean, std = self.distribution.mean(), self.distribution.stddev()
            clipped_sample = tf.clip_by_value(self.distribution.sample(),
                                              mean - 3 * std, mean + 3 * std)
            if self.output_diff:
                # The network predicts a normalized state difference, added to the current state.
                self.op_next_states_mean = self.op_states + self.normalizers.diff(mean, inverse=True)
                self.op_next_states = self.op_states + self.normalizers.diff(clipped_sample, inverse=True)
            else:
                # The network predicts the (normalized) next state directly.
                self.op_next_states_mean = self.normalizers.state(mean, inverse=True)
                self.op_next_states = self.normalizers.state(clipped_sample, inverse=True)
            self.op_mse_loss = tf.reduce_mean(tf.square(
                self.normalizers.state(self.op_next_states_) -
                self.normalizers.state(self.op_next_states_mean)))
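
Both branches above clip the sampled, normalized prediction to mean ± 3 std before mapping it back to state space; with output_diff=True the network predicts a normalized state difference rather than the next state itself. A small NumPy sketch of that sampling step (the denormalize callback stands in for the normalizer's inverse transform and is an assumption):

import numpy as np

def sample_next_state(state, mean, std, denormalize, output_diff=True, rng=None):
    """Sample from N(mean, std), clip to +/- 3 std, then map back to state space."""
    rng = rng or np.random.default_rng()
    sample = rng.normal(mean, std)
    clipped = np.clip(sample, mean - 3 * std, mean + 3 * std)
    pred = denormalize(clipped)
    return state + pred if output_diff else pred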