Code example #1
def encoder(x, nonlin):
    # Four stride-2 convolutions (64 -> 512 channels); kernel size, stride,
    # weight initializer and leaky-ReLU slope are supplied via argscope.
    w_init = O.truncated_normal_initializer(stddev=0.02)
    with O.argscope(O.conv2d, O.deconv2d, kernel=4, stride=2, W=w_init),\
         O.argscope(O.leaky_relu, alpha=0.2):

        _ = x
        _ = O.conv2d('conv1', _, 64, nonlin=O.leaky_relu)
        _ = O.conv2d('conv2', _, 128, nonlin=nonlin, use_bias=False)
        _ = O.conv2d('conv3', _, 256, nonlin=nonlin, use_bias=False)
        _ = O.conv2d('conv4', _, 512, nonlin=nonlin, use_bias=False)
        z = _
    return z
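
Each of these builders leans on O.argscope to supply shared keyword arguments (kernel size, stride, weight initializer, leaky-ReLU slope) to every listed op inside the with-block, which keeps the per-layer calls short. The sketch below illustrates the idea in plain Python; argscope, scoped and conv2d here are simplified stand-ins for illustration only, not the library's actual implementation.

import contextlib
import functools

_DEFAULTS = {}  # default kwargs currently installed for each wrapped op

def scoped(f):
    # Decorator: merge the currently installed defaults under any explicit
    # call-site keyword arguments.
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        merged = {**_DEFAULTS.get(wrapper, {}), **kwargs}
        return f(*args, **merged)
    return wrapper

@contextlib.contextmanager
def argscope(*funcs, **defaults):
    # Temporarily register `defaults` for each listed op, restoring the
    # previous defaults when the with-block exits.
    saved = {f: _DEFAULTS.get(f) for f in funcs}
    for f in funcs:
        _DEFAULTS[f] = {**_DEFAULTS.get(f, {}), **defaults}
    try:
        yield
    finally:
        for f, old in saved.items():
            if old is None:
                _DEFAULTS.pop(f, None)
            else:
                _DEFAULTS[f] = old

@scoped
def conv2d(name, x, channels, kernel=3, stride=1):
    return '{}: {} channels, kernel={}, stride={}'.format(name, channels, kernel, stride)

with argscope(conv2d, kernel=4, stride=2):
    print(conv2d('conv1', None, 64))   # conv1: 64 channels, kernel=4, stride=2
print(conv2d('conv1', None, 64))       # conv1: 64 channels, kernel=3, stride=1
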
Code example #2
def generator(z):
    # Project the latent vector to a 7x7x128 feature map, then upsample with
    # two stride-2 deconvolutions to a single-channel image with sigmoid output.
    w_init = O.truncated_normal_initializer(stddev=0.02)
    with O.argscope(O.conv2d, O.deconv2d, kernel=4, stride=2, W=w_init),\
         O.argscope(O.fc, W=w_init):

        _ = z
        _ = O.fc('fc1', _, 1024, nonlin=O.bn_relu)
        _ = O.fc('fc2', _, 128 * 7 * 7, nonlin=O.bn_relu)
        _ = O.reshape(_, [-1, 7, 7, 128])
        _ = O.deconv2d('deconv1', _, 64, nonlin=O.bn_relu)
        _ = O.deconv2d('deconv2', _, 1)
        _ = O.sigmoid(_, 'out')
    return _
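
Assuming 'SAME'-style padding, each stride-2 deconv2d exactly doubles the spatial side, so the 7x7 feature map grows to 14x14 and then 28x28, an MNIST-sized single-channel image. A small sketch of that size arithmetic (the padding mode is an assumption; with 'VALID' padding the sizes would differ):

def deconv_output_side(input_side, stride=2):
    # With 'SAME'-style padding, a stride-s transposed convolution scales the
    # spatial side by s: out = in * stride.
    return input_side * stride

side = 7                       # after O.reshape(_, [-1, 7, 7, 128])
for name in ('deconv1', 'deconv2'):
    side = deconv_output_side(side)
    print(name, '->', side)    # deconv1 -> 14, deconv2 -> 28
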
Code example #3
def decoder(z):
    # Four stride-2 deconvolutions upsampling the latent feature map to an image;
    # `c` (the number of output channels) comes from the enclosing scope of this excerpt.
    w_init = O.truncated_normal_initializer(stddev=0.02)
    with O.argscope(O.conv2d, O.deconv2d, kernel=4, stride=2, W=w_init),\
         O.argscope(O.fc, W=w_init):

        _ = z
        _ = O.deconv2d('deconv1', _, 256, nonlin=O.bn_relu)
        _ = O.deconv2d('deconv2', _, 128, nonlin=O.bn_relu)
        _ = O.deconv2d('deconv3', _, 64, nonlin=O.bn_relu)
        _ = O.deconv2d('deconv4', _, c)
        _ = O.sigmoid(_, name='out')
    x = _
    return x
Code example #4
def discriminator(img):
    # Two stride-2 convolutions followed by fully connected layers; the final
    # 1-unit layer returns an unnormalized real/fake score (a logit).
    w_init = O.truncated_normal_initializer(stddev=0.02)
    with O.argscope(O.conv2d, O.deconv2d, kernel=4, stride=2, W=w_init),\
         O.argscope(O.fc, W=w_init),\
         O.argscope(O.leaky_relu, alpha=0.2):

        _ = img
        _ = O.conv2d('conv1', _, 64, nonlin=O.leaky_relu)
        _ = O.conv2d('conv2', _, 128, nonlin=O.bn_nonlin)
        _ = O.leaky_relu(_)
        _ = O.fc('fc1', _, 1024, nonlin=O.bn_nonlin)
        _ = O.leaky_relu(_)
        _ = O.fc('fct', _, 1)
    return _
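
The final O.fc('fct', _, 1) returns a raw score rather than a probability; the sigmoid is typically folded into the loss for numerical stability. The loss wiring is not part of this excerpt, so the NumPy sketch below is illustrative only and shows the standard GAN losses one would compute from such logits:

import numpy as np

def sigmoid_cross_entropy_with_logits(logits, labels):
    # Numerically stable form: max(x, 0) - x*z + log(1 + exp(-|x|)).
    return np.maximum(logits, 0) - logits * labels + np.log1p(np.exp(-np.abs(logits)))

def gan_losses(logits_real, logits_fake):
    # Discriminator: push real logits toward 1 and fake logits toward 0.
    d_loss = np.mean(sigmoid_cross_entropy_with_logits(logits_real, 1.0) +
                     sigmoid_cross_entropy_with_logits(logits_fake, 0.0))
    # Non-saturating generator loss: push fake logits toward 1.
    g_loss = np.mean(sigmoid_cross_entropy_with_logits(logits_fake, 1.0))
    return d_loss, g_loss

d_loss, g_loss = gan_losses(np.array([2.0, 1.5]), np.array([-1.0, 0.5]))
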
Code example #5
def discriminator(img):
    # Same trunk as above, but with two heads: 'score' produces the real/fake
    # logit and 'code' predicts the parameters of the latent-code distribution.
    # `env` and `zc_distrib` come from the enclosing scope of this excerpt.
    w_init = O.truncated_normal_initializer(stddev=0.02)
    with O.argscope(O.conv2d, O.deconv2d, kernel=4, stride=2, W=w_init),\
         O.argscope(O.fc, W=w_init),\
         O.argscope(O.leaky_relu, alpha=0.2):

        _ = img
        _ = O.conv2d('conv1', _, 64, nonlin=O.leaky_relu)
        _ = O.conv2d('conv2', _, 128, nonlin=O.bn_nonlin)
        _ = O.leaky_relu(_)
        _ = O.fc('fc1', _, 1024, nonlin=O.bn_nonlin)
        _ = O.leaky_relu(_)

        with env.variable_scope('score'):
            logits = O.fc('fct', _, 1)

        with env.variable_scope('code'):
            _ = O.fc('fc1', _, 128, nonlin=O.bn_nonlin)
            _ = O.leaky_relu(_)
            code = O.fc('fc2', _, zc_distrib.param_size)

    return logits, code
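
The 'code' head resembles InfoGAN's Q network: it reconstructs the latent code distribution from the shared features, and a mutual-information term rewards the generator for producing images from which the code can be recovered. The parameterization of zc_distrib is not shown in this excerpt, so the NumPy sketch below assumes a categorical code purely for illustration:

import numpy as np

def categorical_code_nll(code_logits, code_onehot):
    # Negative log-likelihood of the sampled code under the predicted
    # categorical distribution; minimizing it maximizes a variational lower
    # bound on the mutual information between the code and the generated image.
    z = code_logits - code_logits.max(axis=1, keepdims=True)
    log_prob = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
    return -np.mean((code_onehot * log_prob).sum(axis=1))

code_logits = np.array([[2.0, 0.1, -1.0]])  # output of a 3-way 'code' head
code_onehot = np.array([[1.0, 0.0, 0.0]])   # code that was fed to the generator
mi_loss = categorical_code_nll(code_logits, code_onehot)
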
Code example #6
def make_network(env):
    with env.create_network() as net:
        net.dist = O.distrib.GaussianDistribution('policy',
                                                  size=get_action_shape()[0],
                                                  fixed_std=False)

        state = O.placeholder('state', shape=(None, ) + get_input_shape())
        batch_size = state.shape[0]

        # We have to define the variable scopes here; they are needed later during optimization.

        with env.variable_scope('policy'):
            _ = state

            _ = O.fc('fc1', _, 64, nonlin=O.relu)
            _ = O.fc('fc2', _, 64, nonlin=O.relu)
            mu = O.fc('fc_mu', _, net.dist.sample_size, nonlin=O.tanh)
            logstd = O.variable('logstd',
                                O.truncated_normal_initializer(stddev=0.01),
                                shape=(net.dist.sample_size, ),
                                trainable=True)

            logstd = O.tile(logstd.add_axis(0), [batch_size, 1])
            theta = O.concat([mu, logstd], axis=1)

            policy = net.dist.sample(batch_size=batch_size,
                                     theta=theta,
                                     process_theta=True)
            policy = O.clip_by_value(policy, -1, 1)

            net.add_output(theta, name='theta')
            net.add_output(policy, name='policy')

        if env.phase == env.Phase.TRAIN:
            theta_old = O.placeholder('theta_old',
                                      shape=(None, net.dist.param_size))
            action = O.placeholder('action',
                                   shape=(None, net.dist.sample_size))
            advantage = O.placeholder('advantage', shape=(None, ))
            entropy_beta = O.scalar('entropy_beta', g.entropy_beta)

            log_prob = net.dist.log_likelihood(action,
                                               theta,
                                               process_theta=True)
            log_prob_old = net.dist.log_likelihood(action,
                                                   theta_old,
                                                   process_theta=True)

            ratio = O.exp(log_prob - log_prob_old)
            epsilon = get_env('ppo.epsilon')
            surr1 = ratio * advantage  # surrogate from conservative policy iteration
            surr2 = O.clip_by_value(ratio, 1.0 - epsilon,
                                    1.0 + epsilon) * advantage
            policy_loss = -O.reduce_mean(O.min(
                surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
            entropy = net.dist.entropy(theta, process_theta=True).mean()
            entropy_loss = -entropy_beta * entropy

            net.add_output(policy_loss, name='policy_loss')
            net.add_output(entropy_loss, name='entropy_loss')

            summary.scalar('policy_entropy', entropy)

        with env.variable_scope('value'):
            _ = state
            _ = O.fc('fc1', _, 64, nonlin=O.relu)
            _ = O.fc('fc2', _, 64, nonlin=O.relu)
            value = O.fc('fcv', _, 1)
            value = value.remove_axis(1)
            net.add_output(value, name='value')

        if env.phase == env.Phase.TRAIN:
            value_label = O.placeholder('value_label', shape=(None, ))
            value_old = O.placeholder('value_old', shape=(None, ))

            value_surr1 = O.raw_l2_loss('raw_value_loss_surr1', value,
                                        value_label)
            value_clipped = value_old + O.clip_by_value(
                value - value_old, -epsilon, epsilon)
            value_surr2 = O.raw_l2_loss('raw_value_loss_surr2', value_clipped,
                                        value_label)
            value_loss = O.reduce_mean(O.max(value_surr1, value_surr2))
            net.add_output(value_loss, name='value_loss')

        if env.phase == env.Phase.TRAIN:
            loss = O.identity(policy_loss + entropy_loss + value_loss,
                              name='total_loss')
            net.set_loss(loss)
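
The training branch above is PPO's clipped surrogate objective (L^CLIP) plus a clipped value loss. The same arithmetic on plain NumPy arrays is sketched below for reference; the epsilon default and the 0.5 factor in the squared error are illustrative assumptions (the code reads epsilon from get_env('ppo.epsilon'), and O.raw_l2_loss may use a different scaling).

import numpy as np

def ppo_losses(log_prob, log_prob_old, advantage,
               value, value_old, value_label, epsilon=0.2):
    # Probability ratio between the current and the old policy.
    ratio = np.exp(log_prob - log_prob_old)
    # Pessimistic surrogate: take the worse of the unclipped and clipped terms.
    surr1 = ratio * advantage
    surr2 = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
    policy_loss = -np.mean(np.minimum(surr1, surr2))
    # Clipped value loss: the value estimate may only move epsilon away from
    # the old estimate; the larger of the two squared errors is penalized.
    value_clipped = value_old + np.clip(value - value_old, -epsilon, epsilon)
    value_surr1 = 0.5 * (value - value_label) ** 2
    value_surr2 = 0.5 * (value_clipped - value_label) ** 2
    value_loss = np.mean(np.maximum(value_surr1, value_surr2))
    return policy_loss, value_loss
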
Code example #7
def make_network(env):
    use_linear_vr = get_env('trpo.use_linear_vr')

    with env.create_network() as net:
        net.dist = O.distrib.GaussianDistribution('policy',
                                                  size=get_action_shape()[0],
                                                  fixed_std=False)
        if use_linear_vr:
            from tartist.app.rl.utils.math import LinearValueRegressor
            net.value_regressor = LinearValueRegressor()

        state = O.placeholder('state', shape=(None, ) + get_input_shape())
        # state = O.moving_average(state)
        # state = O.clip_by_value(state, -10, 10)
        batch_size = state.shape[0]

        # We have to define the variable scopes here; they are needed later during optimization.

        with env.variable_scope('policy'):
            _ = state

            with O.argscope(O.fc):
                _ = O.fc('fc1', _, 64, nonlin=O.relu)
                _ = O.fc('fc2', _, 64, nonlin=O.relu)
                mu = O.fc('fc_mu', _, net.dist.sample_size, nonlin=O.tanh)
                logstd = O.variable(
                    'logstd',
                    O.truncated_normal_initializer(stddev=0.01),
                    shape=(net.dist.sample_size, ),
                    trainable=True)

            logstd = O.tile(logstd.add_axis(0), [batch_size, 1])
            theta = O.concat([mu, logstd], axis=1)

            policy = net.dist.sample(batch_size=batch_size,
                                     theta=theta,
                                     process_theta=True)
            policy = O.clip_by_value(policy, -1, 1)

            net.add_output(theta, name='theta')
            net.add_output(policy, name='policy')

        if env.phase == env.Phase.TRAIN:
            theta_old = O.placeholder('theta_old',
                                      shape=(None, net.dist.param_size))
            action = O.placeholder('action',
                                   shape=(None, net.dist.sample_size))
            advantage = O.placeholder('advantage', shape=(None, ))

            log_prob = net.dist.log_likelihood(action,
                                               theta,
                                               process_theta=True)
            log_prob_old = net.dist.log_likelihood(action,
                                                   theta_old,
                                                   process_theta=True)

            # Importance sampling of surrogate loss (L in paper).
            ratio = O.exp(log_prob - log_prob_old)
            policy_loss = -O.reduce_mean(ratio * advantage)

            kl = net.dist.kl(theta_p=theta_old,
                             theta_q=theta,
                             process_theta=True).mean()
            kl_self = net.dist.kl(theta_p=O.zero_grad(theta),
                                  theta_q=theta,
                                  process_theta=True).mean()
            entropy = net.dist.entropy(theta, process_theta=True).mean()

            net.add_output(policy_loss, name='policy_loss')
            net.add_output(kl, name='kl')
            net.add_output(kl_self, name='kl_self')

            summary.scalar('policy_entropy',
                           entropy,
                           collections=[rl.train.ACGraphKeys.POLICY_SUMMARIES])

        if not use_linear_vr:
            with env.variable_scope('value'):
                value = O.fc('fcv', state, 1)
                net.add_output(value, name='value')

            if env.phase == env.Phase.TRAIN:
                value_label = O.placeholder('value_label', shape=(None, ))
                value_loss = O.raw_l2_loss('raw_value_loss', value,
                                           value_label).mean(name='value_loss')
                net.add_output(value_loss, name='value_loss')
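
For the TRPO branch, the two key quantities are the importance-sampled surrogate loss and the KL divergence between the old and the new policy, which the outer optimizer uses as the trust-region constraint. Below is a NumPy sketch for a diagonal Gaussian policy parameterized by (mu, logstd), matching how theta is assembled above; the exact parameter processing inside net.dist is not shown in this excerpt, so treat the details as assumptions.

import numpy as np

def surrogate_loss(log_prob, log_prob_old, advantage):
    # Importance-sampled surrogate (the "L" objective in the TRPO paper).
    return -np.mean(np.exp(log_prob - log_prob_old) * advantage)

def diag_gaussian_kl(mu_p, logstd_p, mu_q, logstd_q):
    # KL(p || q) between diagonal Gaussians, summed over action dimensions
    # and averaged over the batch; p plays the role of theta_old above.
    var_p, var_q = np.exp(2.0 * logstd_p), np.exp(2.0 * logstd_q)
    kl = logstd_q - logstd_p + (var_p + (mu_p - mu_q) ** 2) / (2.0 * var_q) - 0.5
    return np.mean(np.sum(kl, axis=1))
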