Example 1
def make_network(env, h=None, w=None):
    with env.create_network() as net:
        if h is None:
            img = O.placeholder('img', shape=(1, None, None, 3))
        else:
            img = O.variable('img', np.zeros([1, h, w, 3]))
        net.add_output(img, name='img')

        _ = img
        _ = _ - get_env('neural_style.image_mean').reshape(1, 1, 1, 3)
        _ = O.pad_rb_multiple_of(_, 32)

        def stacked_conv(prefix, nr_convs, in_, channel, kernel=(3, 3), padding='SAME', nonlin=O.relu):
            for i in range(1, nr_convs + 1):
                in_ = O.conv2d('{}_{}'.format(prefix, i), in_, channel, kernel, padding=padding, nonlin=nonlin)
            return in_

        _ = stacked_conv('conv1', 2, _, 64)
        _ = O.pooling2d('pool1', _, (2, 2))
        _ = stacked_conv('conv2', 2, _, 128)
        _ = O.pooling2d('pool2', _, (2, 2))
        _ = stacked_conv('conv3', 3, _, 256)
        _ = O.pooling2d('pool3', _, (2, 2))
        _ = stacked_conv('conv4', 3, _, 512)
        _ = O.pooling2d('pool4', _, (2, 2))
        _ = stacked_conv('conv5', 3, _, 512)
        _ = O.pooling2d('pool5', _, (2, 2))

        for l in get_env('neural_style.content_layers'):
            net.add_output(net.find_var_by_name(l[0] + '/bias'), name=l[0])
        for l in get_env('neural_style.style_layers'):
            net.add_output(net.find_var_by_name(l[0] + '/bias'), name=l[0])
Example 2
def make_param_gs(env, var_list, name_scope):
    var_shapes = [as_tftensor(v).get_shape().as_list() for v in var_list]
    for vs, v in zip(var_shapes, var_list):
        assert None not in vs, 'Could not determine the shape for optimizable variable: {}.'.format(
            v)
    var_nr_elems = [
        as_tftensor(v).get_shape().num_elements() for v in var_list
    ]
    nr_total_elems = sum(var_nr_elems)

    param_nr_elems = nr_total_elems

    with env.name_scope(name_scope):
        # Parameter getter
        param_getter = vectorize_var_list(var_list)

        # Parameter setter
        flat_variables_tensor = O.placeholder('flat_variable_tensor',
                                              shape=(nr_total_elems, ))
        var_assigns = []

        index = 0
        for v, vs, vn in zip(var_list, var_shapes, var_nr_elems):
            value = flat_variables_tensor[index:index + vn].reshape(vs)
            # Use tf.assign because tf.group relies on code that is not third-party compatible.
            var_assigns.append(
                tf.assign(v, value, name='assign_{}'.format(escape_name(v))))
            index += vn

        param_setter = tf.group(*var_assigns)
        param_provider = as_tftensor(flat_variables_tensor)

    return param_nr_elems, param_getter, param_setter, param_provider
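The slicing above maps each variable to a contiguous block of the flat parameter vector. Below is a minimal NumPy sketch of the same index arithmetic (an illustration, not part of the example):

import numpy as np

def flatten_params(arrays):
    # Concatenate all variables into one flat vector, in list order.
    return np.concatenate([a.reshape(-1) for a in arrays])

def unflatten_params(flat, shapes):
    # Slice the flat vector back into tensors of the original shapes.
    out, index = [], 0
    for shape in shapes:
        size = int(np.prod(shape))
        out.append(flat[index:index + size].reshape(shape))
        index += size
    return out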
Example 3
    def testPaddingCenter(self):
        a = O.placeholder('a', shape=(16, 15, 15, 3))
        b = O.pad_center(a, [17, 17])
        self.assertTupleEqual(b.static_shape, (16, 17, 17, 3))

        avar = np.random.normal(size=(16, 15, 15, 3))
        bvar = np.pad(avar, [[0, 0], [1, 1], [1, 1], [0, 0]], mode='constant')
        self.assertTensorClose(b.eval(a=avar), bvar)
Example 4
    def testPaddingRBMultiple(self):
        a = O.placeholder('a', shape=(16, 15, 15, 3))
        b = O.pad_rb_multiple_of(a, 8)
        self.assertTupleEqual(b.static_shape, (16, 16, 16, 3))

        avar = np.random.normal(size=(16, 15, 15, 3))
        bvar = np.pad(avar, [[0, 0], [0, 1], [0, 1], [0, 0]], mode='constant')
        self.assertTensorClose(b.eval(a=avar), bvar)
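The test above suggests that O.pad_rb_multiple_of pads the bottom/right of the spatial dimensions up to the next multiple. A minimal NumPy sketch of that behavior, under this assumption (not part of the example):

import numpy as np

def pad_rb_multiple_of_np(x, multiple):
    # x: (N, H, W, C); pad H and W on the bottom/right up to the next multiple.
    pad_h = (-x.shape[1]) % multiple
    pad_w = (-x.shape[2]) % multiple
    return np.pad(x, [(0, 0), (0, pad_h), (0, pad_w), (0, 0)], mode='constant')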
Example 5
    def testCropLU(self):
        a = O.placeholder('a', shape=(16, 17, 17, 3))
        b = O.crop_lu(a, [15, 15])
        self.assertTupleEqual(b.static_shape, (16, 15, 15, 3))

        avar = np.random.normal(size=(16, 17, 17, 3))
        bvar = avar[:, :-2, :-2, :]
        self.assertTensorClose(b.eval(a=avar), bvar)
Example 6
def make_network(env):
    with env.create_network() as net:

        dpc = env.create_dpcontroller()
        with dpc.activate():

            def inputs():
                h, w, c = 28, 28, 1
                img = O.placeholder('img', shape=(None, h, w, c))
                return [img]

            def forward(img):
                _ = img
                _ = O.conv2d('conv1',
                             _,
                             16, (3, 3),
                             padding='SAME',
                             nonlin=O.identity)
                _ = O.batch_norm('bn1', _)
                _ = O.relu(_)
                _ = O.pooling2d('pool1', _, kernel=2)
                _ = O.conv2d('conv2',
                             _,
                             32, (3, 3),
                             padding='SAME',
                             nonlin=O.identity)
                _ = O.batch_norm('bn2', _)
                _ = O.relu(_)
                _ = O.pooling2d('pool2', _, kernel=2)
                dpc.add_output(_, name='feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        _ = dpc.outputs['feature']
        _ = O.fc('fc1', _, 64)
        _ = O.fc('fc2', _, 10)

        prob = O.softmax(_, name='prob')
        pred = _.argmax(axis=1).astype('int32', name='pred')
        net.add_output(prob)
        net.add_output(pred)

        if env.phase is env.Phase.TRAIN:
            label = O.placeholder('label', shape=(None, ), dtype='int32')
            loss = O.sparse_softmax_cross_entropy_with_logits(
                logits=_, labels=label).mean()
            loss = O.identity(loss, name='loss')
            net.set_loss(loss)

            accuracy = O.eq(label, pred).astype('float32').mean()
            error = 1. - accuracy

            summary.scalar('accuracy', accuracy)
            summary.scalar('error', error)
            summary.inference.scalar('loss', loss)
            summary.inference.scalar('accuracy', accuracy)
            summary.inference.scalar('error', error)
Example 7
def make_network(env):
    with env.create_network() as net:
        nr_classes = get_env('dataset.nr_classes')

        conv_bn_relu = functools.partial(O.conv2d, nonlin=O.bn_relu)
        conv2d = conv_bn_relu

        dpc = env.create_dpcontroller()
        with dpc.activate():
            def inputs():
                h, w, c = 32, 32, 3
                img = O.placeholder('img', shape=(None, h, w, c))
                return [img]

            def forward(img):
                _ = img
                _ = conv2d('conv1.1', _, 16, (3, 3), padding='SAME')
                _ = conv2d('conv1.2', _, 16, (3, 3), padding='SAME')
                _ = O.pooling2d('pool1', _, kernel=3, stride=2)
                _ = conv2d('conv2.1', _, 32, (3, 3), padding='SAME')
                _ = conv2d('conv2.2', _, 32, (3, 3), padding='SAME')
                _ = O.pooling2d('pool2', _, kernel=3, stride=2)
                _ = conv2d('conv3.1', _, 64, (3, 3), padding='VALID')
                _ = conv2d('conv3.2', _, 64, (3, 3), padding='VALID')
                _ = conv2d('conv3.3', _, 64, (3, 3), padding='VALID')

                dpc.add_output(_, name='feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        _ = dpc.outputs['feature']
        _ = O.fc('fc1', _, 128, nonlin=O.relu)
        _ = O.fc('fc2', _, 64, nonlin=O.relu)
        _ = O.fc('linear', _, nr_classes)

        prob = O.softmax(_, name='prob')
        pred = _.argmax(axis=1).astype('int32', name='pred')
        net.add_output(prob)
        net.add_output(pred)

        if env.phase is env.Phase.TRAIN:
            label = O.placeholder('label', shape=(None, ), dtype='int32')
            loss = O.sparse_softmax_cross_entropy_with_logits(logits=_, labels=label).mean()
            loss = O.identity(loss, name='loss')
            net.set_loss(loss)

            accuracy = O.eq(label, pred).astype('float32').mean()
            error = 1. - accuracy

            summary.scalar('accuracy', accuracy)
            summary.scalar('error', error)
            summary.inference.scalar('loss', loss)
            summary.inference.scalar('accuracy', accuracy)
            summary.inference.scalar('error', error)
Example 8
    def testAdvancedIndexing(self):
        a = O.placeholder('a', shape=(5, 5))
        a_val = np.arange(25).reshape((5, 5)).astype('float32')
        feed_dict = {a.name: a_val}

        self.assertTensorClose(a[0:3].eval(feed_dict=feed_dict), a_val[0:3])
        self.assertTensorClose(a[0:3, 0:3].eval(feed_dict=feed_dict),
                               a_val[0:3, 0:3])
        with self.assertRaises(NotImplementedError):
            self.assertTensorClose(a.set_sub[0:3](1).eval(feed_dict=feed_dict),
                                   np.array([1, 1, 1, 3, 4]))
        if True:
            self.assertTensorClose(a.ai[[0, 3]].eval(feed_dict=feed_dict),
                                   a_val[[0, 3]])
            self.assertTensorClose(
                a.ai[[0, 3], [0, 3]].eval(feed_dict=feed_dict), a_val[[0, 3],
                                                                      [0, 3]])
        with self.assertRaises(NotImplementedError):
            self.assertTensorClose(
                a.set_ai[[0, 3]](1).eval(feed_dict=feed_dict),
                np.array([1, 1, 1, 3, 4]))
Example 9
def make_network(env):
    is_train = env.phase is env.Phase.TRAIN

    with env.create_network() as net:
        h, w, c = get_input_shape()

        dpc = env.create_dpcontroller()
        with dpc.activate():

            def inputs():
                state = O.placeholder('state', shape=(None, h, w, c))
                next_state = O.placeholder('next_state', shape=(None, h, w, c))
                return [state, next_state]

            @O.auto_reuse
            def phi(x):
                _ = x / 255.0

                # Nature structure
                with O.argscope(O.conv2d, nonlin=O.relu):
                    _ = O.conv2d('conv1', _, 32, 8, stride=4)
                    _ = O.conv2d('conv2', _, 64, 4, stride=2)
                    _ = O.conv2d('conv3', _, 64, 3, stride=1)
                return _

            def forward(state, next_state):
                dpc.add_output(phi(state), name='feature')
                dpc.add_output(phi(next_state), name='next_feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        @O.auto_reuse
        def phi_fc(feature):
            _ = feature
            _ = O.fc('fc0',
                     _,
                     512,
                     nonlin=functools.partial(O.leaky_relu, alpha=0.01))
            q_pred = O.fc('fcq', _, get_player_nr_actions())
            q_max = q_pred.max(axis=1)
            q_argmax = q_pred.argmax(axis=1)
            return q_pred, q_max, q_argmax

        _ = dpc.outputs['feature']
        q_pred, q_max, q_argmax = phi_fc(_)

        _ = dpc.outputs['next_feature']
        next_q_pred, next_q_max, _ = phi_fc(_)

        net.add_output(q_pred, name='q_pred')
        net.add_output(q_max, name='q_max')
        net.add_output(q_argmax, name='q_argmax')

        if is_train:
            reward = O.placeholder('reward', shape=(None, ), dtype='float32')
            action = O.placeholder('action', shape=(None, ), dtype='int64')
            is_over = O.placeholder('is_over', shape=(None, ), dtype='bool')

            assert get_env('dqn.nr_td_steps') == 1
            this_q_pred = (q_pred *
                           O.one_hot(action, get_player_nr_actions())).sum(
                               axis=1)
            this_q_label = reward + get_env('dqn.gamma') * (
                1 - is_over.astype('float32')) * O.zero_grad(next_q_max)

            summary.scalar('this_q_pred', this_q_pred.mean())
            summary.scalar('this_q_label', this_q_label.mean())
            summary.scalar('reward', reward.mean())
            summary.scalar('is_over', is_over.astype('float32').mean())

            q_loss = O.raw_smooth_l1_loss('raw_q_loss', this_q_pred,
                                          this_q_label).mean(name='q_loss')
            net.set_loss(q_loss)
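The label above is the standard 1-step TD target with terminal masking. A minimal NumPy sketch of the same computation (an illustration, not part of the example):

import numpy as np

def td_target(reward, is_over, next_q_max, gamma):
    # y = r + gamma * (1 - is_over) * max_a' Q(s', a'), with no gradient through the target.
    return reward + gamma * (1.0 - is_over.astype('float32')) * next_q_max

def selected_q(q_pred, action):
    # Equivalent to (q_pred * one_hot(action, nr_actions)).sum(axis=1).
    return q_pred[np.arange(len(action)), action]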
Example 10
 def inputs():
     img = O.placeholder('img', shape=(None, h, w, c))
     # only used at demo time
     zc = O.placeholder('zc', shape=(1, net.zc_distrib.sample_size))
     return [img, zc]
Example 11
def make_network(env):
    with env.create_network() as net:
        net.dist = O.distrib.GaussianDistribution('policy',
                                                  size=get_action_shape()[0],
                                                  fixed_std=False)

        state = O.placeholder('state', shape=(None, ) + get_input_shape())
        batch_size = state.shape[0]

        # We have to define the variable scope here for later optimization.

        with env.variable_scope('policy'):
            _ = state

            _ = O.fc('fc1', _, 64, nonlin=O.relu)
            _ = O.fc('fc2', _, 64, nonlin=O.relu)
            mu = O.fc('fc_mu', _, net.dist.sample_size, nonlin=O.tanh)
            logstd = O.variable('logstd',
                                O.truncated_normal_initializer(stddev=0.01),
                                shape=(net.dist.sample_size, ),
                                trainable=True)

            logstd = O.tile(logstd.add_axis(0), [batch_size, 1])
            theta = O.concat([mu, logstd], axis=1)

            policy = net.dist.sample(batch_size=batch_size,
                                     theta=theta,
                                     process_theta=True)
            policy = O.clip_by_value(policy, -1, 1)

            net.add_output(theta, name='theta')
            net.add_output(policy, name='policy')

        if env.phase == env.Phase.TRAIN:
            theta_old = O.placeholder('theta_old',
                                      shape=(None, net.dist.param_size))
            action = O.placeholder('action',
                                   shape=(None, net.dist.sample_size))
            advantage = O.placeholder('advantage', shape=(None, ))
            entropy_beta = O.scalar('entropy_beta', g.entropy_beta)

            log_prob = net.dist.log_likelihood(action,
                                               theta,
                                               process_theta=True)
            log_prob_old = net.dist.log_likelihood(action,
                                                   theta_old,
                                                   process_theta=True)

            ratio = O.exp(log_prob - log_prob_old)
            epsilon = get_env('ppo.epsilon')
            surr1 = ratio * advantage  # surrogate from conservative policy iteration
            surr2 = O.clip_by_value(ratio, 1.0 - epsilon,
                                    1.0 + epsilon) * advantage
            policy_loss = -O.reduce_mean(O.min(
                surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
            entropy = net.dist.entropy(theta, process_theta=True).mean()
            entropy_loss = -entropy_beta * entropy

            net.add_output(policy_loss, name='policy_loss')
            net.add_output(entropy_loss, name='entropy_loss')

            summary.scalar('policy_entropy', entropy)

        with env.variable_scope('value'):
            _ = state
            _ = O.fc('fc1', _, 64, nonlin=O.relu)
            _ = O.fc('fc2', _, 64, nonlin=O.relu)
            value = O.fc('fcv', _, 1)
            value = value.remove_axis(1)
            net.add_output(value, name='value')

        if env.phase == env.Phase.TRAIN:
            value_label = O.placeholder('value_label', shape=(None, ))
            value_old = O.placeholder('value_old', shape=(None, ))

            value_surr1 = O.raw_l2_loss('raw_value_loss_surr1', value,
                                        value_label)
            value_clipped = value_old + O.clip_by_value(
                value - value_old, -epsilon, epsilon)
            value_surr2 = O.raw_l2_loss('raw_value_loss_surr2', value_clipped,
                                        value_label)
            value_loss = O.reduce_mean(O.max(value_surr1, value_surr2))
            net.add_output(value_loss, name='value_loss')

        if env.phase == env.Phase.TRAIN:
            loss = O.identity(policy_loss + entropy_loss + value_loss,
                              name='total_loss')
            net.set_loss(loss)
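The policy loss above is PPO's clipped surrogate objective. A minimal NumPy sketch of the same formula (an illustration, not part of the example):

import numpy as np

def ppo_clip_loss(log_prob, log_prob_old, advantage, epsilon):
    # L^CLIP = -E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)], with r = exp(log_prob - log_prob_old).
    ratio = np.exp(log_prob - log_prob_old)
    surr1 = ratio * advantage
    surr2 = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
    return -np.minimum(surr1, surr2).mean()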
Example 12
def make_network(env):
    is_train = env.phase is env.Phase.TRAIN

    # Device control: use only the master device during the training session.
    if is_train:
        slave_devices = env.slave_devices
        env.set_slave_devices([])
    
    with env.create_network() as net:
        input_length, = get_input_shape()
        action_length, = get_action_shape()

        dpc = env.create_dpcontroller()
        with dpc.activate():
            def inputs():
                state = O.placeholder('state', shape=(None, input_length))
                return [state]

            # Forward the policy network and the value network separately (actor-critic).
            def forward(x):
                _ = x
                _ = O.fc('fcp1', _, 512, nonlin=O.relu)
                _ = O.fc('fcp2', _, 256, nonlin=O.relu)
                dpc.add_output(_, name='feature_p')

                _ = x
                _ = O.fc('fcv1', _, 512, nonlin=O.relu)
                _ = O.fc('fcv2', _, 256, nonlin=O.relu)
                dpc.add_output(_, name='feature_v')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        _ = dpc.outputs['feature_p']
        # mu and std, assuming spherical covariance
        policy_mu = O.fc('fc_policy_mu', _, action_length)

        # In this example, we do not use a learned variance; instead, we use a fixed value.
        # policy_var = O.fc('fc_policy_var', _, 1, nonlin=O.softplus)
        # policy_var = O.tile(policy_var, [1, action_length], name='policy_var')
        # policy_std = O.sqrt(policy_var, name='policy_std')

        actor_space = get_env('a3c.actor_space')
        nr_bins = actor_space.shape[1]

        # Instead of a normal distribution, we use a Laplacian distribution for the policy.
        # We also sample from a truncated Laplacian distribution (only values inside the
        # action space matter). To simplify the computation, we discretize the action space.
        actor_space = O.constant(actor_space)
        actor_space = O.tile(actor_space.add_axis(0), [policy_mu.shape[0], 1, 1])
        policy_mu3 = O.tile(policy_mu.add_axis(2), [1, 1, nr_bins])

        # policy_std3 = O.tile(policy_std.add_axis(2), [1, 1, nr_bins])
        # logits = O.abs(actor_space - policy_mu3) / (policy_std3 + 1e-2)

        # Here, we force the std of the policy to be 1.
        logits_explore = -O.abs(actor_space - policy_mu3)
        policy_explore = O.softmax(logits_explore)

        # Clip the policy for output
        action_range = get_action_range()
        action_range = tuple(map(O.constant, action_range))
        action_range = tuple(map(lambda x: O.tile(x.add_axis(0), [policy_mu.shape[0], 1]), action_range))
        policy_output = O.clip_by_value(policy_mu, *action_range)

        _ = dpc.outputs['feature_v']
        value = O.fc('fc_value', _, 1)
        value = value.remove_axis(1, name='value')

        # Note that policy_explore is a discrete policy,
        # while policy is the continuous one.
        net.add_output(policy_explore, name='policy_explore')
        net.add_output(policy_output, name='policy')
        net.add_output(value, name='value')

        if is_train:
            action = O.placeholder('action', shape=(None, action_length), dtype='int64')
            future_reward = O.placeholder('future_reward', shape=(None, ))
            entropy_beta = O.scalar('entropy_beta', 0.1, trainable=False)

            # Since we discretized the action space, use cross entropy here.
            log_policy = O.log(policy_explore + 1e-4)
            log_pi_a_given_s = (log_policy * O.one_hot(action, nr_bins)).sum(axis=2).sum(axis=1)
            advantage = (future_reward - O.zero_grad(value)).rename('advantage')

            # Important trick: use only positive advantages to perform gradient ascent. This stabilizes training.
            advantage = advantage * O.zero_grad((advantage > 0.).astype('float32'))
            policy_loss = O.identity(-(log_pi_a_given_s * advantage).mean(), name='policy_loss')

            # As mentioned, there is no trainable variance.
            # entropy_loss = O.identity(-entropy_beta * (policy_std ** 2.).sum(axis=1).mean(), name='entropy_loss')

            value_loss = O.raw_smooth_l1_loss('raw_value_loss', future_reward, value).mean(name='value_loss')

            loss = O.add_n([policy_loss, value_loss], name='loss')

            net.set_loss(loss)

            for v in [policy_loss, value_loss,
                      value.mean(name='predict_value'), advantage.rms(name='rms_advantage'), loss]:
                summary.scalar(v)

    if is_train:
        env.set_slave_devices(slave_devices)
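The exploration policy above is a softmax over -|bin - mu|, i.e. a discretized Laplacian with unit scale. A minimal NumPy sketch of that construction (an illustration, not part of the example):

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def discretized_laplacian_policy(mu, actor_space):
    # mu: (N, A); actor_space: (A, nr_bins) -> per-bin probabilities of shape (N, A, nr_bins).
    logits = -np.abs(actor_space[None, :, :] - mu[:, :, None])
    return softmax(logits, axis=2)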
Example 13
 def inputs():
     state = O.placeholder('state', shape=(None, input_length))
     return [state]
Example 14
def make_network(env):
    with env.create_network() as net:
        n = 2
        nr_classes = get_env('dataset.nr_classes')

        conv2d = functools.partial(O.conv2d,
                                   kernel=3,
                                   use_bias=False,
                                   padding='SAME')
        conv_bn_relu = functools.partial(conv2d, nonlin=O.bn_relu)

        dpc = env.create_dpcontroller()
        with dpc.activate():

            def inputs():
                h, w, c = 32, 32, 3
                img = O.placeholder('img', shape=(None, h, w, c))
                return [img]

            def residual(name, x, first=False, inc_dim=False):
                in_channel = x.static_shape[3]
                out_channel = in_channel
                stride = 1
                if inc_dim:
                    out_channel = in_channel * 2
                    stride = 2
                with env.variable_scope(name):
                    _ = x if first else O.bn_relu(x)
                    _ = conv_bn_relu('conv1', _, out_channel, stride=stride)
                    _ = conv2d('conv2', _, out_channel)
                    if inc_dim:
                        x = O.pooling2d('pool', x, kernel=2)
                        x = O.pad(x, [[0, 0], [0, 0], [0, 0],
                                      [in_channel // 2, in_channel // 2]])
                print(name, x.static_shape)
                _ = _ + x
                return _

            def forward(img):
                _ = img / 128.0 - 1.0
                _ = conv_bn_relu('conv0', _, 16)
                _ = residual('res1.0', _, first=True)
                for i in range(1, n):
                    _ = residual('res1.{}'.format(i), _)
                _ = residual('res2.0', _, inc_dim=True)
                for i in range(1, n):
                    _ = residual('res2.{}'.format(i), _)
                _ = residual('res3.0', _, inc_dim=True)
                for i in range(1, n):
                    _ = residual('res3.{}'.format(i), _)

                _ = O.batch_norm('bn_last', _)
                _ = O.relu(_)

                _ = _.mean(axis=[1, 2])  # global avg pool

                dpc.add_output(_, name='feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        _ = dpc.outputs['feature']
        _ = O.fc('linear', _, nr_classes)

        prob = O.softmax(_, name='prob')
        pred = _.argmax(axis=1).astype('int32', name='pred')
        net.add_output(prob)
        net.add_output(pred)

        if env.phase is env.Phase.TRAIN:
            label = O.placeholder('label', shape=(None, ), dtype='int32')
            loss = O.sparse_softmax_cross_entropy_with_logits(
                logits=_, labels=label).mean()
            loss = O.identity(loss, name='loss')
            net.set_loss(loss)

            accuracy = O.eq(label, pred).astype('float32').mean()
            error = 1. - accuracy

            summary.scalar('accuracy', accuracy)
            summary.scalar('error', error)
            summary.inference.scalar('loss', loss)
            summary.inference.scalar('accuracy', accuracy)
            summary.inference.scalar('error', error)
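The residual helper above, when inc_dim=True, downsamples the shortcut and zero-pads its channels from C to 2C so the addition type-checks. A minimal NumPy sketch of that shortcut (an illustration, not part of the example):

import numpy as np

def shortcut_inc_dim(x):
    # x: (N, H, W, C) -> (N, H/2, W/2, 2C), matching the pooled-and-padded branch above.
    x = x[:, ::2, ::2, :]                       # stride-2 downsample (stand-in for pooling)
    c = x.shape[3]
    return np.pad(x, [(0, 0), (0, 0), (0, 0), (c // 2, c // 2)], mode='constant')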
Example 15
 def inputs():
     img = O.placeholder('img', shape=(None, h, w, c))
     if env.phase is env.Phase.TRAIN:
         return [img]
     else:
         return []
Example 16
def make_network(env):
    is_train = env.phase is env.Phase.TRAIN
    if is_train:
        slave_devices = env.slave_devices
        env.set_slave_devices([])

    with env.create_network() as net:
        h, w, c = get_input_shape()

        dpc = env.create_dpcontroller()
        with dpc.activate():

            def inputs():
                state = O.placeholder('state', shape=(None, h, w, c))
                return [state]

            def forward(x):
                _ = x / 255.0
                with O.argscope(O.conv2d, nonlin=O.relu):
                    _ = O.conv2d('conv0', _, 32, 5)
                    _ = O.max_pooling2d('pool0', _, 2)
                    _ = O.conv2d('conv1', _, 32, 5)
                    _ = O.max_pooling2d('pool1', _, 2)
                    _ = O.conv2d('conv2', _, 64, 4)
                    _ = O.max_pooling2d('pool2', _, 2)
                    _ = O.conv2d('conv3', _, 64, 3)

                dpc.add_output(_, name='feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        _ = dpc.outputs['feature']
        _ = O.fc('fc0', _, 512, nonlin=O.p_relu)
        policy = O.fc('fc_policy', _, get_player_nr_actions())
        value = O.fc('fc_value', _, 1)

        expf = O.scalar('explore_factor', 1, trainable=False)
        policy_explore = O.softmax(policy * expf, name='policy_explore')

        policy = O.softmax(policy, name='policy')
        value = value.remove_axis(1, name='value')

        net.add_output(policy_explore, name='policy_explore')
        net.add_output(policy, name='policy')
        net.add_output(value, name='value')

        if is_train:
            action = O.placeholder('action', shape=(None, ), dtype='int64')
            future_reward = O.placeholder('future_reward', shape=(None, ))

            log_policy = O.log(policy + 1e-6)
            log_pi_a_given_s = (
                log_policy *
                O.one_hot(action, get_player_nr_actions())).sum(axis=1)
            advantage = (future_reward -
                         O.zero_grad(value)).rename('advantage')
            policy_cost = (log_pi_a_given_s *
                           advantage).mean(name='policy_cost')
            xentropy_cost = (-policy *
                             log_policy).sum(axis=1).mean(name='xentropy_cost')
            value_loss = O.raw_l2_loss('raw_value_loss', future_reward,
                                       value).mean(name='value_loss')
            entropy_beta = O.scalar('entropy_beta', 0.01, trainable=False)
            loss = O.add_n(
                [-policy_cost, -xentropy_cost * entropy_beta, value_loss],
                name='loss')

            net.set_loss(loss)

            for v in [
                    policy_cost, xentropy_cost, value_loss,
                    value.mean(name='predict_value'),
                    advantage.rms(name='rms_advantage'), loss
            ]:
                summary.scalar(v)

    if is_train:
        env.set_slave_devices(slave_devices)
Example 17
 def inputs():
     img = O.placeholder('img', shape=(None, h, w, c))
     return [img]
Example 18
def make_network(env):
    with env.create_network() as net:
        state = O.placeholder('state', shape=(None, ) + get_input_shape())
        logits = O.fc('fc', state, get_action_shape())
        net.add_output(logits, name='policy')
Example 19
def make_rpredictor_network(env):
    is_train = env.phase is env.Phase.TRAIN

    with env.create_network() as net:
        h, w, c = get_input_shape()
        # Hack(MJY): force RGB input (instead of a combination of history frames).
        c = 3

        dpc = env.create_dpcontroller()
        with dpc.activate():

            def inputs():
                state = O.placeholder('state', shape=(None, h, w, c))
                t1_state = O.placeholder('t1_state', shape=(None, h, w, c))
                t2_state = O.placeholder('t2_state', shape=(None, h, w, c))
                return [state, t1_state, t2_state]

            @O.auto_reuse
            def forward_conv(x):
                _ = x / 255.0
                with O.argscope(O.conv2d, nonlin=O.relu):
                    _ = O.conv2d('conv0', _, 32, 5)
                    _ = O.max_pooling2d('pool0', _, 2)
                    _ = O.conv2d('conv1', _, 32, 5)
                    _ = O.max_pooling2d('pool1', _, 2)
                    _ = O.conv2d('conv2', _, 64, 4)
                    _ = O.max_pooling2d('pool2', _, 2)
                    _ = O.conv2d('conv3', _, 64, 3)
                return _

            def forward(x, t1, t2):
                dpc.add_output(forward_conv(x), name='feature')
                dpc.add_output(forward_conv(t1), name='t1_feature')
                dpc.add_output(forward_conv(t2), name='t2_feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        @O.auto_reuse
        def forward_fc(feature, action):
            action = O.one_hot(action, get_player_nr_actions())
            _ = O.concat([feature.flatten2(), action], axis=1)
            _ = O.fc('fc0', _, 512, nonlin=O.p_relu)
            reward = O.fc('fc_reward', _, 1)
            return reward

        action = O.placeholder('action', shape=(None, ), dtype='int64')
        net.add_output(forward_fc(dpc.outputs['feature'], action),
                       name='reward')

        if is_train:
            t1_action = O.placeholder('t1_action',
                                      shape=(None, ),
                                      dtype='int64')
            t1_reward_exp = O.exp(
                forward_fc(dpc.outputs['t1_feature'], t1_action).sum())
            t2_action = O.placeholder('t2_action',
                                      shape=(None, ),
                                      dtype='int64')
            t2_reward_exp = O.exp(
                forward_fc(dpc.outputs['t2_feature'], t2_action).sum())

            pref = O.placeholder('pref')
            pref = O.callback_injector(pref)
            p1, p2 = 1 - pref, pref

            p_greater = t1_reward_exp / (t1_reward_exp + t2_reward_exp)
            loss = -p1 * O.log(p_greater) - p2 * O.log(1 - p_greater)

            net.set_loss(loss)
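The loss above is a Bradley-Terry style cross-entropy over exponentiated segment rewards. A minimal NumPy sketch of the same formula (an illustration, not part of the example):

import numpy as np

def preference_loss(r1_sum, r2_sum, pref):
    # pref in [0, 1]: probability that the second segment is preferred over the first.
    p_greater = np.exp(r1_sum) / (np.exp(r1_sum) + np.exp(r2_sum))
    return -(1 - pref) * np.log(p_greater) - pref * np.log(1 - p_greater)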
Example 20
def make_network(env):
    use_linear_vr = get_env('trpo.use_linear_vr')

    with env.create_network() as net:
        net.dist = O.distrib.GaussianDistribution('policy',
                                                  size=get_action_shape()[0],
                                                  fixed_std=False)
        if use_linear_vr:
            from tartist.app.rl.utils.math import LinearValueRegressor
            net.value_regressor = LinearValueRegressor()

        state = O.placeholder('state', shape=(None, ) + get_input_shape())
        # state = O.moving_average(state)
        # state = O.clip_by_value(state, -10, 10)
        batch_size = state.shape[0]

        # We have to define the variable scope here for later optimization.

        with env.variable_scope('policy'):
            _ = state

            with O.argscope(O.fc):
                _ = O.fc('fc1', _, 64, nonlin=O.relu)
                _ = O.fc('fc2', _, 64, nonlin=O.relu)
                mu = O.fc('fc_mu', _, net.dist.sample_size, nonlin=O.tanh)
                logstd = O.variable(
                    'logstd',
                    O.truncated_normal_initializer(stddev=0.01),
                    shape=(net.dist.sample_size, ),
                    trainable=True)

            logstd = O.tile(logstd.add_axis(0), [batch_size, 1])
            theta = O.concat([mu, logstd], axis=1)

            policy = net.dist.sample(batch_size=batch_size,
                                     theta=theta,
                                     process_theta=True)
            policy = O.clip_by_value(policy, -1, 1)

            net.add_output(theta, name='theta')
            net.add_output(policy, name='policy')

        if env.phase == env.Phase.TRAIN:
            theta_old = O.placeholder('theta_old',
                                      shape=(None, net.dist.param_size))
            action = O.placeholder('action',
                                   shape=(None, net.dist.sample_size))
            advantage = O.placeholder('advantage', shape=(None, ))

            log_prob = net.dist.log_likelihood(action,
                                               theta,
                                               process_theta=True)
            log_prob_old = net.dist.log_likelihood(action,
                                                   theta_old,
                                                   process_theta=True)

            # Importance sampling of surrogate loss (L in paper).
            ratio = O.exp(log_prob - log_prob_old)
            policy_loss = -O.reduce_mean(ratio * advantage)

            kl = net.dist.kl(theta_p=theta_old,
                             theta_q=theta,
                             process_theta=True).mean()
            kl_self = net.dist.kl(theta_p=O.zero_grad(theta),
                                  theta_q=theta,
                                  process_theta=True).mean()
            entropy = net.dist.entropy(theta, process_theta=True).mean()

            net.add_output(policy_loss, name='policy_loss')
            net.add_output(kl, name='kl')
            net.add_output(kl_self, name='kl_self')

            summary.scalar('policy_entropy',
                           entropy,
                           collections=[rl.train.ACGraphKeys.POLICY_SUMMARIES])

        if not use_linear_vr:
            with env.variable_scope('value'):
                value = O.fc('fcv', state, 1)
                net.add_output(value, name='value')

            if env.phase == env.Phase.TRAIN:
                value_label = O.placeholder('value_label', shape=(None, ))
                value_loss = O.raw_l2_loss('raw_value_loss', value,
                                           value_label).mean(name='value_loss')
                net.add_output(value_loss, name='value_loss')
Example 21
 def inputs():
     state = O.placeholder('state', shape=(None, h, w, c))
     return [state]
Example 22
 def inputs():
     img_a = O.placeholder('img_a', shape=(None, h, w, c))
     img_b = O.placeholder('img_b', shape=(None, h, w, c))
     return [img_a, img_b]