def positional_encoding(inputs,
                        num_units,
                        zero_pad=True,
                        scale=True,
                        scope="positional_encoding",
                        reuse=None):
    N, T = inputs.get_shape().as_list()
    with tf.variable_scope(scope, reuse=reuse):
        # Position indices [0, 1, ..., T-1], tiled over the batch
        position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])
        # Sinusoidal table: sin on even dimensions, cos on odd dimensions
        position_enc = np.array([[
            pos / np.power(10000, 2. * i / num_units) for i in range(num_units)
        ] for pos in range(T)])
        position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])
        position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])

        lookup_table = tf.convert_to_tensor(position_enc, dtype=tf.float32)

        if zero_pad:
            lookup_table = tf.concat(
                (tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, position_ind)

        if scale:
            outputs = outputs * num_units**0.5
    return outputs
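# A minimal usage sketch for positional_encoding; illustrative only. It
# assumes the standard imports used throughout this file and a statically
# known batch size and sequence length (N and T come from the static shape):
import numpy as np
import tensorflow as tf

x = tf.placeholder(tf.int32, shape=(32, 10))     # [batch, seq_len] token ids
pos_enc = positional_encoding(x, num_units=512)  # -> [32, 10, 512] float32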
def main(unused_args):
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")

    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data

    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = PTBModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = PTBModel(is_training=False, config=config)
            mtest = PTBModel(is_training=False, config=eval_config)

        tf.global_variables_initializer().run()

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay**max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            train_perplexity = run_epoch(session,
                                         m,
                                         train_data,
                                         m.train_op,
                                         verbose=True)

            print("Epoch: %d Train Perplexity: %.3f" %
                  (i + 1, train_perplexity))
            valid_perplexity = run_epoch(session, mvalid, valid_data,
                                         tf.no_op())
            print("Epoch: %d Valid Perplexity: %.3f" %
                  (i + 1, valid_perplexity))
        test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
        print("Test Perplexity: %.3f" % test_perplexity)
    def __init__(self, is_training, config):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        size = config.hidden_size  # hidden layer size
        vocab_size = config.vocab_size  # vocabulary size
        # input placeholders
        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        def lstm_cell():
            cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
            if is_training and config.keep_prob < 1:
                cell = rnn_cell.DropoutWrapper(
                    cell, output_keep_prob=config.keep_prob)
            return cell

        # Build a fresh cell per layer; reusing one cell object across layers
        # is an error in TF >= 1.1.
        cell = rnn_cell.MultiRNNCell(
            [lstm_cell() for _ in range(config.num_layers)])

        self._initial_state = cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size])
            inputs = tf.nn.embedding_lookup(embedding, self._input_data)

        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        outputs = []
        states = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)
                states.append(state)

        output = tf.reshape(tf.concat(outputs, 1), [-1, size])
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        logits = tf.matmul(output, softmax_w) + softmax_b
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits], [tf.reshape(self._targets, [-1])],
            [tf.ones([batch_size * num_steps])])
        self._cost = cost = tf.reduce_sum(loss) / batch_size
        self._final_state = states[-1]

        if not is_training:
            return
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self._train_op = optimizer.apply_gradients(zip(grads, tvars))
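
    # main() above also relies on m.lr, m.train_op and m.assign_lr, which are
    # not part of this excerpt. A minimal sketch in the style of the classic
    # TF PTB tutorial (assumed, not taken from the original source):
    def assign_lr(self, session, lr_value):
        session.run(tf.assign(self._lr, lr_value))

    @property
    def cost(self):
        return self._cost

    @property
    def lr(self):
        return self._lr

    @property
    def train_op(self):
        return self._train_op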
def scaled_dotproduct_attention(queries,
                                keys,
                                num_unit=None,
                                num_heads=0,
                                dropout_rate=0,
                                is_training=True,
                                causality=False,
                                scope="scaled_att",
                                reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        if num_unit is None:
            num_unit = queries.get_shape().as_list()[-1]
        # linear projections
        Q = tf.layers.dense(queries, num_unit, activation=tf.nn.relu)
        K = tf.layers.dense(keys, num_unit, activation=tf.nn.relu)
        V = tf.layers.dense(keys, num_unit, activation=tf.nn.relu)

        outputs = tf.matmul(Q, tf.transpose(K, [0, 2, 1]))
        outputs = outputs / (K.get_shape().as_list()[-1]**0.5)

        # Mask the padded key positions so their attention scores become tiny
        key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))
        key_masks = tf.tile(tf.expand_dims(key_masks, 1),
                            [1, tf.shape(queries)[1], 1])

        paddings = tf.ones_like(outputs) * (-2**32 + 1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)

        # Causality mask: hide future positions from the model
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :])
            tril = tf.contrib.linalg.LinearOperatorTril(diag_vals).to_dense()
            masks = tf.tile(tf.expand_dims(tril, 0),
                            [tf.shape(outputs)[0], 1, 1])

            paddings = tf.ones_like(masks) * (-2**32 + 1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs)
        outputs = tf.nn.softmax(outputs)
        # Query mask
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))
        query_masks = tf.tile(tf.expand_dims(query_masks, -1),
                              [1, 1, tf.shape(keys)[1]])
        outputs *= query_masks
        outputs = tf.layers.dropout(
            outputs,
            rate=dropout_rate,
            training=tf.convert_to_tensor(is_training))
        # weighted sum over the values
        outputs = tf.matmul(outputs, V)
        # residual connection
        outputs += queries
        outputs = normalize(outputs)
    return outputs
def normalize(inputs, epsilon=1e-8, scope="ln", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        inputs_shape = inputs.get_shape()
        param_shape = inputs_shape[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.Variable(tf.zeros(param_shape))
        gamma = tf.Variable(tf.ones(param_shape))
        normalized = (inputs - mean) / ((variance + epsilon)**(.5))
        outputs = gamma * normalized + beta

    return outputs
def conv_network(x_dict, n_classes, dropout, reuse, is_training):

    with tf.variable_scope('ConvNetwork', reuse=reuse):
        x = x_dict['images']

        x = tf.reshape(x, shape=[-1, 28, 28, 1])

        conv1 = tf.layers.conv2d(x, 32, 5, activation=tf.nn.relu)
        conv1 = tf.layers.max_pooling2d(conv1, 2, 2, padding='SAME')

        conv2 = tf.layers.conv2d(conv1, 64, 3, activation=tf.nn.relu)
        conv2 = tf.layers.max_pooling2d(conv2, 2, 2)
        # Fully connected layer; flatten the previous output first
        fc1 = tf.contrib.layers.flatten(conv2)

        fc1 = tf.layers.dense(fc1, 1024)

        fc1 = tf.layers.dropout(fc1, rate=dropout, training=is_training)
        out = tf.layers.dense(fc1, n_classes)
    return out
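# A minimal sketch of wiring conv_network into an Estimator-style model_fn
# (this wrapper is an assumption for illustration; `features` must be a dict
# with an 'images' key, and n_classes=10 / dropout=0.25 are placeholders):
def model_fn(features, labels, mode):
    # Two graphs over shared weights: dropout active only for training.
    logits_train = conv_network(features, n_classes=10, dropout=0.25,
                                reuse=False, is_training=True)
    logits_test = conv_network(features, n_classes=10, dropout=0.25,
                               reuse=True, is_training=False)

    pred_classes = tf.argmax(logits_test, axis=1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=pred_classes)

    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits_train, labels=tf.cast(labels, tf.int64)))
    train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(
        loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)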
def embedding(inputs,
              vocab_size,
              num_units,
              zero_pad=True,
              scale=True,
              scope="embedding",
              reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        lookup_table = tf.get_variable(
            'lookup_table',
            dtype=tf.float32,
            shape=[vocab_size, num_units],
            initializer=tf.contrib.layers.xavier_initializer())
        if zero_pad:
            lookup_table = tf.concat(
                (tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, inputs)

        if scale:
            outputs = outputs * (num_units**0.5)
    return outputs
    def __init__(self, is_training=True):
        self.graph = tf.Graph()

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()
            else:
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # Decoder inputs: shift targets right and prepend the <S> token (id 2)
            self.decode_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()
            with tf.variable_scope("encoder"):
                # embedding
                self.enc = embedding(self.x,
                                     vocab_size=len(de2idx),
                                     num_units=hp.hidden_units,
                                     zero_pad=True,
                                     scale=True,
                                     scope="enc_embed")
                # pos embedding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=True,
                                                    scale=False,
                                                    scope="enc_pos")
                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pos")
def conv_net(x, n_classes, dropout, reuse, is_training):
    with tf.variable_scope('ConvNet', reuse=reuse):
        x = tf.reshape(x, shape=[-1, 28, 28, 1])

        x = tf.layers.conv2d(x, 64, 5, activation=tf.nn.relu)
        x = tf.layers.max_pooling2d(x, 2, 2)

        x = tf.layers.conv2d(x, 256, 3, activation=tf.nn.relu)
        x = tf.layers.conv2d(x, 512, 3, activation=tf.nn.relu)

        x = tf.layers.max_pooling2d(x, 2, 2)

        x = tf.contrib.layers.flatten(x)
        # fully connected layer
        x = tf.layers.dense(x, 2048)
        x = tf.layers.dropout(x, rate=dropout, training=is_training)

        x = tf.layers.dense(x, 1024)
        x = tf.layers.dropout(x, rate=dropout, training=is_training)

        out = tf.layers.dense(x, n_classes)
        out = tf.nn.softmax(out) if not is_training else out
    return out
def feedforward(inputs, num_units=[2048, 512], scope="forward", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # inner layer
        params = {
            "inputs": inputs,
            "filters": num_units[0],
            "kernel_size": 1,
            "activation": tf.nn.relu,
            "use_bias": True
        }
        outputs = tf.layers.conv1d(**params)

        # readout layer
        params = {
            "inputs": inputs,
            "filters": num_units[1],
            "kernel_size": 1,
            "activation": None,
            "use_bias": True
        }
        outputs = tf.layers.conv1d(**params)

        outputs += inputs
        outputs = normalize(outputs)
    return outputs
def inference(images):

    # Build the model
    # conv layer 1
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 3, 64],
                                             stddev=1e-4,
                                             wd=0.0)
        conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
        bias = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(bias, name=scope.name)
        _activation_summary(conv1)
    # pooling layer 1
    pool1 = tf.nn.max_pool(conv1,
                           ksize=[1, 3, 3, 1],
                           strides=[1, 2, 2, 1],
                           padding='SAME',
                           name="pool1")
    # local response normalization 1
    norm1 = tf.nn.lrn(pool1,
                      4,
                      bias=1.0,
                      alpha=0.001 / 9.0,
                      beta=0.75,
                      name='norm1')

    # conv layer 2
    with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 64, 64],
                                             stddev=1e-4,
                                             wd=0.0)
        conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        bias = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(bias, name=scope.name)
        _activation_summary(conv2)
    # local response normalization 2
    norm2 = tf.nn.lrn(conv2,
                      4,
                      bias=1.0,
                      alpha=0.001 / 9.0,
                      beta=0.75,
                      name='norm2')
    # pooling layer 2
    pool2 = tf.nn.max_pool(norm2,
                           ksize=[1, 3, 3, 1],
                           strides=[1, 2, 2, 1],
                           padding='SAME',
                           name='pool2')

    # Fully connected layer with ReLU; flatten the previous output first
    with tf.variable_scope('local3') as scope:
        dim = 1
        # flatten the previous layer's output
        for d in pool2.get_shape()[1:].as_list():
            dim *= d
        reshape = tf.reshape(pool2, [FLAGS.batch_size, dim])
        weights = _variable_with_weight_decay('weights',
                                              shape=[dim, 384],
                                              stddev=0.04,
                                              wd=0.004)
        biases = _variable_on_cpu('biases', [384],
                                  tf.constant_initializer(0.1))

        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
                            name=scope.name)
        _activation_summary(local3)
    # fully connected layer with ReLU
    with tf.variable_scope('local4') as scope:
        weights = _variable_with_weight_decay('weights',
                                              shape=[384, 192],
                                              stddev=0.04,
                                              wd=0.004)
        biases = _variable_on_cpu('biases', [192],
                                  tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases,
                            name=scope.name)
        _activation_summary(local4)

    # softmax (linear) layer
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
                                              stddev=1 / 192.0,
                                              wd=0.0)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0))
        softmax_linear = tf.add(tf.matmul(local4, weights),
                                biases,
                                name=scope.name)
        _activation_summary(softmax_linear)

    return softmax_linear
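# A companion loss in the style of the classic CIFAR-10 tutorial that this
# inference() follows (a sketch; NUM_CLASSES and the 'losses' collection fed
# by _variable_with_weight_decay are assumed from the surrounding code):
def loss(logits, labels):
    labels = tf.cast(labels, tf.int64)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels, name='cross_entropy_per_example')
    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
    tf.add_to_collection('losses', cross_entropy_mean)
    # Total loss = cross entropy plus the L2 weight-decay terms collected
    # in the 'losses' collection.
    return tf.add_n(tf.get_collection('losses'), name='total_loss')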
def multihead_attention(queries,
                        keys,
                        num_units=None,
                        num_heads=8,
                        dropout_rate=0,
                        is_training=True,
                        causality=False,
                        scope="multihead_attention",
                        reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        if num_units is None:
            num_units = queries.get_shape().as_list()[-1]

        # linear projection
        Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu)
        K = tf.layers.dense(keys, num_units, activation=tf.nn.relu)
        V = tf.layers.dense(keys, num_units, activation=tf.nn.relu)
        # split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)

        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
        outputs = outputs / (K_.get_shape().as_list()[-1]**0.5)
        # key mask: hide padded key positions
        key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))
        key_masks = tf.tile(key_masks, [num_heads, 1])
        key_masks = tf.tile(tf.expand_dims(key_masks, 1),
                            [1, tf.shape(queries)[1], 1])

        paddings = tf.ones_like(outputs) * (-2**32 + 1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)

        # causality mask: hide future positions
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :])
            tril = tf.contrib.linalg.LinearOperatorTril(diag_vals).to_dense()
            masks = tf.tile(tf.expand_dims(tril, 0),
                            [tf.shape(outputs)[0], 1, 1])

            paddings = tf.ones_like(masks) * (-2**32 + 1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs)
        outputs = tf.nn.softmax(outputs)

        # query mask
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))
        query_masks = tf.tile(query_masks, [num_heads, 1])
        query_masks = tf.tile(tf.expand_dims(query_masks, -1),
                              [1, 1, tf.shape(keys)[1]])
        outputs *= query_masks

        outputs = tf.layers.dropout(outputs,
                                    rate=dropout_rate,
                                    training=tf.convert_to_tensor(is_training))

        outputs = tf.matmul(outputs, V_)
        # restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)

        # residual connection
        outputs += queries
        outputs = normalize(outputs)
    return outputs
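# A minimal sketch of one Transformer encoder block assembled from the pieces
# above (shapes and hyperparameters are illustrative assumptions):
x = tf.placeholder(tf.int32, shape=(32, 10))  # [batch, seq_len] token ids
enc = embedding(x, vocab_size=10000, num_units=512, scope="embed")
enc += positional_encoding(x, num_units=512, scale=False, scope="pos")
enc = multihead_attention(queries=enc, keys=enc,
                          num_units=512, num_heads=8,
                          dropout_rate=0.1, is_training=True,
                          causality=False)
# multihead_attention and feedforward both end with a residual connection
# followed by normalize(), so the block output keeps shape [32, 10, 512].
enc = feedforward(enc, num_units=[2048, 512])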