import numpy as np
import tensorflow as tf


def positional_encoding(inputs, num_units, zero_pad=True, scale=True,
                        scope="positional_encoding", reuse=None):
    # Sinusoidal positional encoding; `inputs` is only used for its static shape.
    N, T = inputs.get_shape().as_list()
    with tf.variable_scope(scope, reuse=reuse):
        position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])

        position_enc = np.array([[
            pos / np.power(10000, 2. * i / num_units) for i in range(num_units)
        ] for pos in range(T)])
        position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])
        position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])

        lookup_table = tf.convert_to_tensor(position_enc, dtype=tf.float32)
        if zero_pad:
            lookup_table = tf.concat(
                (tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, position_ind)

        if scale:
            outputs = outputs * num_units**0.5
        return outputs
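A minimal usage sketch (not from the original source); the batch size, sequence length, and model width below are illustrative, and the input tensor only supplies its static shape:

x = tf.placeholder(tf.int32, shape=(32, 10))   # [batch, time] token ids
pos_enc = positional_encoding(x, num_units=512, zero_pad=True, scale=False)
# pos_enc: [32, 10, 512]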
def main(unused_args):
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")

    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data

    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = PTBModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = PTBModel(is_training=False, config=config)
            mtest = PTBModel(is_training=False, config=eval_config)

        tf.initialize_all_variables().run()

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            train_perplexity = run_epoch(session, m, train_data, m.train_op,
                                         verbose=True)
            print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
            valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op())
            print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

        test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
def __init__(self, is_training, config):
    self.batch_size = batch_size = config.batch_size  # batch size
    self.num_steps = num_steps = config.num_steps     # number of unrolled steps
    size = config.hidden_size                         # hidden layer size
    vocab_size = config.vocab_size                    # vocabulary size

    # input placeholders
    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    lstm_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
    if is_training and config.keep_prob < 1:
        lstm_cell = rnn_cell.DropoutWrapper(
            lstm_cell, output_keep_prob=config.keep_prob)
    cell = rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, size])
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)

    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    outputs = []
    states = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            (cell_output, state) = cell(inputs[:, time_step, :], state)
            outputs.append(cell_output)
            states.append(state)

    output = tf.reshape(tf.concat(outputs, 1), [-1, size])
    softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    logits = tf.matmul(output, softmax_w) + softmax_b
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(self._targets, [-1])],
        [tf.ones([batch_size * num_steps])])
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    self._final_state = states[-1]

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def scaled_dotproduct_attention(queries, keys, num_unit=None, num_heads=0,
                                dropout_rate=0, is_training=True,
                                causality=False, scope="scaled_att",
                                reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        if num_unit is None:
            num_unit = queries.get_shape().as_list()[-1]

        # linear projections
        Q = tf.layers.dense(queries, num_unit, activation=tf.nn.relu)
        K = tf.layers.dense(keys, num_unit, activation=tf.nn.relu)
        V = tf.layers.dense(keys, num_unit, activation=tf.nn.relu)

        outputs = tf.matmul(Q, tf.transpose(K, [0, 2, 1]))
        outputs = outputs / (K.get_shape().as_list()[-1]**0.5)

        # Mask the padded key positions so their attention scores become very small.
        key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))
        key_masks = tf.tile(tf.expand_dims(key_masks, 1),
                            [1, tf.shape(queries)[1], 1])
        paddings = tf.ones_like(outputs) * (-2**32 + 1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)

        # Causal mask: hide future positions from the model.
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :])
            tril = tf.contrib.linalg.LinearOperatorTril(diag_vals).to_dense()
            masks = tf.tile(tf.expand_dims(tril, 0),
                            [tf.shape(outputs)[0], 1, 1])
            paddings = tf.ones_like(masks) * (-2**32 + 1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs)

        outputs = tf.nn.softmax(outputs)

        # query mask
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))
        query_masks = tf.tile(tf.expand_dims(query_masks, -1),
                              [1, 1, tf.shape(keys)[1]])
        outputs *= query_masks

        outputs = tf.layers.dropout(
            outputs, rate=dropout_rate,
            training=tf.convert_to_tensor(is_training))

        # weighted sum over the values
        outputs = tf.matmul(outputs, V)
        # outputs += queries
        outputs = normalize(outputs)
        return outputs
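A minimal usage sketch of the single-head attention block above (not from the original source); the placeholder shapes are illustrative:

q = tf.placeholder(tf.float32, shape=(None, 10, 512))  # [batch, T_q, channels]
k = tf.placeholder(tf.float32, shape=(None, 10, 512))  # [batch, T_k, channels]
att = scaled_dotproduct_attention(q, k, num_unit=512, dropout_rate=0.1,
                                  is_training=True, causality=True)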
def normalize(inputs, epsilon=1e-8, scope="ln", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        inputs_shape = inputs.get_shape()
        param_shape = inputs_shape[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.Variable(tf.zeros(param_shape))
        gamma = tf.Variable(tf.ones(param_shape))
        normalized = (inputs - mean) / ((variance + epsilon)**(.5))
        outputs = gamma * normalized + beta
        return outputs
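A minimal layer-normalization usage sketch (illustrative shapes; the scope name is hypothetical):

h = tf.placeholder(tf.float32, shape=(None, 10, 512))  # [batch, time, channels]
h_ln = normalize(h, scope="ln_example")                # normalizes the last axis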
def conv_network(x_dict, n_classes, dropout, reuse, is_training):
    with tf.variable_scope('ConvNetwork', reuse=reuse):
        x = x_dict['images']
        x = tf.reshape(x, shape=[-1, 28, 28, 1])

        conv1 = tf.layers.conv2d(x, 32, 5, activation=tf.nn.relu)
        conv1 = tf.layers.max_pooling2d(conv1, 2, 2, padding='SAME')

        conv2 = tf.layers.conv2d(conv1, 64, 3, activation=tf.nn.relu)
        conv2 = tf.layers.max_pooling2d(conv2, 2, 2)

        # Fully connected layer; the previous output has to be flattened first.
        fc1 = tf.contrib.layers.flatten(conv2)
        fc1 = tf.layers.dense(fc1, 1024)
        fc1 = tf.layers.dropout(fc1, rate=dropout, training=is_training)

        out = tf.layers.dense(fc1, n_classes)
    return out
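A minimal usage sketch (not from the original source) showing how this network might be built twice, once for training and once for evaluation with shared weights; the feature dictionary and hyperparameters are illustrative:

features = {'images': tf.placeholder(tf.float32, shape=(None, 784))}
logits_train = conv_network(features, n_classes=10, dropout=0.25,
                            reuse=False, is_training=True)
logits_test = conv_network(features, n_classes=10, dropout=0.25,
                           reuse=True, is_training=False)
pred_classes = tf.argmax(logits_test, axis=1)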
def embedding(inputs, vocab_size, num_units, zero_pad=True, scale=True,
              scope="embedding", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        lookup_table = tf.get_variable(
            'lookup_table',
            dtype=tf.float32,
            shape=[vocab_size, num_units],
            initializer=tf.contrib.layers.xavier_initializer())
        if zero_pad:
            lookup_table = tf.concat(
                (tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, inputs)

        if scale:
            outputs = outputs * (num_units**0.5)
        return outputs
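A minimal usage sketch (illustrative vocabulary size, sequence length, and width):

token_ids = tf.placeholder(tf.int32, shape=(None, 10))        # [batch, time]
tok_emb = embedding(token_ids, vocab_size=10000, num_units=512,
                    zero_pad=True, scale=True, scope="tok_embed")
# tok_emb: [batch, 10, 512]; row 0 of the table stays zero for padding ids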
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()
        else:
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        # define decoder inputs: prepend the start token (id 2) and drop the last step
        self.decode_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

        de2idx, idx2de = load_de_vocab()
        en2idx, idx2en = load_en_vocab()

        with tf.variable_scope("encoder"):
            # embedding
            self.enc = embedding(self.x,
                                 vocab_size=len(de2idx),
                                 zero_pad=True,
                                 scale=True,
                                 scope="enc_embed")
            # positional embedding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x,
                                                num_units=hp.hidden_units,
                                                zero_pad=True,
                                                scale=False,
                                                scope="enc_pos")
            else:
                self.enc += embedding(
                    tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                            [tf.shape(self.x)[0], 1]),
                    vocab_size=hp.maxlen,
                    num_units=hp.hidden_units,
                    zero_pad=False,
                    scale=False,
                    scope="enc_pos")
def conv_net(x, n_classes, dropout, reuse, is_training):
    with tf.variable_scope('ConvNet', reuse=reuse):
        x = tf.reshape(x, shape=[-1, 28, 28, 1])

        x = tf.layers.conv2d(x, 64, 5, activation=tf.nn.relu)
        x = tf.layers.max_pooling2d(x, 2, 2)

        x = tf.layers.conv2d(x, 256, 3, activation=tf.nn.relu)
        x = tf.layers.conv2d(x, 512, 3, activation=tf.nn.relu)
        x = tf.layers.max_pooling2d(x, 2, 2)

        x = tf.contrib.layers.flatten(x)

        # fully connected layers
        x = tf.layers.dense(x, 2048)
        x = tf.layers.dropout(x, rate=dropout, training=is_training)
        x = tf.layers.dense(x, 1024)
        x = tf.layers.dropout(x, rate=dropout, training=is_training)

        out = tf.layers.dense(x, n_classes)
        out = tf.nn.softmax(out) if not is_training else out
    return out
def feedforward(inputs, num_units=[2048, 512], scope="forward", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # inner layer
        params = {
            "inputs": inputs,
            "filters": num_units[0],
            "kernel_size": 1,
            "activation": tf.nn.relu,
            "use_bias": True
        }
        outputs = tf.layers.conv1d(**params)

        # readout layer (takes the inner layer's output, not the raw inputs)
        params = {
            "inputs": outputs,
            "filters": num_units[1],
            "kernel_size": 1,
            "activation": None,
            "use_bias": True
        }
        outputs = tf.layers.conv1d(**params)

        # residual connection and layer normalization
        outputs += inputs
        outputs = normalize(outputs)
        return outputs
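A minimal usage sketch (illustrative shapes); note that num_units[1] must match the input width for the residual connection to be valid:

enc = tf.placeholder(tf.float32, shape=(None, 10, 512))  # [batch, time, channels]
enc_ff = feedforward(enc, num_units=[2048, 512], scope="ffn_example")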
def inference(images):
    # Build the model.
    # conv layer 1
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights', shape=[5, 5, 3, 64],
                                             stddev=1e-4, wd=0.0)
        conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
        bias = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(bias, name=scope.name)
        _activation_summary(conv1)

    # pooling layer 1
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name="pool1")
    # local response normalization 1
    norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm1')

    # conv layer 2
    with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights', shape=[5, 5, 64, 64],
                                             stddev=1e-4, wd=0.0)
        conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        bias = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(bias, name=scope.name)
        _activation_summary(conv2)

    # local response normalization 2
    norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm2')
    # pooling layer 2
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool2')

    # fully connected layer with ReLU; flatten the previous output first
    with tf.variable_scope('local3') as scope:
        dim = 1
        # flatten the shape of the previous layer's output
        for d in pool2.get_shape()[1:].as_list():
            dim *= d
        reshape = tf.reshape(pool2, [FLAGS.batch_size, dim])
        weights = _variable_with_weight_decay('weights', shape=[dim, 384],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
                            name=scope.name)
        _activation_summary(local3)

    # fully connected layer with ReLU
    with tf.variable_scope('local4') as scope:
        weights = _variable_with_weight_decay('weights', shape=[384, 192],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases,
                            name=scope.name)
        _activation_summary(local4)

    # softmax (linear) layer
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
                                              stddev=1 / 192.0, wd=0.0)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases,
                                name=scope.name)
        _activation_summary(softmax_linear)

    return softmax_linear
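A minimal usage sketch (not from the original source); FLAGS.batch_size and NUM_CLASSES are assumed to be defined elsewhere in the script, and the 24x24 crop size is illustrative:

images = tf.placeholder(tf.float32, shape=(FLAGS.batch_size, 24, 24, 3))
logits = inference(images)   # [batch_size, NUM_CLASSES] unnormalized scores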
def multihead_attention(queries, keys, num_units=None, num_heads=8,
                        dropout_rate=0, is_training=True, causality=False,
                        scope="multihead_attention", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        if num_units is None:
            num_units = queries.get_shape().as_list()[-1]

        # linear projections
        Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu)
        K = tf.layers.dense(keys, num_units, activation=tf.nn.relu)
        V = tf.layers.dense(keys, num_units, activation=tf.nn.relu)

        # split into heads and concatenate along the batch dimension
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)

        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
        outputs = outputs / (K_.get_shape().as_list()[-1]**0.5)

        # key mask
        key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))
        key_masks = tf.tile(key_masks, [num_heads, 1])
        key_masks = tf.tile(tf.expand_dims(key_masks, 1),
                            [1, tf.shape(queries)[1], 1])
        paddings = tf.ones_like(outputs) * (-2**32 + 1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)

        # mask future positions
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :])
            tril = tf.contrib.linalg.LinearOperatorTril(diag_vals).to_dense()
            masks = tf.tile(tf.expand_dims(tril, 0),
                            [tf.shape(outputs)[0], 1, 1])
            paddings = tf.ones_like(masks) * (-2**32 + 1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs)

        outputs = tf.nn.softmax(outputs)

        # query mask
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))
        query_masks = tf.tile(query_masks, [num_heads, 1])
        query_masks = tf.tile(tf.expand_dims(query_masks, -1),
                              [1, 1, tf.shape(keys)[1]])
        outputs *= query_masks

        outputs = tf.layers.dropout(outputs, rate=dropout_rate,
                                    training=tf.convert_to_tensor(is_training))

        # weighted sum over the values
        outputs = tf.matmul(outputs, V_)

        # restore shape: [h*N, T_q, C/h] -> [N, T_q, C]
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)

        # residual connection and layer normalization
        outputs += queries
        outputs = normalize(outputs)
        return outputs
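A minimal self-attention usage sketch (illustrative shapes); num_units must be divisible by num_heads for the head split:

enc = tf.placeholder(tf.float32, shape=(None, 10, 512))  # [batch, time, channels]
enc_att = multihead_attention(queries=enc, keys=enc,
                              num_units=512, num_heads=8,
                              dropout_rate=0.1, is_training=True,
                              causality=False)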