import functools

import numpy as np
import tensorflow as tf

# Repo-local helpers assumed importable from the surrounding package:
# shape_list, dropout, embed, attn, act_fns, recompute_grad, merge_leading_dims.


def language_model(*, X, M, embed_weights, hidden, config, reuse=None):
    """
    A language model output and loss for the language modelling objective described in the
    original finetune paper. This language model uses weights that are tied to the input
    embedding.

    :param X: The raw token ids fed to the featurizer.
    :param M: A loss mask, with 1's where losses should be counted and 0's elsewhere.
    :param embed_weights: The word embedding matrix, normally the one returned by the featurizer.
    :param hidden: Output of the featurizer.
    :param config: A config object.
    :param reuse: A flag passed through to the tf.variable_scope context manager.
    :return: A dict containing:
        logits: The un-normalised log-probabilities over each word in the vocabulary.
        losses: The masked language modelling loss per example.
    """
    with tf.variable_scope('model', reuse=reuse):
        # The language model ignores the last hidden state because there is no target for it.
        lm_h = tf.reshape(hidden[:, :-1], [-1, config.n_embed])  # [batch, seq_len, embed] --> [batch * seq_len, embed]
        lm_logits = tf.matmul(lm_h, embed_weights, transpose_b=True)  # tied weights
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=lm_logits,
            labels=tf.reshape(X[:, 1:, 0], [-1])  # next-token targets
        )
        lm_losses = tf.reshape(lm_losses, [shape_list(X)[0], shape_list(X)[1] - 1])
        lm_losses = tf.reduce_sum(lm_losses * M[:, 1:], 1) / tf.reduce_sum(M[:, 1:], 1)
        return {
            'logits': lm_logits,
            'losses': lm_losses,
        }
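# A minimal numpy sketch of the tied-weight objective above, for intuition only (not part
# of the library). Shapes and the next-token shift mirror language_model: logits come from
# the transposed embedding matrix, and the loss is averaged over unmasked positions.
def _tied_lm_loss_sketch():
    rng = np.random.RandomState(0)
    batch, seq_len, n_embed, vocab = 2, 5, 8, 11
    hidden = rng.randn(batch, seq_len, n_embed)
    embed_w = rng.randn(vocab, n_embed)
    tokens = rng.randint(0, vocab, size=(batch, seq_len))
    mask = np.ones((batch, seq_len))

    logits = hidden[:, :-1] @ embed_w.T                     # tied weights
    log_probs = logits - np.log(np.exp(logits).sum(-1, keepdims=True))
    targets = tokens[:, 1:]                                 # next-token shift
    nll = -np.take_along_axis(log_probs, targets[..., None], axis=-1)[..., 0]
    return (nll * mask[:, 1:]).sum(1) / mask[:, 1:].sum(1)  # per-example masked mean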
def block(x, n_head, act_fn, resid_pdrop, attn_pdrop, scope, dropout_placeholder, train=False, scale=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        # Post-layer-norm transformer block: attention and MLP sub-layers, each followed
        # by a residual connection and layer normalisation.
        a = attn(x, 'attn', nx, n_head, resid_pdrop, attn_pdrop, dropout_placeholder, train=train, scale=scale)
        n = norm(x + a, 'ln_1')
        m = mlp(n, 'mlp', nx * 4, act_fn, resid_pdrop, dropout_placeholder, train=train)
        h = norm(n + m, 'ln_2')
        return h
def featurizer(X, encoder, dropout_placeholder, config, train=False, reuse=None, max_length=None):
    """
    The transformer element of the finetuning model. Maps from token ids to a dense embedding
    of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param dropout_placeholder: A placeholder, 1 when dropout is on, 0 when it is off.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Should reuse be set within this scope.
    :param max_length: Maximum sequence length.
    :return: A dict containing:
        embed_weights: The word embedding matrix.
        features: The output of the featurizer at the classifier token (its final state).
        sequence_features: The output of the featurizer at each timestep.
    """
    max_length = max_length or config.max_length
    with tf.variable_scope('model', reuse=reuse):
        # The embedding matrix holds both word embeddings and learned positional
        # embeddings, hence vocab_size + max_length rows.
        embed_weights = tf.get_variable(
            "we",
            [encoder.vocab_size + max_length, config.n_embed],
            initializer=tf.random_normal_initializer(stddev=config.weight_stddev)
        )
        embed_weights = dropout(embed_weights, config.embed_p_drop, train, dropout_placeholder)

        X = tf.reshape(X, [-1, max_length, 2])
        h = embed(X, embed_weights)
        for layer in range(config.n_layer):
            h = block(h, config.n_heads, config.act_fn, config.resid_p_drop, config.attn_p_drop,
                      'h%d' % layer, dropout_placeholder, train=train, scale=True)

        # Use the hidden state at the classifier token as input to the final proj. + softmax.
        clf_h = tf.reshape(h, [-1, config.n_embed])  # [batch * seq_len, embed]
        clf_token = encoder['_classify_']
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32
        )
        clf_h = tf.gather(
            clf_h,
            tf.range(shape_list(X)[0], dtype=tf.int32) * max_length + pool_idx
        )
        clf_h = tf.reshape(clf_h, [-1, config.n_embed])  # [batch, embed]

        return {
            'embed_weights': embed_weights,
            'features': clf_h,
            'sequence_features': h
        }
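# A hedged sketch of how the featurizer and language_model head are typically wired
# together. `encoder` and `config` stand in for the repo's TextEncoder and settings
# objects; the placeholder names here are illustrative, not canonical.
def _lm_graph_sketch(encoder, config):
    X = tf.placeholder(tf.int32, [None, config.max_length, 2])   # token ids + position ids
    M = tf.placeholder(tf.float32, [None, config.max_length])    # loss mask
    dropout_placeholder = tf.placeholder(tf.float32, [])         # 1.0 = train, 0.0 = predict
    feats = featurizer(X, encoder, dropout_placeholder, config, train=True)
    lm = language_model(
        X=X, M=M,
        embed_weights=feats['embed_weights'],
        hidden=feats['sequence_features'],
        config=config,
    )
    return tf.reduce_mean(lm['losses'])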
def mlp(x, scope, n_state, act_fn, resid_pdrop, dropout_placeholder, train=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        act = act_fns[act_fn]
        # Position-wise feed-forward network: expand to n_state, apply the activation,
        # then project back down to the input width.
        h = act(conv1d(x, 'c_fc', n_state, 1, train=train))
        h2 = conv1d(h, 'c_proj', nx, 1, train=train)
        h2 = dropout(h2, resid_pdrop, train, dropout_placeholder)
        return h2
def featurizer(X, encoder, config, train=False, reuse=None):
    """
    The transformer element of the finetuning model. Maps from token ids to a dense embedding
    of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Should reuse be set within this scope.
    :return: A dict containing:
        embed_weights: The word embedding matrix.
        features: The output of the featurizer at the classifier token (its final state).
        sequence_features: The output of the featurizer at each timestep.
    """
    # NOTE: this variant does not thread a dropout placeholder through block()/dropout();
    # it assumes versions of those helpers without that argument.
    initial_shape = [a or -1 for a in X.get_shape().as_list()]
    X = tf.reshape(X, shape=[-1] + initial_shape[-2:])

    with tf.variable_scope('model/featurizer', reuse=reuse):
        embed_weights = tf.get_variable(
            "we",
            [encoder.vocab_size + config.max_length, config.n_embed],
            initializer=tf.random_normal_initializer(stddev=config.weight_stddev)
        )
        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        X = tf.reshape(X, [-1, config.max_length, 2])
        h = embed(X, embed_weights)
        for layer in range(config.n_layer):
            # Freeze everything below the top num_layers_trained blocks by cutting
            # gradient flow at the boundary layer.
            if (config.n_layer - layer) == config.num_layers_trained and config.num_layers_trained != config.n_layer:
                h = tf.stop_gradient(h)
                train_layer = False
            else:
                train_layer = train

            with tf.variable_scope('h%d_' % layer):
                block_fn = functools.partial(
                    block,
                    n_head=config.n_heads,
                    act_fn=config.act_fn,
                    resid_pdrop=config.resid_p_drop,
                    attn_pdrop=config.attn_p_drop,
                    scope='h%d' % layer,
                    train=train_layer,
                    scale=True
                )
                if config.low_memory_mode and train_layer:
                    # Trade compute for memory: recompute activations in the backward pass.
                    block_fn = recompute_grad(block_fn, use_entire_scope=True)
                h = block_fn(h)

        # Use the hidden state at the classifier token as input to the final proj. + softmax.
        clf_h = tf.reshape(h, [-1, config.n_embed])  # [batch * seq_len, embed]
        clf_token = encoder['_classify_']
        pool_idx = tf.cast(tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1), tf.int32)
        clf_h = tf.gather(clf_h, tf.range(shape_list(X)[0], dtype=tf.int32) * config.max_length + pool_idx)
        clf_h = tf.reshape(clf_h, shape=initial_shape[:-2] + [config.n_embed])
        seq_feats = tf.reshape(h, shape=initial_shape[:-1] + [config.n_embed])

        return {
            'embed_weights': embed_weights,
            'features': clf_h,
            'sequence_features': seq_feats
        }
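# A minimal sketch of the freezing mechanism used above (illustrative only):
# tf.stop_gradient cuts the backward pass, so anything below the cut receives no updates.
def _stop_gradient_sketch():
    x = tf.get_variable('frozen_input_demo', initializer=tf.constant(2.0))
    y = tf.stop_gradient(x) * 3.0
    grad, = tf.gradients(y, [x])
    return grad  # None: no gradient flows back through the stopped tensor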
def sequence_labeler(hidden, targets, n_outputs, dropout_placeholder, config, train=False, reuse=None, **kwargs):
    """
    An attention-based sequence labeler model.

    Takes the output of the pre-trained model and applies an additional randomly initialised
    multihead attention block, with residuals, on top. The attention is not future-masked,
    to allow the model to label sequences based on context in both directions. The
    representations fed into this model are necessarily future-masked because a language
    modelling loss is the original objective of the featurizer.

    :param hidden: The output of the featurizer. [batch_size, sequence_length, embed_dim]
    :param targets: The placeholder representing the sequence labeling targets.
        [batch_size, sequence_length]
    :param n_outputs: A python int containing the number of classes that the model should be
        learning to predict over.
    :param dropout_placeholder: A placeholder, 1 when dropout is on, 0 when it is off.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Should reuse be set within this scope.
    :param kwargs: Spare arguments.
    :return: A dict containing:
        logits: The un-normalised log probabilities of each class being in each location. For
            usable predictions, sampling from this distribution is not sufficient and a Viterbi
            decoding method should be used.
        losses: The negative log likelihood for the sequence targets.
        predict_params: A dictionary of params to be fed to the Viterbi decode function.
    """
    with tf.variable_scope('model/clf', reuse=reuse):
        nx = shape_list(hidden)[-1]
        a = attn(hidden, 'seq_label_attn', nx, config.seq_num_heads, config.seq_dropout,
                 config.seq_dropout, dropout_placeholder, train=train, scale=False, mask=False)
        n = norm(hidden + a, 'seq_label_residual')
        flat_logits = tf.layers.dense(n, n_outputs)
        logits = tf.reshape(flat_logits, tf.concat([tf.shape(hidden)[:2], [n_outputs]], 0))
        # TODO (BEN): ADD: correct way to find lengths. - Same method in decoding. Cheating for now.
        with tf.device(None):
            log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
                logits,
                targets,
                # sequence_lengths must be an integer vector
                kwargs.get('max_length') * tf.ones(tf.shape(targets)[0], dtype=tf.int32)
            )
        return {
            'logits': logits,
            'losses': -log_likelihood,
            'predict_params': {
                'transition_matrix': transition_params
            }
        }
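# A hedged sketch of consuming sequence_labeler's outputs at predict time: per-example
# Viterbi decoding with the learned CRF transition matrix. Assumes `logits_np` and
# `transition_np` have already been fetched from a session run; the helper name is
# illustrative, not part of the library.
def _viterbi_decode_sketch(logits_np, transition_np):
    predictions = []
    for example_logits in logits_np:  # [sequence_length, n_outputs]
        viterbi_seq, _ = tf.contrib.crf.viterbi_decode(example_logits, transition_np)
        predictions.append(viterbi_seq)
    return predictions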
def norm(x, scope, axis=[-1], e=1e-5):
    """Layer normalisation with learned gain g and bias b."""
    with tf.variable_scope(scope):
        n_state = shape_list(x)[-1]
        g = tf.get_variable("g", [n_state], initializer=tf.constant_initializer(1))
        b = tf.get_variable("b", [n_state], initializer=tf.constant_initializer(0))
        u = tf.reduce_mean(x, axis=axis, keepdims=True)                 # mean
        s = tf.reduce_mean(tf.square(x - u), axis=axis, keepdims=True)  # variance
        x = (x - u) * tf.rsqrt(s + e)                                   # normalise
        x = x * g + b                                                   # scale and shift
        return x
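# A small numpy check of the layer-norm arithmetic above (illustrative only): after
# normalisation each row has approximately zero mean and unit variance, before g and b.
def _layer_norm_sketch():
    x = np.array([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]])
    u = x.mean(-1, keepdims=True)
    s = ((x - u) ** 2).mean(-1, keepdims=True)
    normed = (x - u) / np.sqrt(s + 1e-5)
    assert np.allclose(normed.mean(-1), 0.0, atol=1e-6)
    assert np.allclose(normed.var(-1), 1.0, atol=1e-3)
    return normed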
def conv1d(x, scope, nf, rf, w_init=tf.random_normal_initializer(stddev=0.02),
           b_init=tf.constant_initializer(0), pad='VALID', train=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        w = tf.get_variable("w", [rf, nx, nf], initializer=w_init)
        b = tf.get_variable("b", [nf], initializer=b_init)
        if rf == 1:
            # faster 1x1 conv: equivalent to a position-wise matmul
            c = tf.reshape(
                tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) + b,
                shape_list(x)[:-1] + [nf]
            )
        else:
            # was used to train LM
            c = tf.nn.conv1d(x, w, stride=1, padding=pad) + b
        return c
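# The rf == 1 fast path above is just a per-position matmul; this numpy sketch shows
# the equivalence on a toy input (illustrative only).
def _one_by_one_conv_sketch():
    rng = np.random.RandomState(0)
    x = rng.randn(2, 5, 3)            # [batch, seq, nx]
    w = rng.randn(1, 3, 4)            # [rf, nx, nf]
    b = np.zeros(4)
    fast = x.reshape(-1, 3) @ w.reshape(3, 4) + b
    return fast.reshape(2, 5, 4)      # matches tf.nn.conv1d(x, w, 1, 'VALID') + b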
def _attn(q, k, v, attn_pdrop, train=False, scale=False, mask=True):
    # Expects k pre-transposed relative to q, so that q @ k gives the raw attention scores.
    w = tf.matmul(q, k)
    if scale:
        # scaled dot-product attention
        n_state = shape_list(v)[-1]
        w = w * tf.rsqrt(tf.cast(n_state, tf.float32))
    if mask:
        w = mask_attn_weights(w)
    w = tf.nn.softmax(w)
    w = dropout(w, attn_pdrop, train)
    a = tf.matmul(w, v)
    return a
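# A numpy sketch of the unmasked, scaled path through _attn, for shape intuition
# (illustrative only). q and v are [seq, head_dim]; k is pre-transposed to
# [head_dim, seq] as _attn expects.
def _scaled_attn_sketch():
    rng = np.random.RandomState(0)
    seq, head_dim = 4, 8
    q, v = rng.randn(seq, head_dim), rng.randn(seq, head_dim)
    k = rng.randn(head_dim, seq)
    w = (q @ k) / np.sqrt(head_dim)          # scaled dot-product scores
    w = np.exp(w - w.max(-1, keepdims=True))
    w = w / w.sum(-1, keepdims=True)         # softmax over keys
    return w @ v                             # [seq, head_dim]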
def perceptron(x, ny, config, w_init=None, b_init=None):
    """
    A very standard linear perceptron model.

    :param x: Input tensor.
    :param ny: Number of outputs.
    :param config: A config object.
    :param w_init: Weight initializer.
    :param b_init: Bias initializer.
    :return: The output of the perceptron model.
    """
    w_init = w_init or tf.random_normal_initializer(stddev=config.weight_stddev)
    b_init = b_init or tf.constant_initializer(0)
    with tf.variable_scope('clf'):
        nx = shape_list(x)[-1]
        w = tf.get_variable("w", [nx, ny], initializer=w_init)
        b = tf.get_variable("b", [ny], initializer=b_init)
        return tf.matmul(x, w) + b
def multi_choice_question(hidden, targets, n_targets, dropout_placeholder, config, train=False, reuse=None, **kwargs):
    with tf.variable_scope("model", reuse=reuse):
        initial_shape = shape_list(hidden)
        hidden = dropout(hidden, config.clf_p_drop, train, dropout_placeholder)

        # Score every answer option with a shared linear layer, then reshape the flat
        # outputs back to one logit per option.
        clf_out = perceptron(merge_leading_dims(hidden, 2), n_targets, config)
        clf_logits = tf.reshape(clf_out, shape=initial_shape[:1] + [n_targets])
        clf_losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=clf_logits,
            labels=tf.stop_gradient(targets)
        )
        return {
            'logits': clf_logits,
            'losses': clf_losses
        }
def merge_states(x):
    # Collapse the last two dimensions into one: [..., a, b] -> [..., a * b].
    x_shape = shape_list(x)
    new_x_shape = x_shape[:-2] + [np.prod(x_shape[-2:])]
    return tf.reshape(x, new_x_shape)
def split_states(x, n):
    # Split the last dimension into n heads: [..., m] -> [..., n, m // n].
    x_shape = shape_list(x)
    m = x_shape[-1]
    new_x_shape = x_shape[:-1] + [n, m // n]
    return tf.reshape(x, new_x_shape)
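# A quick shape round-trip for the two helpers above (illustrative only): split_states
# then merge_states restores the original shape, which is how multi-head attention
# fans out into heads and back.
def _split_merge_sketch():
    x = tf.zeros([2, 5, 12])        # [batch, seq, embed]
    heads = split_states(x, 4)      # [2, 5, 4, 3]
    restored = merge_states(heads)  # [2, 5, 12]
    return heads.get_shape().as_list(), restored.get_shape().as_list()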
def mask_attn_weights(w):
    # Causal mask: a lower-triangular matrix of ones restricts each position's view to
    # itself and earlier positions; masked logits are pushed to -1e9 so they vanish
    # under softmax.
    n = shape_list(w)[-1]
    b = tf.matrix_band_part(tf.ones([n, n]), -1, 0)
    b = tf.reshape(b, [1, 1, n, n])
    w = w * b + -1e9 * (1 - b)
    return w
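# The numpy equivalent of the band-part mask above for n = 4 (illustrative only).
# Row i attends to columns 0..i; everything above the diagonal is masked out.
def _causal_mask_sketch(n=4):
    b = np.tril(np.ones((n, n)))  # same as tf.matrix_band_part(ones, -1, 0)
    # [[1, 0, 0, 0],
    #  [1, 1, 0, 0],
    #  [1, 1, 1, 0],
    #  [1, 1, 1, 1]]
    scores = np.zeros((n, n))
    return scores * b + -1e9 * (1 - b)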