def model(X, M, Y, train=False, reuse=False):
    with tf.variable_scope('model', reuse=reuse):
        we = tf.get_variable("we", [n_vocab + n_special + n_ctx, n_embd],
                             initializer=tf.random_normal_initializer(stddev=0.02))
        we = dropout(we, embd_pdrop, train)

        X = tf.reshape(X, [-1, n_ctx, 2])
        M = tf.reshape(M, [-1, n_ctx])

        h = embed(X, we)
        for layer in range(n_layer):
            h = block(h, 'h%d' % layer, train=train, scale=True)

        # Language-modeling head: hidden state at position t predicts token t+1.
        lm_h = tf.reshape(h[:, :-1], [-1, n_embd])
        lm_logits = tf.matmul(lm_h, we, transpose_b=True)
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=lm_logits, labels=tf.reshape(X[:, 1:, 0], [-1]))
        lm_losses = tf.reshape(lm_losses, [shape_list(X)[0], shape_list(X)[1] - 1])
        lm_losses = tf.reduce_sum(lm_losses * M[:, 1:], 1) / tf.reduce_sum(M[:, 1:], 1)

        # Classification head: pool the hidden state at the clf_token position.
        clf_h = tf.reshape(h, [-1, n_embd])
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32)
        clf_h = tf.gather(
            clf_h, tf.range(shape_list(X)[0], dtype=tf.int32) * n_ctx + pool_idx)
        clf_h = tf.reshape(clf_h, [-1, 2, n_embd])
        if train and clf_pdrop > 0:
            shape = shape_list(clf_h)
            shape[1] = 1
            clf_h = tf.nn.dropout(clf_h, 1 - clf_pdrop, shape)
        clf_h = tf.reshape(clf_h, [-1, n_embd])
        clf_logits = clf(clf_h, 1, train=train)
        clf_logits = tf.reshape(clf_logits, [-1, 2])

        clf_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=clf_logits, labels=Y)
        return clf_logits, clf_losses, lm_losses

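# The tf.gather above indexes the flattened [batch*n_ctx, n_embd] activations
# with batch_index*n_ctx + pool_idx to pull out one hidden state per sequence.
# A minimal NumPy sketch of that indexing trick (shapes here are illustrative,
# not taken from the model above):
import numpy as np

batch, n_ctx, n_embd = 4, 5, 3
h = np.arange(batch * n_ctx * n_embd, dtype=np.float32).reshape(batch, n_ctx, n_embd)
pool_idx = np.array([2, 0, 4, 1])              # position of clf_token per sequence

flat = h.reshape(-1, n_embd)                   # [batch*n_ctx, n_embd]
pooled = flat[np.arange(batch) * n_ctx + pool_idx]
assert np.array_equal(pooled, h[np.arange(batch), pool_idx])
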
def conv1d(x, scope, nf, rf,
           w_init=tf.random_normal_initializer(stddev=0.02),
           b_init=tf.constant_initializer(0),
           pad='VALID', train=False):
    """
    For attention, x is the embedded input [? (len of input seq for the current
    /gpu:X), 77, 768]; nf is n_state * 3, where n_state is the last element of
    the x shape list (768 for BPE), one slab each for q, k, v; rf is 1.
    For the MLP, nf is (last element of the x shape list) * 4 and rf is still 1.

    w: [rf=1, nx (last element of x), nf]  # 768*3 for q, k, v
    b: [nf]

    If rf == 1: reshape x and w to 2-D tensors, take the matrix product, add
    the bias, and reshape back to the output format: the x shape list minus
    its last value (i.e. [?, 77]) concatenated with [nf].
    """
    with tf.variable_scope(scope):
        nx = utils.shape_list(x)[-1]  # last value in the x shape list
        w = tf.get_variable("w", [rf, nx, nf], initializer=w_init)
        b = tf.get_variable("b", [nf], initializer=b_init)
        if rf == 1:  # faster 1x1 conv
            c = tf.reshape(
                tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) + b,
                utils.shape_list(x)[:-1] + [nf])
        else:  # was used to train LM
            c = tf.nn.conv1d(x, w, stride=1, padding=pad) + b
        return c

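# A 1x1 convolution over the time axis is just a per-position linear map, so
# it can be computed as one big matmul on the flattened input. A minimal NumPy
# sketch of the equivalence the rf == 1 branch relies on (all shapes
# illustrative):
import numpy as np

batch, seq, nx, nf = 2, 7, 4, 12
x = np.random.randn(batch, seq, nx).astype(np.float32)
w = np.random.randn(1, nx, nf).astype(np.float32)   # [rf=1, nx, nf]
b = np.zeros(nf, dtype=np.float32)

# "faster 1x1 conv": flatten, matmul, restore the leading dims
fast = (x.reshape(-1, nx) @ w.reshape(nx, nf) + b).reshape(batch, seq, nf)

# reference: apply the same [nx, nf] map independently at every position
ref = np.einsum('bsn,nf->bsf', x, w[0]) + b
assert np.allclose(fast, ref, atol=1e-5)
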
def model(X, M, train=False, reuse=False):
    with tf.variable_scope('model', reuse=reuse):
        we = tf.get_variable(
            "we", [N_VOCAB + N_CTX, N_EMBD],
            initializer=tf.random_normal_initializer(stddev=0.02))
        we = dropout(we, EMBD_PDROP, train)

        X = tf.reshape(X, [-1, N_CTX, 2])
        M = tf.reshape(M, [-1, N_CTX])

        h = embed(X, we)
        for layer in range(N_LAYER):
            h = block(h, 'h%d' % layer, train=train, scale=True)

        lm_h = tf.reshape(h, [-1, N_EMBD])
        lm_logits = tf.reshape(
            tf.matmul(lm_h, we[:N_VOCAB, :], transpose_b=True),
            [-1, N_CTX, N_VOCAB])
        lm_logits_truncated = tf.reshape(lm_logits[:, :-1], [-1, N_VOCAB])
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=lm_logits_truncated, labels=tf.reshape(X[:, 1:, 0], [-1]))
        lm_losses = tf.reshape(
            lm_losses, [shape_list(X)[0], shape_list(X)[1] - 1])
        lm_losses = tf.reduce_sum(lm_losses * M[:, 1:], 1) / tf.reduce_sum(
            M[:, 1:], 1)
        return lm_logits, lm_losses

def blocksparse_attention_impl(q, k, v, heads, attn_mode, local_attn_ctx=None,
                               blocksize=32, num_verts=None, vertsize=None):
    n_ctx = shape_list(q)[1]
    if attn_mode == 'strided':
        # Strided attention is implemented on the transposed matrix to
        # provide greater block sparsity
        q = strided_transpose(q, n_ctx, local_attn_ctx, blocksize)
        k = strided_transpose(k, n_ctx, local_attn_ctx, blocksize)
        v = strided_transpose(v, n_ctx, local_attn_ctx, blocksize)
    n_state = shape_list(q)[-1] // heads
    bst = get_blocksparse_obj(n_ctx, heads, attn_mode, blocksize,
                              local_attn_ctx, num_verts, vertsize)
    scale_amount = tf.cast(1.0 / np.sqrt(n_state), tf.float32)
    w = bst.query_key_op(q, k)
    w = bst.masked_softmax(w, scale=scale_amount)
    a = bst.weight_value_op(w, v)
    if attn_mode == 'strided':
        # Undo the strided transpose so the output is in the original order.
        n, t, embd = shape_list(a)
        bT_ctx = n_ctx // local_attn_ctx
        a = tf.reshape(a, [n, local_attn_ctx, bT_ctx, embd])
        a = tf.transpose(a, [0, 2, 1, 3])
        a = tf.reshape(a, [n, t, embd])
    return a

def model(self, X, M, train=False, reuse=False, num_ps=1):
    with tf.variable_scope(
            'model_lm', reuse=reuse,
            partitioner=tf.fixed_size_partitioner(num_shards=16)):
        we = tf.get_variable(
            "we", [self.n_vocab + self.n_special + n_ctx, n_embd],
            initializer=tf.random_normal_initializer(stddev=0.02))
        we = dropout(we, embd_pdrop, train)

        X = tf.reshape(X, [-1, n_ctx, 2])
        M = tf.reshape(M, [-1, n_ctx])

        h = embed(X, we)
        for layer in range(n_layer):
            h = block(h, 'h%d' % layer, train=train, scale=True)

        lm_h = tf.reshape(h[:, :-1], [-1, n_embd])
        lm_logits = tf.matmul(lm_h, we, transpose_b=True)
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=lm_logits, labels=tf.reshape(X[:, 1:, 0], [-1]))
        lm_losses = tf.reshape(
            lm_losses, [shape_list(X)[0], shape_list(X)[1] - 1])
        lm_losses = tf.reduce_sum(lm_losses * M[:, 1:], 1) / tf.reduce_sum(
            M[:, 1:], 1)
        return lm_losses

def model(X, M, Y, train=False, reuse=False):
    ## X: [8, 2, 77, 2], M: [8, 2, 77], Y: [8]
    with tf.variable_scope('model', reuse=reuse):
        ## n_vocab: 40478, n_special: 3, n_ctx: 77, n_embd: 768
        we = tf.get_variable(
            "we", [n_vocab + n_special + n_ctx, n_embd],
            initializer=tf.random_normal_initializer(stddev=0.02))  ## we: [40558, 768]
        we = dropout(we, embd_pdrop, train)

        X = tf.reshape(X, [-1, n_ctx, 2])  ## X: [16, 77, 2]
        M = tf.reshape(M, [-1, n_ctx])     ## M: [16, 77]

        ## h: [-1, n_ctx, n_embd]; h0 = U·We + Wp (the Wp term is not visible here)
        h = embed(X, we)
        for layer in range(n_layer):  ## n_layer: 12
            h = block(h, 'h%d' % layer, train=train, scale=True)

        lm_h = tf.reshape(h[:, :-1], [-1, n_embd])         ## lm_h: [1216, 768]
        lm_logits = tf.matmul(lm_h, we, transpose_b=True)  ## lm_logits: [1216, 40558]
        ## lm_losses: [1216]; P(u) = softmax(hn · We^T)
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=lm_logits, labels=tf.reshape(X[:, 1:, 0], [-1]))
        lm_losses = tf.reshape(
            lm_losses, [shape_list(X)[0], shape_list(X)[1] - 1])  ## lm_losses: [16, 76]
        lm_losses = tf.reduce_sum(lm_losses * M[:, 1:], 1) / tf.reduce_sum(
            M[:, 1:], 1)  ## lm_losses: [16]

        clf_h = tf.reshape(h, [-1, n_embd])  ## clf_h: [1232, 768]
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32)  ## pool_idx: [16], position of clf_token in each sequence
        ## clf_h: [16, 768]; see https://www.tensorflow.org/api_docs/python/tf/gather
        ## and https://www.tensorflow.org/api_docs/python/tf/range
        clf_h = tf.gather(
            clf_h,
            tf.range(shape_list(X)[0], dtype=tf.int32) * n_ctx + pool_idx)
        clf_h = tf.reshape(clf_h, [-1, 2, n_embd])  ## clf_h: [8, 2, 768]
        if train and clf_pdrop > 0:
            shape = shape_list(clf_h)
            shape[1] = 1
            clf_h = tf.nn.dropout(clf_h, 1 - clf_pdrop, shape)
        clf_h = tf.reshape(clf_h, [-1, n_embd])  ## clf_h: [16, 768]
        clf_logits = clf(clf_h, 1, train=train)  ## clf_logits: [16, 1]; computes hm · Wy^T
        clf_logits = tf.reshape(clf_logits, [-1, 2])  ## clf_logits: [8, 2]
        ## P(y) = softmax(hm · Wy^T)
        clf_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=clf_logits, labels=Y)
        return clf_logits, clf_losses, lm_losses

def conv1d(x, scope, nf, rf,
           w_init=tf.random_normal_initializer(stddev=0.02),
           b_init=tf.constant_initializer(0), pad='VALID', train=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        w = tf.get_variable("w", [rf, nx, nf], initializer=w_init)
        b = tf.get_variable("b", [nf], initializer=b_init)
        if rf == 1:  # faster 1x1 conv
            c = tf.reshape(
                tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) + b,
                shape_list(x)[:-1] + [nf])
        else:  # was used to train LM
            c = tf.nn.conv1d(x, w, stride=1, padding=pad) + b
        return c

def model_pw(X, M, Y, train=False, reuse=False, ordinal=False):
    """
    X: [batch, n_ctx, 2]
    M: [batch, n_ctx]
    Y: [batch]
    """
    with tf.variable_scope('model', reuse=reuse):
        we = tf.get_variable(
            "we", [n_vocab + n_special + n_ctx, n_embd],
            initializer=tf.random_normal_initializer(stddev=0.02))
        we = dropout(we, embd_pdrop, train)

        # transformer blocks
        h = embed(X, we)
        for layer in range(n_layer):
            h = block(h, 'h%d' % layer, train=train, scale=True)

        # language modeling objective
        lm_h = tf.reshape(h[:, :-1], [-1, n_embd])
        lm_logits = tf.matmul(lm_h, we, transpose_b=True)
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=lm_logits, labels=tf.reshape(X[:, 1:, 0], [-1]))
        lm_losses = tf.reshape(
            lm_losses, [shape_list(X)[0], shape_list(X)[1] - 1])
        lm_losses = tf.reduce_sum(lm_losses * M[:, 1:], 1) / tf.reduce_sum(
            M[:, 1:], 1)

        clf_h = tf.reshape(h, [-1, n_embd])
        # index of the clf_token in each example
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32)
        # take the transformer state at the clf_token position at the end of the input
        clf_h = tf.gather(
            clf_h,
            tf.range(shape_list(X)[0], dtype=tf.int32) * n_ctx + pool_idx)
        # reshape to [batch, 1, embed size]
        clf_h = tf.reshape(clf_h, [-1, 1, n_embd])
        if train and clf_pdrop > 0:
            shape = shape_list(clf_h)
            shape[1] = 1
            clf_h = tf.nn.dropout(clf_h, 1 - clf_pdrop, shape)
        # put tensor back into [batch, embed size] shape
        clf_h = tf.reshape(clf_h, [-1, n_embd])
        # linear layer
        clf_logits = clf_pw(clf_h, train=train, ordinal=ordinal)
        # final softmax
        clf_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=clf_logits, labels=Y)
        return clf_logits, clf_losses, lm_losses

def model(X, M, Y, Is_train, data_params):
    n_vocab = data_params['n_vocab']
    n_special = data_params['n_special']
    max_word = data_params['max_word']
    clf_token = data_params['clf_token']
    clf_pdrop = 0.1
    with tf.variable_scope('transformer', reuse=False):
        we = tf.get_variable(
            "we", [n_vocab + n_special + max_word, 768],
            initializer=tf.random_normal_initializer(stddev=0.02))
        we = dropout(we, 0.1, True)

        X = tf.reshape(X, [-1, max_word, 2])  # (batch * 1 sent, 161, 2) == (8, 161, 2)
        M = tf.reshape(M, [-1, max_word])

        h = embed(X, we)  # (8, 161, 768)
        for layer in range(12):
            h = block(h, 'h%d' % layer, train=True, scale=True)

        lm_h = tf.reshape(h[:, :-1], [-1, 768])
        lm_logits = tf.matmul(lm_h, we, transpose_b=True)
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=lm_logits, labels=tf.reshape(X[:, 1:, 0], [-1]))
        lm_losses = tf.reshape(
            lm_losses, [shape_list(X)[0], shape_list(X)[1] - 1])
        lm_losses = tf.reduce_sum(lm_losses * M[:, 1:], 1) / tf.reduce_sum(
            M[:, 1:], 1)

        clf_h = tf.reshape(h, [-1, 768])  # h: (8, 161, 768), clf_h: (1288, 768)
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32)  # position of the last token, i.e. clf_token
        clf_h = tf.gather(
            clf_h,
            tf.range(shape_list(X)[0], dtype=tf.int32) * max_word + pool_idx)  # (8, 768)
        clf_h = tf.reshape(clf_h, [-1, 1, 768])  # (8, 1, 768)
        if clf_pdrop > 0:
            shape = shape_list(clf_h)
            shape[1] = 1
            clf_h = tf.nn.dropout(clf_h, 1 - clf_pdrop, shape)
        clf_h = tf.reshape(clf_h, [-1, 768])  # (8, 768)
        clf_logits = clf(clf_h, 2, train=True)  # 1 sent --> 2 classes
        clf_logits = tf.reshape(clf_logits, [-1, 2])
        clf_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=clf_logits, labels=Y)
        return clf_logits, tf.reduce_mean(clf_losses), tf.reduce_mean(lm_losses)

def conv1d(x, scope, nf,
           w_init=tf.random_normal_initializer(stddev=0.02),
           b_init=tf.constant_initializer(0), pad='VALID', train=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        w = tf.get_variable("w", [nx, nf], initializer=w_init)
        b = tf.get_variable("b", [nf], initializer=b_init)
        c = tf.reshape(
            tf.matmul(tf.reshape(x, [-1, nx]), w) + b,
            shape_list(x)[:-1] + [nf])
        return c

def norm(x, scope, axis=[-1]):
    with tf.variable_scope(scope):
        n_state = shape_list(x)[-1]
        g = tf.get_variable("g", [n_state], initializer=tf.constant_initializer(1))
        b = tf.get_variable("b", [n_state], initializer=tf.constant_initializer(0))
        g, b = get_ema_vars(g, b)
        return _norm(x, g, b, axis=axis)

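# _norm is not shown in this snippet; for layer normalization it presumably
# standardizes x over the given axis and then applies the learned gain g and
# bias b. A minimal NumPy sketch under that assumption (the epsilon is
# illustrative):
import numpy as np

def _norm_sketch(x, g, b, axis=-1, e=1e-5):
    u = x.mean(axis=axis, keepdims=True)                # mean per position
    s = ((x - u) ** 2).mean(axis=axis, keepdims=True)   # variance per position
    return (x - u) / np.sqrt(s + e) * g + b             # standardize, then scale/shift

x = np.random.randn(2, 5, 8)
y = _norm_sketch(x, g=np.ones(8), b=np.zeros(8))
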
def mask_attn_weights(self, w):
    # w has shape [batch, heads, dst_sequence, src_sequence],
    # where information flows from src to dst.
    _, _, nd, ns = utils.shape_list(w)
    b = utils.attention_mask(nd, ns, dtype=w.dtype)
    b = torch.reshape(b, [1, 1, nd, ns])
    w = w * b - torch.Tensor([1e10]).to(w.dtype) * (1 - b)
    return w

def conv1d(x, scope, nf, *, w_init_stdev=0.02):
    with tf.variable_scope(scope):
        *start, nx = shape_list(x)
        w = tf.get_variable('w', [1, nx, nf],
                            initializer=tf.random_normal_initializer(stddev=w_init_stdev))
        b = tf.get_variable('b', [nf], initializer=tf.constant_initializer(0))
        c = tf.reshape(
            tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) + b,
            start + [nf])
        return c

def mask_attn_weights(w):
    # w has shape [batch, heads, dst_sequence, src_sequence],
    # where information flows from src to dst.
    _, _, nd, ns = shape_list(w)
    b = attention_mask(nd, ns, dtype=w.dtype)
    b = tf.reshape(b, [1, 1, nd, ns])
    w = w * b - tf.cast(1e10, w.dtype) * (1 - b)
    return w

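# attention_mask(nd, ns) in GPT-2-style code builds a causal mask aligned to
# the last ns key positions: query i may attend to key j when j <= i + ns - nd.
# A minimal NumPy sketch of the mask and the additive -1e10 trick above:
import numpy as np

def attention_mask_sketch(nd, ns):
    i = np.arange(nd)[:, None]
    j = np.arange(ns)[None, :]
    return (j <= i + ns - nd).astype(np.float32)

b = attention_mask_sketch(3, 3)        # lower-triangular [3, 3]
w = np.zeros((3, 3), dtype=np.float32)
masked = w * b - 1e10 * (1 - b)        # future positions get ~-1e10,
                                       # so softmax assigns them ~0 probability
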
def dec_block(self, x, scope, train=False, scale=False, encoder_output=None):
    # scope: h%d (initialized with the loaded params_d.npy values)
    with tf.variable_scope(scope):
        nx = utils.shape_list(x)[-1]
        a = self.attn(x, 'attn', nx, self.params.n_head, train=train, scale=scale)
        n = norm(x + a, 'ln_1')
        if encoder_output is not None:
            # encoder-decoder attention performed
            a = self.attn(n, 'attn_enc_dec', nx, self.params.n_head,
                          train=train, scale=scale,
                          encoder_output=encoder_output, use_mask_attn=False)
            n = norm(n + a, 'ln_enc_dec')
        m = self.mlp(n, 'mlp', nx * 4, train=train)
        h = norm(n + m, 'ln_2')
        return h

def attention_impl(q, k, v, heads, attn_mode, local_attn_ctx=None):
    q = split_heads(q, heads)
    k = split_heads(k, heads)
    v = split_heads(v, heads)
    n_timesteps = shape_list(k)[2]
    mask = tf.to_float(get_attn_mask(n_timesteps, attn_mode, local_attn_ctx))
    w = tf.matmul(q, k, transpose_b=True)
    scale_amount = 1.0 / np.sqrt(shape_list(q)[-1])
    # Compute the softmax in fp32 for numerical stability, then cast back to fp16.
    w = tf.cast(w, tf.float32)
    w = w * scale_amount
    w = w * mask + -1e9 * (1 - mask)
    w = tf.nn.softmax(w)
    w = tf.cast(w, tf.float16)
    a = tf.matmul(w, v)
    a = merge_heads(a)
    return a

def merge_states(x):
    """ reshape (batch, pixel, head, head_state) -> (batch, pixel, state) """
    x_shape = shape_list(x)
    new_x_shape = x_shape[:-2] + [np.prod(x_shape[-2:])]
    return tf.reshape(x, new_x_shape)

def call(self, inputs, **kwargs):
    if self.rf == 1:
        c = tf.reshape(
            tf.matmul(tf.reshape(inputs, [-1, self.nx]),
                      tf.reshape(self.w, [-1, self.nf])) + self.b,
            shape_list(inputs)[:-1] + [self.nf])
    else:
        c = tf.nn.conv1d(value=inputs, filters=self.w, stride=1, padding='VALID') + self.b
    return c

def forward(self, X, past):
    results = {}
    batch, sequence = utils.shape_list(X)

    # torch.autograd.Variable takes no name argument; the TF-style names
    # ('wpe', 'wte') are kept as comments.
    wpe = Variable(torch.randn([self.n_ctx, self.n_embd]))    # 'wpe'
    wte = Variable(torch.randn([self.n_vocab, self.n_embd]))  # 'wte'
    past_length = 0 if past is None else past.shape[-2]
    h = wte[X] + wpe[self.positions_for(X, past_length)]

    # Transformer
    presents = []
    pasts = torch.unbind(past, dim=1) if past is not None else [None] * self.n_layer
    assert len(pasts) == self.n_layer
    for layer, past in enumerate(pasts):
        h, present = self.block(h, past=past)
        presents.append(present)
    results['present'] = torch.stack(presents, dim=1)
    h = self.norm(h)

    # Language model loss. Do tokens <n predict token n?
    h_flat = torch.reshape(h, [batch * sequence, self.n_embd])
    logits = torch.matmul(h_flat, wte.t())
    logits = torch.reshape(logits, [batch, sequence, self.n_vocab])
    results['logits'] = logits
    return results

def block(x, scope, train=False, scale=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        a = attn(x, 'attn', nx, n_head, train=train, scale=scale)
        n = norm(x + a, 'ln_1')
        m = mlp(n, 'mlp', nx * 4, train=train)
        h = norm(n + m, 'ln_2')
        return h

def mlp(x, scope, n_state, train=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        act = act_fns[afn]
        h = act(conv1d(x, 'c_fc', n_state, 1, train=train))
        h2 = conv1d(h, 'c_proj', nx, 1, train=train)
        h2 = dropout(h2, resid_pdrop, train)
        return h2

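# act_fns[afn] selects the activation; in the original GPT code afn is
# typically 'gelu'. A minimal NumPy sketch of the tanh-approximate GELU that
# implementation uses:
import numpy as np

def gelu(x):
    # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))
    return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))

print(gelu(np.array([-1.0, 0.0, 1.0])))  # ~[-0.159, 0.0, 0.841]
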
def mask_attn_weights(w):
    # see https://www.tensorflow.org/api_docs/python/tf/linalg/band_part
    n = shape_list(w)[-1]
    b = tf.matrix_band_part(tf.ones([n, n]), -1, 0)  # lower-triangular causal mask
    b = tf.reshape(b, [1, 1, n, n])
    w = w * b + -1e9 * (1 - b)
    return w

def block(x, scope, train=False, scale=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        a = attn(x, "attn", nx, n_head, train=train, scale=scale)
        n = norm(x + a, "ln_1")
        m = mlp(n, "mlp", nx * 4, train=train)
        h = norm(n + m, "ln_2")
        return h

def split_states(x, n):
    """ reshape (batch, pixel, state) -> (batch, pixel, head, head_state) """
    x_shape = shape_list(x)
    m = x_shape[-1]
    new_x_shape = x_shape[:-1] + [n, m // n]
    return tf.reshape(x, new_x_shape)

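# split_states and merge_states (defined earlier) are exact inverses: moving
# between [batch, pixel, state] and [batch, pixel, head, head_state] is a pure
# reshape that preserves memory order. A minimal NumPy sketch:
import numpy as np

batch, pixel, state, n_head = 2, 6, 8, 4
x = np.random.randn(batch, pixel, state)

split = x.reshape(batch, pixel, n_head, state // n_head)  # split_states(x, n_head)
merged = split.reshape(batch, pixel, state)               # merge_states(split)
assert np.array_equal(x, merged)
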
def strided_transpose(x, n_ctx, local_attn_ctx, blocksize):
    bT_ctx = n_ctx // local_attn_ctx
    assert bT_ctx % blocksize == 0, f'{bT_ctx}, {blocksize}'
    n, t, embd = shape_list(x)
    x = tf.reshape(x, [n, bT_ctx, local_attn_ctx, embd])
    x = tf.transpose(x, [0, 2, 1, 3])
    x = tf.reshape(x, [n, t, embd])
    return x

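# strided_transpose regroups a length-n_ctx sequence so that positions sharing
# the same residue mod local_attn_ctx become contiguous. A minimal NumPy sketch
# with toy sizes (n_ctx=8, local_attn_ctx=2, so bT_ctx=4; the blocksize
# divisibility assert is ignored here):
import numpy as np

n_ctx, local_attn_ctx = 8, 2
bT_ctx = n_ctx // local_attn_ctx
x = np.arange(n_ctx).reshape(1, n_ctx, 1)    # [n=1, t=8, embd=1]

y = x.reshape(1, bT_ctx, local_attn_ctx, 1)  # group into strides
y = y.transpose(0, 2, 1, 3)                  # bring the stride axis forward
y = y.reshape(1, n_ctx, 1)
print(y.ravel())                             # [0 2 4 6 1 3 5 7]
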
def multinomial_squeeze(self, logits, temperature=1.0):
    """multinomial sampling from logits."""
    logits_shape = utils.shape_list(logits)
    reshaped_logits = tf.reshape(logits, [-1, logits_shape[-1]]) / temperature
    choices = tf.multinomial(reshaped_logits, 1)
    choices = tf.reshape(choices, logits_shape[:-1])
    return tf.to_int32(choices)

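# tf.multinomial draws one sample per row from the softmax of the
# temperature-scaled logits. A minimal NumPy sketch of the same step for a
# single row (names here are illustrative):
import numpy as np

def sample_token(logits, temperature=1.0, seed=0):
    z = logits / temperature                        # <1 sharpens, >1 flattens
    p = np.exp(z - z.max())
    p = p / p.sum()                                 # softmax
    return np.random.default_rng(seed).choice(len(p), p=p)

print(sample_token(np.array([2.0, 1.0, 0.1]), temperature=0.7))
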
def mlp(self, x, scope, n_state, train=False):
    with tf.variable_scope(scope):
        nx = utils.shape_list(x)[-1]    # nx: 768 for BPE
        act = ACT_FNS[self.params.afn]  # gelu
        h = act(conv1d(x, 'c_fc', n_state, 1, train=train))
        h2 = conv1d(h, 'c_proj', nx, 1, train=train)
        h2 = dropout(h2, self.params.resid_pdrop, train)
        return h2

def _attn(self, q, k, v):
    w = tf.matmul(q, k)
    if self.scale:
        n_state = shape_list(v)[-1]
        w = w * tf.rsqrt(tf.cast(n_state, tf.float32))
    w = self.mask_attn_weights(w)
    w = tf.nn.softmax(w)
    a = tf.matmul(w, v)
    return a

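# Here tf.matmul(q, k) assumes the caller has already transposed k (as
# split_heads with k=True does in this style of codebase), so the product is
# q·kᵀ. The core is scaled dot-product attention: softmax(q·kᵀ/sqrt(d))·v.
# A minimal NumPy sketch without the causal mask (shapes illustrative):
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

d = 8
q = np.random.randn(4, d)          # 4 query positions
k = np.random.randn(6, d)          # 6 key positions
v = np.random.randn(6, d)

w = softmax(q @ k.T / np.sqrt(d))  # [4, 6] attention weights, rows sum to 1
a = w @ v                          # [4, d] attended values
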
def split_states(x, n):
    """ from [batch, n_ctx, n_embd] to [batch, n_ctx, n_head, n_embd//n_head] """
    x_shape = shape_list(x)
    m = x_shape[-1]
    new_x_shape = x_shape[:-1] + [n, m // n]
    return tf.reshape(x, new_x_shape)

def clf(x, ny,
        w_init=tf.random_normal_initializer(stddev=0.02),
        b_init=tf.constant_initializer(0), train=False):
    with tf.variable_scope('clf'):
        nx = shape_list(x)[-1]
        w = tf.get_variable("w", [nx, ny], initializer=w_init)
        b = tf.get_variable("b", [ny], initializer=b_init)
        return tf.matmul(x, w) + b