def embed(X, we):
    """Look up embeddings for X in `we` and sum over the pairing axis.

    Reference for tf.gather(params, indices) — selects rows of `params`:
        >>> a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
        >>> tf.gather(a, [1, 2])
        <tf.Tensor: shape=(2, 3), dtype=float32,
         numpy=array([[4., 5., 6.], [7., 8., 9.]], dtype=float32)>
    See https://www.tensorflow.org/api_docs/python/tf/gather and
    https://bit.ly/3iG15cu

    Shape notes (from the original author's comments — confirm against caller):
        we: [n_vocab + n_special + n_ctx, n_embd]
        X:  [batch_size * 2, n_ctx, 2]
    """
    we = convert_gradient_to_tensor(we)
    # Gather: e has shape [batch_size * 2, n_ctx, 2, n_embd] — one embedding
    # row per (token id, position id) pair in X.
    embedded = tf.gather(we, X)
    # Summing over axis 2 adds the token and position embeddings together,
    # yielding h of shape [batch_size * 2, n_ctx, n_embd].
    summed = tf.reduce_sum(embedded, 2)
    return summed
def embed(X, we):
    """Return the raw embedding lookup of X in the embedding matrix `we`.

    Args:
        X:  integer id tensor, [batch, ctx_len].
        we: embedding matrix, [vocab_len, embedding_size].

    Returns:
        Gathered embeddings, [batch, ctx_len, embedding_size].
    """
    weights = convert_gradient_to_tensor(we)
    return tf.gather(weights, X)
def embed(X, we):
    """Gather embeddings for X from `we` and sum along axis 2.

    Args:
        X:  integer index tensor; presumably [batch, n_ctx, 2]
            (token id + position id pairs) — confirm against caller.
        we: embedding matrix, one row per id.

    Returns:
        Tensor with X's axis 2 reduced away; with the assumed shapes,
        [batch, n_ctx, n_embd] (token + position embeddings added).
    """
    we = convert_gradient_to_tensor(we)
    e = tf.gather(we, X)
    # Sum over axis 2 to combine the embeddings gathered for each id pair.
    # (Leftover debug print() calls removed — they spammed stdout per call.)
    h = tf.reduce_sum(e, 2)
    return h
def embed(X, we):
    """Embed X with the combined token/position matrix `we` and sum pairs.

    For rocstories (per the original notes):
      - we: [(vocab_size + 3 + 77), 768]   (3 = n_special, 77 = n_ctx)
      - X:  input sequences of shape [?, 77, 2] for the current /gpu:X,
            where ? is the per-device batch size.

    tf.gather pulls one 768-dim row out of `we` for every id in X,
    giving e: [?, 77, 2, 768]. Summing along axis 2 adds the positional
    embedding to the token embedding, yielding h: [?, 77, 768].
    """
    weight_matrix = utils.convert_gradient_to_tensor(we)
    gathered = tf.gather(weight_matrix, X)
    return tf.reduce_sum(gathered, 2)
def embed(X, we):
    """Sum token and position embeddings looked up from `we`.

    Shape notes (from the original comments):
        we: [n_vocab + n_special + n_ctx, n_embd]
        X:  [-1, n_ctx, 2]
    """
    we = convert_gradient_to_tensor(we)
    # Embedding vectors for the input ids: [-1, n_ctx, 2, n_embd].
    vectors = tf.gather(we, X)
    # Reduce over axis 2 (the id-pair axis): h is [-1, n_ctx, n_embd].
    # (The original comment claimed [-1, 1, n_embd]; reduce_sum only
    # removes axis 2, so n_ctx is preserved.)
    return tf.reduce_sum(vectors, 2)