forked from sjvasquez/quora-duplicate-questions

embed.py
import tensorflow as tf

from encode import lstm_encoder
from propogate import temporal_convolution_layer
from utils import shape


def embedding_from_sparse_encodings(encodings, shape, embedding_matrix=None, scope='gather-embed',
                                    reuse=False):
    """
    Gathers embedding vectors corresponding to the values in encodings. If embedding_matrix is
    passed, it is used to initialize the embedding matrix. Otherwise, the matrix is initialized
    with random embeddings.

    Args:
        encodings: Tensor of shape [batch_size, sequence length].
        shape: Shape of the 2D parameter matrix. The first dimension should be the vocabulary
            size and the second dimension should be the embedding dimension.
        embedding_matrix: numpy array used to initialize the embedding matrix.

    Returns:
        Sequence of embedding vectors. Tensor of shape [batch_size, sequence length, shape[1]].

    """
    with tf.variable_scope(scope, reuse=reuse):
        # use the pretrained matrix to initialize the weights when one is provided; the explicit
        # None check avoids evaluating a numpy array in a boolean context
        if embedding_matrix is not None:
            initializer = tf.constant_initializer(embedding_matrix)
        else:
            initializer = tf.contrib.layers.variance_scaling_initializer()
        W = tf.get_variable(name='weights', initializer=initializer, shape=shape)
        embeddings = tf.nn.embedding_lookup(W, encodings)
        return embeddings
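
# Example usage (a sketch, not part of the original file; `vocab_size`, `embed_size`, and
# `pretrained` are illustrative placeholders):
#
#     encodings = tf.placeholder(tf.int32, [None, None])  # [batch_size, sequence length] of token ids
#     pretrained = np.load('glove.npy')                    # hypothetical [vocab_size, embed_size] numpy array
#     embedded = embedding_from_sparse_encodings(
#         encodings, shape=[vocab_size, embed_size], embedding_matrix=pretrained)
#     # embedded: [batch_size, sequence length, embed_size]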


def dense_word_embedding_from_chars(chars, embed_dim, bias=True, scope='dense-word-embed', reuse=False):
    """
    Word embeddings via dense transformation + maxpooling of character sequences.

    Args:
        chars: Tensor of shape [batch_size, word sequence length, char sequence length, alphabet size].
        embed_dim: Dimension of word embeddings. Integer.
        bias: Whether to add a bias term to the dense transformation. Boolean.

    Returns:
        Sequence of embedding vectors. Tensor of shape [batch_size, word sequence length, embed_dim].

    """
    with tf.variable_scope(scope, reuse=reuse):
        chars = tf.cast(chars, tf.float32)
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(chars, -1), embed_dim]
        )
        # project each one-hot character vector into embed_dim dimensions
        z = tf.einsum('ijkl,lm->ijkm', chars, W)
        if bias:
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[embed_dim]
            )
            z = z + b
        # maxpool over the character dimension to get one vector per word
        dense_word_embedding = tf.reduce_max(z, 2)
        return dense_word_embedding
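
# Example usage (a sketch, not part of the original file; `max_seq_len`, `max_word_len`, and
# `alphabet_size` are illustrative placeholders for one-hot character input):
#
#     chars = tf.placeholder(tf.float32, [None, max_seq_len, max_word_len, alphabet_size])
#     word_embeddings = dense_word_embedding_from_chars(chars, embed_dim=100)
#     # word_embeddings: [batch_size, max_seq_len, 100]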


def lstm_word_embedding_from_chars(chars, lengths, embed_dim, scope='lstm-word-embed', reuse=False):
    """
    Word embeddings via LSTM encoding of character sequences.

    Args:
        chars: Tensor of shape [batch_size, word sequence length, char sequence length, alphabet size].
        lengths: Tensor of shape [batch_size, word sequence length].
        embed_dim: Dimension of word embeddings. Integer.

    Returns:
        Sequence of embedding vectors. Tensor of shape [batch_size, word sequence length, embed_dim].

    """
    chars = tf.cast(chars, tf.float32)

    # this is super inefficient: the batch is unstacked and each example's words are
    # encoded with a separate call to the LSTM encoder
    chars = tf.unstack(chars, axis=0)
    lengths = tf.unstack(lengths, axis=0)

    lstm_word_embeddings = []
    for i, (char, length) in enumerate(zip(chars, lengths)):
        # reuse the encoder variables for every example after the first one
        temp_reuse = i != 0 or reuse
        embedding = lstm_encoder(char, length, embed_dim, 1.0, scope=scope, reuse=temp_reuse)
        lstm_word_embeddings.append(embedding)
    lstm_word_embeddings = tf.stack(lstm_word_embeddings, axis=0)

    return lstm_word_embeddings
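
# Example usage (a sketch, not part of the original file). Because the batch is unstacked
# along axis 0, the batch size must be statically known; `batch_size`, `max_seq_len`,
# `max_word_len`, and `alphabet_size` are illustrative placeholders.
#
#     chars = tf.placeholder(tf.float32, [batch_size, max_seq_len, max_word_len, alphabet_size])
#     lengths = tf.placeholder(tf.int32, [batch_size, max_seq_len])  # characters per word
#     word_embeddings = lstm_word_embedding_from_chars(chars, lengths, embed_dim=100)
#     # word_embeddings: [batch_size, max_seq_len, 100]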


def convolutional_word_embedding_from_chars(chars, embed_dim, convolution_width, bias=True,
                                            scope='conv-word-embed', reuse=False):
    """
    Word embeddings via convolution + maxpooling of character sequences.

    Args:
        chars: Tensor of shape [batch_size, word sequence length, char sequence length, alphabet size].
        embed_dim: Dimension of word embeddings. Integer.
        convolution_width: Number of characters used in the convolution. Integer.

    Returns:
        Sequence of embedding vectors. Tensor of shape [batch_size, word sequence length, embed_dim].

    """
    chars = tf.cast(chars, tf.float32)

    # this is super inefficient: the batch is unstacked and each example's words are
    # convolved with a separate call to the convolution layer
    chars = tf.unstack(chars, axis=0)

    conv_word_embeddings = []
    for i, char in enumerate(chars):
        # reuse the convolution variables for every example after the first one
        temp_reuse = i != 0 or reuse
        conv = temporal_convolution_layer(
            char, embed_dim, convolution_width, scope=scope, reuse=temp_reuse)
        # maxpool over the character dimension to get one vector per word
        embedding = tf.reduce_max(conv, axis=1)
        conv_word_embeddings.append(embedding)
    conv_word_embeddings = tf.stack(conv_word_embeddings, axis=0)

    return conv_word_embeddings
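

if __name__ == '__main__':
    # Minimal smoke test (a sketch, not part of the original file): builds the dense and
    # convolutional character embeddings on random one-hot character data and prints the
    # resulting shapes. The sizes below are arbitrary placeholders, and the repo's sibling
    # modules (encode.py, propogate.py, utils.py) must be importable.
    import numpy as np

    batch_size, max_seq_len, max_word_len, alphabet_size = 2, 5, 8, 26
    char_ids = np.random.randint(0, alphabet_size, size=[batch_size, max_seq_len, max_word_len])
    one_hot_chars = np.eye(alphabet_size)[char_ids].astype(np.float32)

    # convolutional_word_embedding_from_chars unstacks along the batch axis, so the batch
    # size must be statically known in the placeholder shape
    chars = tf.placeholder(tf.float32, [batch_size, max_seq_len, max_word_len, alphabet_size])
    dense_embed = dense_word_embedding_from_chars(chars, embed_dim=50)
    conv_embed = convolutional_word_embedding_from_chars(chars, embed_dim=50, convolution_width=3)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dense_out, conv_out = sess.run([dense_embed, conv_embed], feed_dict={chars: one_hot_chars})
        print(dense_out.shape)  # expected: (2, 5, 50)
        print(conv_out.shape)   # expected: (2, 5, 50)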