class LinearModel(SHALOModelVectorMean, SHALOModelFixed):
    """Linear model over pretrained embeddings"""

    name = 'LinearModel'

    def _preprocess_data(self, sentence_data, init=True):
        # Initialize word table and populate with embedding words
        if init:
            self.word_dict = SymbolTable()
            for word in self.embedding_words:
                self.word_dict.get(word)
        # Process data
        return [
            map_words_to_symbols(s, self.word_dict.lookup, self.ngrams)
            for s in sentence_data
        ]
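

# SymbolTable and map_words_to_symbols come from the repo's utilities and are
# not shown in this file. As a rough illustration only -- the real SymbolTable
# may differ in detail -- the behavior _preprocess_data relies on looks like
# this: get() assigns a fresh index the first time a word is seen, while
# lookup() is read-only and sends unseen words to a reserved UNKNOWN index.

class _SketchSymbolTable(object):
    """Illustrative stand-in for SymbolTable (an assumption, not the repo's)."""

    def __init__(self, starting_symbol=2, unknown_symbol=1):
        self.s = starting_symbol   # next free index; 0 is reserved for padding
        self.unknown = unknown_symbol
        self.d = {}

    def get(self, w):
        # Assign the next free index on first encounter
        if w not in self.d:
            self.d[w] = self.s
            self.s += 1
        return self.d[w]

    def lookup(self, w):
        # Read-only: unseen words map to UNKNOWN
        return self.d.get(w, self.unknown)


def _demo_symbol_mapping():
    """Toy example: an out-of-vocabulary word maps to the UNKNOWN index."""
    table = _SketchSymbolTable()
    for w in ['the', 'cat', 'sat']:
        table.get(w)
    return [table.lookup(w) for w in ['the', 'sat', 'dog']]  # -> [2, 4, 1]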


class SHALOModelPreTrain(SHALOModel):
    """SHALO model initialized with pretrained word embeddings"""

    name = 'SHALOModelPreTrain'

    def __init__(self, embedding_file, save_file=None, n_threads=None):
        SHALOModel.__init__(self, save_file, n_threads)
        # Load pretrained embedding vocabulary and matrix
        with open(embedding_file, 'rb') as f:
            self.embedding_words, self.embeddings = cPickle.load(f)

    def _word_table_init(self, training_sentences):
        """Get training words and init word table with pre-embedded words"""
        self._get_training_words(training_sentences)
        self.word_dict = SymbolTable()
        for word in self.embedding_words_train:
            self.word_dict.get(word)

    def _get_training_words(self, training_sentences):
        """Get training words and subset of pre-embedded words in train set"""
        unique_words = set(w for s in training_sentences for w in s)
        embedding_idxs_train, self.embedding_words_train = [], []
        for i, word in enumerate(self.embedding_words):
            if word in unique_words:
                self.embedding_words_train.append(word)
                embedding_idxs_train.append(i)
        idxs = np.ravel(embedding_idxs_train)
        self.embeddings_train = self.embeddings[idxs, :]

    def _get_embedding(self):
        """
        Return embedding tensor (either constant or variable)
        Row 0 is 0 vector for no token
        Row 1 is random initialization for UNKNOWN
        Rows 2 : 2 + len(self.embedding_words_train) are pretrained initialization
        Remaining rows are random initialization
        """
        zero = tf.constant(0.0, dtype=tf.float32, shape=(1, self.d))
        s = self.seed - 1
        unk = tf.Variable(tf.random_normal((1, self.d), stddev=SD, seed=s))
        pretrain = tf.Variable(self.embeddings_train, dtype=tf.float32)
        vecs = [zero, unk, pretrain]
        # Add randomly initialized rows for vocab words with no pretrained vector
        n_r = self.word_dict.num_words() - len(self.embedding_words_train)
        if n_r > 0:
            r = tf.Variable(tf.random_normal((n_r, self.d), stddev=SD, seed=s))
            vecs.append(r)
        self.U = tf.concat(vecs, axis=0, name='embedding_matrix')
        return self.U
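

# A small NumPy sketch of the row layout _get_embedding produces. The sizes
# below are made up for illustration; only the ordering of the blocks matters.

def _sketch_embedding_layout():
    import numpy as np
    d = 4                                    # embedding dimension (hypothetical)
    rng = np.random.RandomState(0)
    pretrained = rng.randn(3, d)             # stands in for self.embeddings_train
    n_random = 2                             # vocab words with no pretrained vector
    U = np.vstack([
        np.zeros((1, d)),                            # row 0: padding / no token
        rng.normal(scale=0.1, size=(1, d)),          # row 1: UNKNOWN
        pretrained,                                  # rows 2 .. 2 + 3: pretrained init
        rng.normal(scale=0.1, size=(n_random, d)),   # remaining rows: random init
    ])
    assert U.shape == (1 + 1 + 3 + n_random, d)
    return U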


class TTBB(SHALOModelFixed):
    """Implementation of 'A Simple but Tough-to-Beat Baseline for Sentence
    Embeddings'

    In the basic model, the common component vector is computed once from the
    training data before any other computation. The embeddings are fixed, so
    no updates are made.
    """

    name = 'TTBB'

    def __init__(self, embedding_file, word_freq_file, save_file=None,
                 n_threads=None):
        SHALOModelFixed.__init__(self, embedding_file, save_file, n_threads)
        # Get word frequency (marginals) file
        with open(word_freq_file, 'rb') as f:
            self.word_freq = cPickle.load(f)

    def _word_table_init(self, training_sentences):
        self.word_dict = SymbolTable()
        for word in self.embedding_words:
            self.word_dict.get(word)

    def _get_mapper(self, init):
        return self.word_dict.lookup

    def _preprocess_data(self, sentence_data, init=True):
        # Initialize word table and populate with embedding words
        if init:
            self._word_table_init(sentence_data)
        # Map tokens and return if not initializing
        mapper = self._get_mapper(init)
        tokens = [
            np.ravel(map_words_to_symbols(s, mapper, self.ngrams))
            for s in sentence_data
        ]
        self.train_tokens = tokens
        if not init:
            return tokens
        # If initializing, get marginal estimates for each symbol
        self.marginals = np.zeros(self.word_dict.num_symbols())
        for word, idx in self.word_dict.d.iteritems():
            # Try getting word frequency directly
            if word in self.word_freq:
                self.marginals[idx] = self.word_freq[word]
            # Otherwise, try getting minimum frequency among sub-grams
            split_grams = word.split(GRAMSEP)
            if len(split_grams) > 1:
                min_freq = min(self.word_freq.get(w, 0.0) for w in split_grams)
                self.marginals[idx] = min_freq
        # Get initial smoother value (an exponent; a = 10 ** self.a)
        self.a = self.train_kwargs.get('a', -3.0)
        return tokens

    def _compute_train_common_component(self, init=False):
        if init:
            self.session.run(tf.global_variables_initializer())
        x_array, x_len = self._get_data_batch(self.train_tokens)
        self.ccx = self.session.run(self.tf_ccx, {
            self.input:         x_array,
            self.input_lengths: x_len
        })
        return self.ccx

    def _get_a_exp(self):
        return tf.constant(self.a, dtype=tf.float32)

    def _get_common_component(self):
        self.ccx = self._compute_train_common_component(init=True)
        return tf.constant(self.ccx, dtype=tf.float32)

    def _embed_sentences(self):
        """Tensorflow implementation of Simple but Tough-to-Beat Baseline"""
        # Get word features
        word_embeddings = self._get_embedding()
        word_feats = tf.nn.embedding_lookup(word_embeddings, self.input)
        # Get marginal estimates and scaling term
        batch_size = tf.shape(word_feats)[0]
        a = tf.pow(10.0, self._get_a_exp())
        p = tf.constant(self.marginals, dtype=tf.float32, name='marginals')
        q = tf.reshape(
            a / (a + tf.nn.embedding_lookup(p, self.input)),
            (batch_size, self.mx_len, 1)
        )
        # Compute initial sentence embedding: length-normalized weighted sum
        z = tf.reshape(1.0 / tf.to_float(self.input_lengths), (batch_size, 1))
        S = z * tf.reduce_sum(q * word_feats, axis=1)
        # Compute common component (first right-singular vector of centered S)
        S_centered = S - tf.reduce_mean(S, axis=0)
        _, _, V = tf.svd(S_centered, full_matrices=False, compute_uv=True)
        self.tf_ccx = tf.stop_gradient(tf.gather(tf.transpose(V), 0))
        # Common component removal: S - S c c^T
        ccx = tf.reshape(self._get_common_component(), (1, self.d))
        sv = {'embeddings': word_embeddings, 'a': a, 'p': p, 'ccx': ccx}
        return S - tf.matmul(S, ccx * tf.transpose(ccx)), sv
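

# End-to-end NumPy sketch of the TTBB computation in _embed_sentences, for
# reference only. It assumes every word has a vector in `vecs` and a marginal
# in `freqs`; the default exponent of -3.0 above corresponds to a = 10 ** -3.
# Like the TF code above, the sentence matrix is mean-centered before taking
# the first right-singular vector.

def _sif_embed_sketch(sentences, vecs, freqs, a=1e-3):
    """sentences: list of token lists; vecs: word -> (d,) array;
    freqs: word -> unigram marginal. Returns an (n_sentences, d) array."""
    import numpy as np
    # Weight each word vector by a / (a + p(w)) and average per sentence
    S = np.vstack([
        np.mean([(a / (a + freqs.get(w, 0.0))) * vecs[w] for w in s], axis=0)
        for s in sentences
    ])
    # First principal direction of the centered sentence embeddings
    _, _, Vt = np.linalg.svd(S - S.mean(axis=0), full_matrices=False)
    c = Vt[0]
    # Remove the projection onto the common component: S - S c c^T
    return S - np.outer(S.dot(c), c)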